[英]How do you preprocess labels in a pipeline with sklearn?
我有一個預處理腳本,它從鑽石數據集中獲取數據並預處理數據。 我顯然也需要它來預處理標簽。
這是我的代碼:
# Data Preprocessing
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic
def diamond_preprocess(data_dir):
    """Load the diamonds CSV, split it, and preprocess features and labels.

    Features and labels are transformed by two *separate* pipelines. The
    feature ``ColumnTransformer`` selects columns by name and needs 2-D input,
    so fitting it on the 1-D ``price`` Series fails with
    ``IndexError: tuple index out of range`` (and a second ``fit`` call would
    throw away the fit on the features anyway). The labels are therefore kept
    as a single-column DataFrame and given their own imputer + scaler.

    Args:
        data_dir: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must contain the columns ``id``,
            ``depth_percent`` and ``price``.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` of DataFrames holding
        the transformed features and the transformed (imputed and scaled)
        labels. Column names and the original row index are not preserved.
    """
    data = pd.read_csv(data_dir)
    cleaned_data = data.drop(['id', 'depth_percent'], axis=1)  # Features I don't want

    x = cleaned_data.drop(['price'], axis=1)  # Feature columns
    y = cleaned_data[['price']]  # Labels as a 2-D DataFrame, not a 1-D Series

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=99
    )

    numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler()),  # Scale data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill in missing data with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One hot encode categorical data
    ])
    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        # Always return a dense array so it can be wrapped in a DataFrame below,
        # even when the one-hot block makes the result mostly zeros.
        sparse_threshold=0,
    )

    # The labels are numeric, so they get the same treatment as the numeric
    # features, but in a pipeline of their own.
    target_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Fit on the training split only, so no test-set statistics leak in.
    preprocessor_pipeline.fit(x_train)
    target_pipeline.fit(y_train)

    # Apply the fitted pipelines to the training and test data
    x_train_pipe = preprocessor_pipeline.transform(x_train)
    x_test_pipe = preprocessor_pipeline.transform(x_test)
    y_train_pipe = target_pipeline.transform(y_train)
    y_test_pipe = target_pipeline.transform(y_test)

    x_train = pd.DataFrame(data=x_train_pipe)
    x_test = pd.DataFrame(data=x_test_pipe)
    y_train = pd.DataFrame(data=y_train_pipe)
    y_test = pd.DataFrame(data=y_test_pipe)
    return x_train, x_test, y_train, y_test
我不太確定我的代碼是否正確,也不確定我是否真正理解 sklearn 中管道和預處理的工作方式。 顯然解釋器也這麼認為,因為我收到了以下錯誤:
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
self.fit_transform(X, y=y)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
self._check_n_features(X, reset=True)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
n_features = X.shape[1]
IndexError: tuple index out of range
如何像處理訓練數據一樣正確預處理標簽? 一個解釋也會很棒!
如果您想單獨應用轉換,您可以為目標列創建一個額外的管道,請參見下面的示例。
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Build a small toy data set with missing values in every column.
data = pd.DataFrame({
    'y': [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f'],
})

# Separate the predictors from the target.
x = data.drop(columns=['y'])
y = data.loc[:, ['y']]  # keep the target 2-D (DataFrame), not a 1-D Series

# Hold out half of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=99
)

# Group the feature columns by dtype: numerical vs. categorical.
numerical_features = list(
    x_train.select_dtypes(include=['int64', 'float64']).columns
)
categorical_features = list(
    x_train.select_dtypes(include=['object']).columns
)

# Feature preprocessing: one sub-pipeline per column group.
numerical_features_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_features_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
features_pipeline = ColumnTransformer([
    ('num_features', numerical_features_transformer, numerical_features),
    ('cat_features', categorical_features_transformer, categorical_features),
])

# Target preprocessing lives in its own, independent pipeline.
target_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Learn the transformations from the training split and apply them to it,
# then reuse the fitted pipelines on the test split.
x_train_pipe = features_pipeline.fit_transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)
y_train_pipe = target_pipeline.fit_transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)

# Wrap the transformed arrays back into DataFrames.
x_train, x_test = pd.DataFrame(data=x_train_pipe), pd.DataFrame(data=x_test_pipe)
y_train, y_test = pd.DataFrame(data=y_train_pipe), pd.DataFrame(data=y_test_pipe)
您也可以使用 TransformedTargetRegressor:它會在回歸器擬合之前對標簽應用指定的函數(func),並在預測之後應用對應的反函數(inverse_func):
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Wrap the regressor so the target is transformed with log1p before fitting
# and predictions are mapped back with expm1 (the inverse of log1p).
pipeline = make_pipeline(
    TransformedTargetRegressor(
        regressor=LinearRegression(fit_intercept=True, n_jobs=-1),
        func=np.log1p,
        inverse_func=np.expm1,
    )
)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.