[英]transforming data first vs doing everything in pipe results in different results when using a model
I wanted to make all of the custom transformations I make to my data in a pipe. I thought that I could use it as pipe.fit_transform(X)
to transform my X before using it in a model, but I also thought that I'll be able to append to the pipeline model itself and simply use it as one using pipe.steps.append(('model', self.model))
.我想对 pipe 中的数据进行所有自定义转换。我认为我可以将它用作
pipe.fit_transform(X)
来转换我的 X,然后再将其用于 model,但我也认为我'您将能够将 append 连接到管道 model 本身,并使用pipe.steps.append(('model', self.model))
将其用作一个管道。
Unfortunately, after everything was built I've noticed that I'm getting different results when transforming the data and using it directly in a model vs doing everything in one pipeline.不幸的是,在构建完所有内容后,我注意到在转换数据并直接在 model 中使用它与在一个管道中完成所有操作时,我得到了不同的结果。 Have anyone experienced anything like this?
有没有人经历过这样的事情?
Adding code:添加代码:
# Base pipeline to be used
BASE_PIPE = Pipeline([
('dim_increase_num', data_num_mix()),
('dim_increase_cat', data_cat_mix()),
('start', data_get_dummies()),
('dm_correlation', data_x_corr_()),
('scaler', DFStandardScaler()),
('column_ectraction', ColumnExtractor(columns_catboost)),
])
class base_model_class:
def fit_predict(self, X_train:pd.DataFrame=X_train, y_train:pd.Series=y_train, X_test:pd.DataFrame=X_test):
return self.fit(X_train, y_train).predict(X_test)
def evaluate(self, X:pd.DataFrame=X, y:pd.Series=y):
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
y_pred = self.fit(X_train, y_train).predict(X_test)
result= r2_score(y_test, y_pred)
return result
class model_linear_regression(base_model_class):
def __init__(self, pipe=None, inverse=False):
self.name = 'Linear_Regression'
self.model = LinearRegression()
if pipe==None:
self.pipe = Pipeline([('model', self.model)])
else:
self.pipe = deepcopy(pipe)
self.pipe.steps.append(('model', self.model))
if inverse:
self.pipe = TransformedTargetRegressor( regressor=self.pipe,
func=np.log1p,
inverse_func=np.expm1)
def fit(self, X:pd.DataFrame=X_train, y:pd.Series=y_train):
self.pipe.fit(X, y)
return self
def predict(self, X:pd.DataFrame=X_test):
y_pred = self.pipe.predict(X)
return y_pred
And then, when using everything gives different R2 scores:然后,当使用所有东西时都会给出不同的 R2 分数:
Xx=BASE_PIPE.fit_transform(X)
model_linear_regression(inverse=False).evaluate(Xx,y)
>>> 0.7415005607713974
model_linear_regression(BASE_PIPE, inverse=False).evaluate(X,y)
>>> -6.306970505602111e+22
EDIT: providing all steps in pipeline used:编辑:提供使用的管道中的所有步骤:
class data_num_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None):
X_ = X.copy()
self.frames = [X_]
for col in self.columns:
A = pd.DataFrame(X_[col].map(lambda x: np.sqrt(x) if x>0 else -np.sqrt(-x)))
A = A.rename(columns={col:col+'^s'})
self.frames += [A]
B = pd.DataFrame(X_[col] * X_[col])
B = B.rename(columns={col:col+'^2'})
self.frames += [B]
return pd.concat(self.frames, axis=1)
class data_cat_mix(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=CATEGORICAL_FEATURES):
self.columns = columns
def fit(self, X, y = None):
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
for col in self.columns:
df_col_count = X_[col].value_counts().to_frame().reset_index()
df_col_count.columns = ["var_name", "var_count"]
df_col_count["var_freq"] = df_col_count["var_count"] / df_col_count["var_count"].sum()
X_['C_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_count'])
X_['F_'+col] = X_[col].replace(df_col_count.set_index('var_name')['var_freq'])
return X_
class data_get_dummies(BaseEstimator, TransformerMixin):
def __init__(self, columns:list = CATEGORICAL_FEATURES):
self.columns = columns
self.encoder = make_column_transformer((OneHotEncoder(handle_unknown="ignore", sparse=False), self.columns),remainder='passthrough')
def fit(self, X, y = None):
self.encoder.fit(X)
return self
def transform(self, X, y = None) -> pd.DataFrame:
X_ = X.copy()
encoder_columns = self.encoder.get_feature_names_out()
fixed_columns = [x.replace('onehotencoder__','').replace('remainder__','') for x in encoder_columns ]
df_temp=pd.DataFrame(self.encoder.transform(X_), columns=fixed_columns)
return df_temp
class data_x_corr(BaseEstimator, TransformerMixin):
def __init__(self, columns:list=NUMERIC_FEATURES_, corr_val:float=0.95):
self.columns = columns
self.corr_val = corr_val
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
# prepare numeric df
X_ = X.copy()
x = X_[self.columns]
corr_matrix = x.corr(method='spearman')
iters = range(len(corr_matrix.columns) - 1)
drop_cols = []
# Iterates through Correlation Matrix Table to find correlated columns
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
col = item.columns
row = item.index
val = item.values
if val >= self.corr_val:
drop_cols.append(i)
drops = sorted(set(drop_cols))[::-1]
# Drops the correlated columns
for i in drops:
col = x.iloc[:, (i+1):(i+2)].columns.values
X_ = X_.drop(col, axis=1)
return X_
class DFStandardScaler(TransformerMixin):
# StandardScaler but for pandas DataFrames
def __init__(self):
self.ss = None
self.mean_ = None
self.scale_ = None
def fit(self, X, y=None):
self.ss = StandardScaler()
self.ss.fit(X)
self.mean_ = pd.Series(self.ss.mean_, index=X.columns)
self.scale_ = pd.Series(self.ss.scale_, index=X.columns)
return self
def transform(self, X) -> pd.DataFrame:
# assumes X is a DataFrame
Xss = self.ss.transform(X)
Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return Xscaled
def __str__(self):
return "DF_StandardScaler"
def __repr__(self):
return "DF_StandardScaler"
class ColumnExtractor(TransformerMixin, BaseEstimator):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
# assumes X is a DataFrame
Xcols = X[self.cols]
return Xcols
The one transformer that stands out to me is data_cat_mix
, specifically the count-of-level columns.对我来说最突出的一个转换器是
data_cat_mix
,特别是 count-of-level 列。 When applied to train+test, these are consistent (but leaks test information);当应用于训练+测试时,这些是一致的(但会泄漏测试信息); when applied separately, the values in train will generally be much higher (just from its size being three times larger), so the model doesn't really understand how to treat them in the test set.
当单独应用时,train 中的值通常会高得多(只是因为它的大小是原来的三倍),所以 model 并不真正理解如何在测试集中处理它们。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.