I recently started working with GridSearchCV and am not very comfortable with object-oriented programming.
Problem: I have some missing data in one of the predictors and I have a list of algorithms I want to use with appropriate parameter grid. I am wondering if there is a way to create a custom class that would impute missing values in my data different ways before fitting it and try different combinations of attributes for whatever algorithm I am using at the same time. Is it even possible?
Thanks a TON for your help!
This is a class I created for imputing missing values in SQFT column of my housing prices dataset:
class Impute_sqft():
    """Impute missing values in the 'sqft' column of the housing dataset.

    NOTE(review): this class reads several module-level globals
    (houses_dummies, houses_dummies_copy, houses_types, predictors) and
    assumes pandas (pd) and RandomForestRegressor are already in scope.

    Parameters
    ----------
    how : str
        Imputation strategy: 'random forest', 'knn' or 'mean'.
    """

    # Feature/target split built once at class-definition time: rows whose
    # 'sqft' is known form the training set; rows with a missing 'sqft'
    # are the ones to be imputed.
    train_X = houses_dummies_copy.dropna(subset=['sqft']).drop(columns=['sqft', 'final_price'])
    train_Y = houses_dummies_copy.dropna(subset=['sqft'])['sqft']
    test_X = houses_dummies_copy[pd.isna(houses_dummies_copy.sqft)].drop(columns=['sqft', 'final_price'])

    def __init__(self, how='random forest'):
        self.how = how

    def impute(self):
        """Return houses_dummies[predictors] with 'sqft' imputed per self.how.

        FIX: class attributes are not visible as bare names inside a method
        body in Python, so train_X/train_Y/test_X must be accessed through
        self (the original code raised NameError here).
        """
        if self.how == 'random forest':
            # Replace missing values with the ones predicted by random forest.
            houses_dummies_copy = houses_dummies.copy()
            rf = RandomForestRegressor()
            rf.fit(self.train_X, self.train_Y)
            pred_Y = rf.predict(self.test_X)
            houses_dummies_copy.loc[self.test_X.index, 'sqft'] = pred_Y
            return houses_dummies_copy[predictors]
        if self.how == 'knn':
            # Replace missing values with the ones predicted by knn.
            houses_dummies_copy = houses_dummies.copy()
            import sys
            from impyute.imputation.cs import fast_knn
            sys.setrecursionlimit(100000)  # fast_knn recurses deeply on large frames
            knn_n = 30
            result = fast_knn(houses_dummies_copy[predictors], k=knn_n)
            # fast_knn returns an unlabeled structure; restore the column names.
            result.columns = houses_dummies_copy[predictors].columns
            return result
        if self.how == 'mean':
            # Replace missing values with the mean for every type of property
            # (one-hot 'type_*' columns identify each property type).
            houses_dummies_copy = houses_dummies.copy()
            for house_type in houses_types:
                statistic = houses_dummies_copy[houses_dummies_copy['type_' + house_type] == 1].sqft.mean(skipna=True)
                indexes = houses_dummies_copy[(houses_dummies_copy['type_' + house_type] == 1) & pd.isna(houses_dummies_copy.sqft)].index
                houses_dummies_copy.loc[indexes, 'sqft'] = statistic
            return houses_dummies_copy[predictors]
What grid I have for GridSearchCV:
# Hyper-parameter grid for the random-forest estimator on its own.
param_grid = [dict(
    bootstrap=[True, False],
    n_estimators=[3, 10],
    max_features=[2, 3, 4],
)]
What grid I want for GridSearchCV:
# Desired grid: search over the imputation strategy as well as the
# estimator's own hyper-parameters.
param_grid = [dict(
    bootstrap=[True, False],
    n_estimators=[3, 10],
    max_features=[2, 3, 4],
    sqft_imputer=['random forest', 'knn', 'mean'],
)]
What you want is to create a custom Transformer and to use it before your estimator in a Pipeline.
Take a look at the template in the scikit-learn developer docs: TemplateTransformer.
A few comments:
- Your custom imputer should inherit from BaseEstimator (and TransformerMixin).
- self.how has to be defined in the __init__ method of the class.
- Any statistics needed for imputation should be learned in fit, and applied during transform (see also: Why does sklearn Imputer need to fit?).
To chain your custom imputer and an estimator, you can use a scikit-learn Pipeline:
The following example combines a PCA and a LogisticRegression in a Pipeline, and then uses it in a GridSearchCV
# Pipeline grid: each key is prefixed with the step name followed by a
# double underscore ('step__parameter'), which is how GridSearchCV routes
# parameters to the steps of a Pipeline.
param_grid = [dict(
    estimator__bootstrap=[True, False],
    estimator__n_estimators=[3, 10],
    estimator__max_features=[2, 3, 4],
    imputer__how=['random forest', 'knn', 'mean'],
)]
Following the answer above. Everything works now.
from sklearn.base import BaseEstimator, TransformerMixin
class Impute_sqft(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that imputes the 'sqft' column.

    Inheriting from BaseEstimator gives get_params/set_params so
    GridSearchCV can tune `how` via 'imputer__how'.

    NOTE(review): transform reads the module-level globals houses_dummies,
    houses_edited and predictors instead of learning statistics in fit, so
    every CV fold is imputed using the FULL dataset — this leaks
    information across folds; confirm this is intended.
    """

    def __init__(self, how='random forest'):
        # Imputation strategy: 'random forest', 'knn', 'mice', 'mean',
        # 'median' or 'mode'. Must be stored under the same name as the
        # parameter for sklearn's get_params to work.
        self.how = how

    def fit(self, X, y=None):
        # Stateless: nothing is learned here (see class note about leakage).
        return self

    def transform(self, X):
        """Return X[predictors] with missing 'sqft' filled per self.how."""
        import sys
        sys.setrecursionlimit(100000) #Increase the recursion limit of the OS
        from impyute.imputation.cs import fast_knn, mice
        result = X.copy()
        if self.how == 'random forest':
            # Train on all rows of the full dataset whose 'sqft' is known.
            train_X = houses_dummies.dropna(subset=['sqft']).drop(columns=['sqft', 'final_price'])
            train_Y = houses_dummies.dropna(subset=['sqft'])['sqft']
            # NOTE(review): test_X drops only 'sqft' while train_X also drops
            # 'final_price' — feature sets match only if X already excludes
            # 'final_price' (true when called with houses_dummies_copy[predictors]
            # as in the fit at the bottom of this file); verify for other callers.
            test_X = result[pd.isna(result.sqft)].drop(columns=['sqft'])
            rf = RandomForestRegressor()
            rf.fit(train_X, train_Y)
            pred_Y = rf.predict(test_X)
            result.loc[test_X.index,'sqft'] = pred_Y
        if self.how == 'knn':
            knn_n = 30
            # fast_knn returns an unlabeled result, so restore the original
            # columns/index, then subset back down to the rows of this X.
            result = fast_knn(houses_dummies[predictors], k=knn_n)
            result.columns = houses_dummies[predictors].columns
            result.index = houses_dummies[predictors].index
            result = result.loc[X.index,:]
        if self.how == 'mice':
            # Multiple imputation by chained equations; same relabel-and-subset
            # dance as the knn branch.
            result = mice(houses_dummies[predictors])
            result.columns = houses_dummies[predictors].columns
            result.index = houses_dummies[predictors].index
            result = result.loc[X.index,:]
        if self.how == 'mean':
            # Per-property-type mean, computed on the raw (pre-dummies) frame.
            result['sqft'] = houses_edited.groupby('type')['sqft'].transform(lambda x: x.fillna(x.mean(skipna=True)))
        if self.how == 'median':
            result['sqft'] = houses_edited.groupby('type')['sqft'].transform(lambda x: x.fillna(x.median(skipna=True)))
        if self.how == 'mode':
            # mode() can return several values; [0] takes the first.
            result['sqft'] = houses_edited.groupby('type')['sqft'].transform(lambda x: x.fillna(x.mode()[0]))
        return result[predictors]
# Candidate imputation strategies to search over.
methods = ['random forest', 'knn', 'mice', 'mean', 'median', 'mode']

imputer = Impute_sqft()
rf = RandomForestRegressor()

# Imputation runs first; the regressor is then fit on the imputed data.
pipe = Pipeline(steps=[('imputer', imputer), ('rf', rf)])
# Grid over both the imputation strategy ('imputer' step) and the random
# forest's hyper-parameters ('rf' step).
param_grid = [dict(
    imputer__how=methods,
    rf__max_depth=[1, 2, None],
    rf__bootstrap=[True, False],
)]
houses_dummies_copy = houses_dummies.copy()
# FIX: the `iid` argument was deprecated in scikit-learn 0.22 and removed in
# 0.24, so passing iid=False crashes on current versions; its former
# iid=False behaviour (plain average over folds) is now the only behaviour.
search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_absolute_error')
search.fit(houses_dummies_copy[predictors], houses_dummies_copy.final_price)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.