[英]How to update fit method in a sklearn class?
我最近開始使用GridSearchCV,對面向對象的編程還不太熟悉。
問題:我在一個預測變量中缺少一些數據,並且我要在適當的參數網格中使用一系列算法。 我想知道是否存在一種創建自定義類的方法,該類將在擬合數據之前以不同的方式插補數據中的缺失值,並針對我同時使用的任何算法嘗試不同的屬性組合。 可能嗎?
非常感謝大家的幫助!
這是我為了插補房屋價格數據集中sqft列的缺失值而創建的一個類:
class Impute_sqft():
    """Fill in the missing ``sqft`` values of the house-price data set.

    The strategy is chosen with ``how``:

    * ``'random forest'`` -- predict the missing values with a
      ``RandomForestRegressor`` fitted on the rows whose ``sqft`` is known;
    * ``'knn'`` -- impute with ``impyute``'s ``fast_knn`` (k = 30);
    * ``'mean'`` -- use the mean ``sqft`` of the rows flagged by the same
      ``type_<house type>`` dummy column.

    The class relies on module-level names that must exist before it is
    defined or used: ``houses_dummies``, ``houses_dummies_copy``,
    ``predictors``, ``houses_types``, ``pd`` and ``RandomForestRegressor``.
    """

    # Splits used by the random-forest strategy. They are evaluated once, when
    # the class body runs, from the module-level ``houses_dummies_copy``.
    # Rows with a known ``sqft`` form the training set ...
    train_X = houses_dummies_copy.dropna(subset=['sqft']).drop(columns=['sqft', 'final_price'])
    train_Y = houses_dummies_copy.dropna(subset=['sqft'])['sqft']
    # ... and rows with a missing ``sqft`` are the ones to predict.
    test_X = houses_dummies_copy[pd.isna(houses_dummies_copy.sqft)].drop(columns=['sqft', 'final_price'])

    def __init__(self, how='random forest'):
        """Store the imputation strategy name in ``self.how``."""
        self.how = how

    def impute(self):
        """Return the predictor columns with ``sqft`` imputed.

        A fresh copy of ``houses_dummies`` is modified, so the module-level
        frame is left untouched.

        Returns:
            For ``'random forest'`` and ``'mean'``: the copy restricted to
            ``predictors``. For ``'knn'``: the object returned by
            ``fast_knn`` with the predictor column labels assigned to it.

        Raises:
            ValueError: if ``self.how`` is not one of the supported
                strategies (previously ``None`` was returned silently).
        """
        houses_dummies_copy = houses_dummies.copy()

        # replace missing values with the ones predicted by random forest
        if self.how == 'random forest':
            # Class attributes are not visible as bare names inside a method,
            # so they have to be reached through ``self``.
            if len(self.test_X):
                rf = RandomForestRegressor()
                rf.fit(self.train_X, self.train_Y)
                pred_Y = rf.predict(self.test_X)
                houses_dummies_copy.loc[self.test_X.index, 'sqft'] = pred_Y
            return houses_dummies_copy[predictors]

        # replace missing values with the ones predicted by knn
        if self.how == 'knn':
            # Imported lazily so the other strategies work without impyute.
            import sys
            from impyute.imputation.cs import fast_knn
            # Raises the Python interpreter's recursion limit before calling
            # fast_knn (kept from the original code).
            sys.setrecursionlimit(100000)
            knn_n = 30
            result = fast_knn(houses_dummies_copy[predictors], k=knn_n)
            result.columns = houses_dummies_copy[predictors].columns
            return result

        # replace missing values with the mean for every type of property
        if self.how == 'mean':
            for house_type in houses_types:
                of_type = houses_dummies_copy['type_' + house_type] == 1
                statistic = houses_dummies_copy.loc[of_type, 'sqft'].mean(skipna=True)
                missing = of_type & pd.isna(houses_dummies_copy.sqft)
                houses_dummies_copy.loc[missing, 'sqft'] = statistic
            return houses_dummies_copy[predictors]

        raise ValueError('Unknown imputation method: %r' % (self.how,))
我目前用於GridSearchCV的參數網格:
# Grid currently passed to GridSearchCV: one candidate dictionary whose keys
# are estimator parameter names and whose values are the settings to try.
param_grid = [
    dict(
        bootstrap=[True, False],
        n_estimators=[3, 10],
        max_features=[2, 3, 4],
    ),
]
我希望用於GridSearchCV的參數網格:
# Desired grid: the same estimator settings plus a ``sqft_imputer`` entry, so
# that the imputation strategy is searched together with the other options.
param_grid = [
    dict(
        bootstrap=[True, False],
        n_estimators=[3, 10],
        max_features=[2, 3, 4],
        sqft_imputer=['random forest', 'knn', 'mean'],
    ),
]
您需要創建一個自定義的Transformer,並在Pipeline中把它放在估計器(estimator)之前使用。
看一下模板:TemplateTransformer。
一些說明:
您的Transformer必須繼承自BaseEstimator。
self.how的選擇必須在類的__init__方法中定義。
您應該在fit過程中估計統計量,並在transform過程中應用它們。(另請參閱「為什麼sklearn Imputer需要fit?」)
要把自定義的插補器(imputer)和估計器(estimator)串接起來,可以使用scikit-learn的Pipeline;參數網格中的鍵要寫成「步驟名__參數名」的形式:
# Grid for a Pipeline: every key is ``<step name>__<parameter name>``, so the
# ``estimator`` step and the ``imputer`` step are tuned in the same search.
param_grid = [
    dict(
        estimator__bootstrap=[True, False],
        estimator__n_estimators=[3, 10],
        estimator__max_features=[2, 3, 4],
        imputer__how=['random forest', 'knn', 'mean'],
    ),
]
我按照上面的答案修改了代碼,現在一切正常:
from sklearn.base import BaseEstimator, TransformerMixin
class Impute_sqft(BaseEstimator, TransformerMixin):
    """Pipeline-compatible transformer that imputes missing ``sqft`` values.

    ``how`` selects the strategy: ``'random forest'``, ``'knn'``, ``'mice'``,
    ``'mean'``, ``'median'`` or ``'mode'``. Because it is a constructor
    argument it can be tuned from a grid search as ``<step name>__how``.

    The transformer depends on module-level names that must be defined before
    ``transform`` is called: ``houses_dummies``, ``houses_edited``,
    ``predictors``, ``pd`` and ``RandomForestRegressor``.

    NOTE(review): ``fit`` learns nothing; every strategy reads the
    module-level frames inside ``transform``, so the imputation values are
    derived from those frames rather than from the ``X`` handed to ``fit``.
    Confirm this is acceptable for cross-validation.
    """

    def __init__(self, how='random forest'):
        """Store the imputation strategy name in ``self.how``."""
        self.how = how

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the transformer API."""
        return self

    @staticmethod
    def _group_fill_value(group, how):
        """Return the value used to fill the gaps of one ``type`` group."""
        if how == 'mean':
            return group.mean(skipna=True)
        if how == 'median':
            return group.median(skipna=True)
        modes = group.mode()
        # ``mode()`` is empty when every value in the group is missing. Fall
        # back to NaN so ``fillna`` is a no-op, as it is for mean and median.
        return modes.iloc[0] if len(modes) else float('nan')

    def transform(self, X):
        """Return ``X`` restricted to ``predictors`` with ``sqft`` imputed.

        Args:
            X: frame containing a ``sqft`` column; it is not modified.

        Returns:
            The imputed data indexed by ``predictors``. For ``'knn'`` and
            ``'mice'`` the rows are taken from the imputed full data set and
            selected by ``X.index``.

        Raises:
            ValueError: if ``self.how`` is not a supported strategy
                (previously the data was passed through un-imputed).
        """
        how = self.how
        result = X.copy()

        if how == 'random forest':
            known = houses_dummies.dropna(subset=['sqft'])
            train_X = known.drop(columns=['sqft', 'final_price'])
            train_Y = known['sqft']
            test_X = result[pd.isna(result.sqft)].drop(columns=['sqft'])
            # Skip the model when nothing is missing in this subset:
            # predicting on an empty frame raises an error.
            if len(test_X):
                rf = RandomForestRegressor()
                rf.fit(train_X, train_Y)
                result.loc[test_X.index, 'sqft'] = rf.predict(test_X)

        elif how in ('knn', 'mice'):
            # Imported lazily so the other strategies work without impyute.
            import sys
            from impyute.imputation.cs import fast_knn, mice
            # Raises the Python interpreter's recursion limit (not an OS
            # limit) before calling impyute; kept from the original code.
            sys.setrecursionlimit(100000)

            full = houses_dummies[predictors]
            if how == 'knn':
                knn_n = 30
                result = fast_knn(full, k=knn_n)
            else:
                result = mice(full)
            # Restore the labels, then keep only the rows requested in X.
            result.columns = full.columns
            result.index = full.index
            result = result.loc[X.index, :]

        elif how in ('mean', 'median', 'mode'):
            # Fill each property type's gaps with that type's own statistic;
            # the assignment aligns on the index of ``result``.
            result['sqft'] = houses_edited.groupby('type')['sqft'].transform(
                lambda x: x.fillna(self._group_fill_value(x, how))
            )

        else:
            raise ValueError('Unknown imputation method: %r' % (how,))

        return result[predictors]
# Imputation strategies compared by the grid search.
methods = ['random forest', 'knn', 'mice', 'mean', 'median', 'mode']

# Two-step pipeline: impute ``sqft`` first, then fit the random forest.
imputer = Impute_sqft()
rf = RandomForestRegressor()
pipe = Pipeline(steps=[
    ('imputer', imputer),
    ('rf', rf),
])

# Keys are ``<step name>__<parameter name>`` so both steps are tuned together.
param_grid = [{
    'imputer__how': methods,
    'rf__max_depth': [1, 2, None],
    'rf__bootstrap': [True, False],
}]

houses_dummies_copy = houses_dummies.copy()

# ``iid=False`` is no longer passed: the parameter was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# From 0.22 on, scores are always averaged across folds as with iid=False.
search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_absolute_error')
search.fit(houses_dummies_copy[predictors], houses_dummies_copy.final_price)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.