简体   繁体   中英

How to give GridSearchCV a list of indicies for cross-validation?

I'm trying to use custom cross-validation sets for a very specific dataset and scikit-optimize using BayesSearchCV . I've been able to replicate the error with scikit-learn using GridSearchCV .

Straight from the documentation :

cv : int, cross-validation generator or an iterable, optional

Determines the cross-validation splitting strategy. Possible inputs for cv are:

None, to use the default 3-fold cross validation, integer, to specify the number of folds in a (Stratified)KFold, An object to be used as a cross-validation generator. An iterable yielding train, test splits. For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.

Refer User Guide for the various cross-validation strategies that can be used here.

I can't use cv=10 in my specific dataset. This is only to illustrate the error.

I would like to use a list of lists for the cross-validation training-testing splits as it says in the documentation. How can I format my cross-validation lists correctly?

# Generate data
def iris_data(noise=None, palette="hls", desat=1):
    # Iris dataset
    X = pd.DataFrame(load_iris().data,
                     index = [*map(lambda x:f"iris_{x}", range(150))],
                     columns = [*map(lambda x: x.split(" (cm)")[0].replace(" ","_"), load_iris().feature_names)])

    y = pd.Series(load_iris().target,
                           index = X.index,
                           name = "Species")
    cmap = map_colors(y, mode=1, palette=palette, desat=desat)#y.map(lambda x:{0:"red",1:"green",2:"blue"}[x])

    if noise is not None:
        X_noise = pd.DataFrame(
            np.random.RandomState(0).normal(size=(X.shape[0], noise)),
            index=X_iris.index,
            columns=[*map(lambda x:f"noise_{x}", range(noise))]
        )
        X = pd.concat([X, X_noise], axis=1)
    return (X, y, cmap)

X, y, c = iris_data(noise=50)

# Get cross-validations
cv = list()
for i in range(10):
    idx_tr = np.random.choice(np.arange(X.shape[0]),size=100, replace=False)
    idx_te = set(range(X.shape[0])) - set(idx_tr)
    tr_te_splits = [idx_tr.tolist(), list(idx_te)]
    cv.append(tr_te_splits)

# Get hyperparameter searchspace
search_spaces = {
    "n_estimators": [1,10,50],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2", None],
    "min_samples_leaf": [1,2,3,5,8,13],

}
opt = GridSearchCV(RandomForestClassifier(random_state=0), search_spaces, scoring="accuracy", n_jobs=1, cv=cv)
opt.fit(X,y)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-26-d1117d10dfa6> in <module>()
     59 }
     60 opt = GridSearchCV(RandomForestClassifier(random_state=0), search_spaces, scoring="accuracy", n_jobs=1, cv=cv)
---> 61 opt.fit(X,y)

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    640 
    641         # if one choose to see train score, "out" will contain train score info

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    446     start_time = time.time()
    447 
--> 448     X_train, y_train = _safe_split(estimator, X, y, train)
    449     X_test, y_test = _safe_split(estimator, X, y, test, train)
    450 

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
    198             X_subset = X[np.ix_(indices, train_indices)]
    199     else:
--> 200         X_subset = safe_indexing(X, indices)
    201 
    202     if y is not None:

~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/__init__.py in safe_indexing(X, indices)
    144     if hasattr(X, "iloc"):
    145         # Work-around for indexing with read-only indices in pandas
--> 146         indices = indices if indices.flags.writeable else indices.copy()
    147         # Pandas Dataframes and Series
    148         try:

AttributeError: 'list' object has no attribute 'flags'

)

Since the input objects X and y are pandas they require named indicies I believe. If I convert them to numpy via .values method then it works. You just need to make sure the orders are correct if you do it this way.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM