简体   繁体   中英

Shap/numpy: all the input array dimensions for the concatenation axis must match exactly

Could someone please explain how to fix when this code (a reproducible example):

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest, RFECV 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt 
import pickle
#import neptune.new as neptune
import pandas as pd
import shap


    full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
    

    def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'): 

          cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
          acc_list = list()
          f1_list = list()
          precision_list = list()
          recall_list = list()
          auc_list = list()
    
    
          #for ROC curve
          tprs = []
          base_fpr = np.linspace(0, 1, 101)
          plt.figure(figsize=(5, 5))
          plt.axes().set_aspect('equal', 'datalim')
          count = 0
    
          list_shap_values = list()
          list_test_sets = list()
    
          for train_ix,test_ix in cv_outer.split(X_train):
                  split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc               
                  split_y_train, split_y_test = y_train[train_ix],y_train[test_ix]  #add in .iloc
    
                  cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
                  model = model_name
                  rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
                  pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
                  search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
                  best_model = search.best_estimator_[0]
                  selected_features = best_model.support_
    
                  split_x_test_selected_features = split_x_test[:,selected_features]
                  print(split_x_test_selected_features.shape)
                  best_model_shap = search.best_estimator_[1]
                  print(best_model_shap)
                  print(search.best_params_)
                  print(search.best_score_)
                  print(search.best_estimator_)
    
                  #transformed_x_test = best_model.transform(split_x_test)
                  yhat = search.predict(split_x_test)  #changed from best_model and split_x_test
    
                  accuracy = accuracy_score(split_y_test,yhat)
                  acc_list.append(accuracy)
    
                  f1_sc = f1_score(split_y_test,yhat)
                  f1_list.append(f1_sc)
    
                  precision_sc = precision_score(split_y_test,yhat)
                  precision_list.append(precision_sc)
    
                  recall_sc = recall_score(split_y_test,yhat)
                  recall_list.append(recall_sc)
                  
                  fpr, tpr, _ = roc_curve(split_y_test, yhat)
                  auc = metrics.auc(fpr,tpr)
                  auc_list.append(auc)
    
                  plt.plot(fpr, tpr, 'b', alpha=0.15)
                  tpr = np.interp(base_fpr, fpr, tpr)
                  tpr[0] = 0.0
                  tprs.append(tpr)
                  count +=1
    
                  print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))
    
                  explainer = shap.TreeExplainer(best_model_shap)
                  shap_values = explainer.shap_values(split_x_test_selected_features)
                  list_shap_values.append(shap_values)
                  list_test_sets.append(test_ix) 
    
          test_set = list_test_sets[0]
          shap_values = np.array(list_shap_values[0])
    
          for i in range(1,len(list_test_sets)):
              test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
              shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
    
    
          X_test_df = pd.DataFrame(full_X_train[test_set])
          cols = X_test_df.columns
          shap_sum = np.abs(shap_values[1,:,:]).mean(0)
          
    
          importance_df = pd.DataFrame({
               'column_name':cols,
               'shap_values':shap_sum
          }) 
               
          importance_df.sort_values('shap_values',ascending=False)
    
          print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
          print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
          print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
          print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
          print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))
    
    
          tprs = np.array(tprs)
          mean_tprs = tprs.mean(axis=0)
          tpr_std = tprs.std(axis=0)
    
          tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
          tprs_lower = mean_tprs - tpr_std
    
    
          plt.plot(base_fpr, mean_tprs, 'b')
          plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
          plt.plot([0, 1], [0, 1],'r--')
          plt.xlim([-0.01, 1.01])
          plt.ylim([-0.01, 1.01])
          plt.ylabel('True Positive Rate')
          plt.xlabel('False Positive Rate')
          plt.title('ROC for stratified 5-fold CV (blue line = mean)')
          plt.savefig(output_plt_file)
    
          print(importance_df)
    
          return
    
    
    param_grid = [{

                   'clf_cv__min_samples_leaf':[1,3,5],

                  }]
    
    run_model_with_grid_search(param_grid=param_grid)
            

Produces the error:

  File "/home/data/ml_models.py", line 180, in <module>
    run_model_with_grid_search(param_grid=param_grid)
  File "/home/data/ml_models.py", line 127, in run_model_with_grid_search
    shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
  File "<__array_function__ internals>", line 5, in concatenate
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 20 and the array at index 1 has size 16

I can generally see other questions like this on SO, but not a specific solution that I can understand how to apply here, so if someone could show me how to change this code I'd appreciate it.

Before you can fix a problem, you have to understand it. When it comes to shape errors, you have to know the shape of all variables involved. Sometimes that can be deduced, but often I have to add some print statements to be sure.

shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
    test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
    shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
 

shap_values starts as the first element of list_shap_values . I can't say exactly what that is, except it was created by appending shap_values in an iteration:

 list_shap_values.append(shap_values)

Then you appear to iterate on that list, can try to concatenate more elements from that list.

For the concatenate work, you need to know what the shap_values.shape is, as well as np.array(list_shap_values[i]).shape . I stress KNOW, no guessing allowed.

From the error message

ValueError: all the input array dimensions for the concatenation axis must match exactly, 
but along dimension 2, the array at index 0 has size 20 
and the array at index 1 has size 16

So in this iteration (I don't know if it's the first or not),

shap_values must be (n,m1,20) shape, and np.array(...) must be (n,m2,16).

You are concatenating on axis 1, but the other dimensions must match. Is that clear?

I wonder why you are doing this repeated concatenate . You already showed you know how to use list append. That is much faster.

I was going to suggest np.concatenate(list_shap_values, axis=1) , that is, doing a concatenate on all elements of the list - once, rather than piecemeal. But this error suggests that the arrays in list_shap_values are not compatible. Some have 20 on this axis=2, others 16, and who knows what else.

Anyways, that's what I can deduce from the code and the error. They don't pay me enough to take it any further :)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM