KeyError: "None of [Index([('A','B','C')], dtype='object')] are in the [columns]"

Question

I defined my X and y as follows:

# Build the feature frame by concatenating the column-name sequences.
# NOTE(review): `textual_columns` is not defined in any snippet shown here —
# confirm whether it exists in the real code or is a leftover.
X=df[text_columns + categorical_columns + textual_columns + numeric_columns]
# Indexed with a one-element list, so (assuming df is a pandas DataFrame) y is
# a one-column DataFrame rather than a Series.
y=df[['Label']]

where

# NOTE(review): a bare str, unlike the two lists below — presumably ['Tweet']
# is intended wherever these names are concatenated with `+`; confirm.
text_columns='Tweet'
# NOTE(review): the traceback reported for this code shows a tuple
# ('A','B','C') in this position, so the real value may differ from the list
# shown here; confirm.
categorical_columns=['A','B','C']
numeric_columns =['N1','N2']

The column names are just examples. Then I split the data into train and test sets:

# Hold out one fifth of the rows as the test set; the split is seeded
# (random_state=38) and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=(1/5), random_state=38, stratify=y)

I am trying to build a customized transformer as follows:

Categorical

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that selects the categorical columns from the input.

    The step keeps no state: ``fit`` is a no-op and ``transform`` indexes
    ``X`` using the ``categorical_columns`` name defined outside the class.
    """

    def __init__(self):
        super().__init__()

    # Return self nothing else to do here
    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not used."""
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        """Return ``'No'`` when ``obj == 0`` and ``'Yes'`` otherwise.

        Not called anywhere in this class as shown.
        """
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        """Return ``.values`` of ``X`` indexed with ``[categorical_columns]``."""
        # Categorical features to pass down the categorical pipeline
        # NOTE(review): the extra pair of brackets wraps the whole sequence of
        # names into a single key. The traceback reported for this line shows
        # pandas looking for one column labelled ('A','B','C') and raising
        # KeyError; presumably X[list(categorical_columns)] is intended.
        return X[[categorical_columns]].values
    
    def get_feature_names(self):
        """Return the column labels of ``X`` as a list."""
        # NOTE(review): this method takes no ``X`` argument, so the name
        # resolves to a variable outside the class (presumably the full
        # feature frame), not to the data passed to transform(); confirm.
        return X.columns.tolist()
    
# Defining the steps in the categorical pipeline
categorical_pipeline = Pipeline(steps=[
    # Step 1: select the categorical columns.
    ('categorical_transformer', CategoricalTransformer()),
    # Step 2: one-hot encode them; handle_unknown='ignore' is passed so that
    # categories not seen during fit do not raise at transform time.
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])

Text

class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self nothing else to do here
    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not used."""
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        """Return ``'No'`` when ``obj == 0`` and ``'Yes'`` otherwise.

        Not called by any of the transformer methods shown alongside it.
        """
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        """Return ``.values`` of the ``'Tweet'`` column selection of ``X``."""
        # Text features to pass down the text pipeline
        # NOTE(review): indexing with a one-element list yields a 2-D
        # (n_rows, 1) array when X is a DataFrame, while CountVectorizer is
        # documented to take an iterable of strings — presumably X['Tweet']
        # (1-D) is what the next step needs; confirm once the categorical
        # error no longer stops the run first.
        return X[['Tweet']].values
    
    def get_feature_names(self):
        """Return the column labels of ``X`` as a list."""
        # NOTE(review): no ``X`` parameter is declared, so this name resolves
        # outside the method, not to the data passed to transform(); confirm.
        return X.columns.tolist()
    
# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    # Step 1: select the tweet text column.
    ('text_transformer', TextTransformer()),
    # Step 2: turn the text into token-count features.
    ('cv', CountVectorizer())])

Numeric

class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that selects the numeric columns and masks infinities.

    The step keeps no state: ``fit`` is a no-op and ``transform`` indexes
    ``X`` using the ``numeric_columns`` name defined outside the class.
    """
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are not used."""
        return self

    def transform(self, X, y=None):
        """Return the selected numeric values with +/-inf replaced by NaN."""
        # Numerical features to pass down the numerical pipeline
        # NOTE(review): same double-bracket pattern as the categorical
        # selection that raises KeyError in the reported traceback; the
        # question states this step fails similarly. Presumably
        # X[list(numeric_columns)] is intended.
        X = X[[numeric_columns]]
        # Infinite values become NaN — presumably so the imputer that follows
        # in the numerical pipeline can fill them; confirm.
        X = X.replace([np.inf, -np.inf], np.nan)
        return X.values
    
    def get_feature_names(self):
        """Return the column labels of ``X`` as a list."""
        # NOTE(review): no ``X`` parameter is declared, so this name resolves
        # outside the class, not to the frame built in transform(); confirm.
        return X.columns.tolist()
    
# Defining the steps in the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    # Step 1: select the numeric columns and turn +/-inf into NaN.
    ('numerical_transformer', NumericalTransformer()),
    # Step 2: fill missing values from the 2 nearest neighbours.
    ('imputer', KNNImputer(n_neighbors=2)),
    # Step 3: rescale each feature with min-max scaling.
    ('minmax', MinMaxScaler())])

Then I use feature union:

from sklearn.pipeline import FeatureUnion

# Each sub-pipeline receives the same input; FeatureUnion concatenates their
# outputs side by side into one feature matrix.
union_pipeline = FeatureUnion(transformer_list=[
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', numerical_pipeline), 
    ('text_pipeline', text_pipeline)])

# Combining the custom imputer with the categorical, text and numerical pipeline
# The union is wrapped as the single step of an outer Pipeline.
preprocess_pipeline = Pipeline(steps=[('full_pipeline', union_pipeline)])

But when I run the model

# MODEL
from sklearn import tree

# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# Chain preprocessing and the classifier so both are fitted together.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('model', decision_tree)])

# fit on the complete pipeline
# NOTE(review): this fits on the full X, y rather than the X_train, y_train
# split created earlier — confirm that is intended.
training = full_pipeline.fit(X, y)
print(full_pipeline.get_params())

# metrics
# Scored on the same X, y used for fitting, so despite the variable name this
# is the training accuracy (as the printed label says), expressed as a
# percentage rounded to two decimals.
score_test = \
    round(training.score(X, y) * 100, 2)
print(f"\nTraining Accuracy: {score_test}")

I get this error:

---> 12 training = full_pipeline.fit(X, y)
<ipython-input-69-051568c7b272> in transform(self, X, y)
     21     def transform(self, X, y=None):
     22         # Categorical features to pass down the categorical pipeline
---> 23         return X[[('A','B','C')]].values
     24 
     25     def get_feature_names(self):
....

KeyError: "None of [Index([('A','B','C')], dtype='object')] are in the [columns]"

I get a similar error also with numeric columns. TextTransformer seems to be the only one which works with no errors.

I guess there is a problem with the dataset/columns that I am considering.

Answer 1

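If numeric_columns (and any of the others) is already a list or tuple of column names, wrapping it in another pair of brackets makes pandas look for a single column whose label is the whole sequence, which is exactly what the KeyError reports. So you do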

X[numeric_columns]

instead of

X[[numeric_columns]]

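to select that subset of columns from a pandas DataFrame. Note that if the variable is a tuple rather than a list (as the ('A','B','C') in the traceback suggests), convert it first, e.g. X[list(numeric_columns)], because pandas treats a tuple passed directly as a single column key.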

KeyError: "None of [Index([('A','B','C')], dtype='object')] are in the [columns]"

Question

1 answer

solution1
1 ACCEPTED 2021-05-17 13:10:28

KeyError: "None of [Index([('A','B','C')], dtype='object')] are in the [columns]"

Question

1 answer

solution1 1 ACCEPTED 2021-05-17 13:10:28

solution1
1 ACCEPTED 2021-05-17 13:10:28