繁体   English   中英

在 python 中删除 DataFrame 列

[英]drop a DataFrame column in python

我迫切需要帮助。我正在尝试获取 DataFrame 的维度,但总是得到 31 列而不是 30 列(期望值是 30,实际得到 31)。我试过 reset_index(drop=True),但仍然得到同样的错误。非常感谢任何帮助。请注意安全。

def _move_column_to_front(frame, column):
    """Return *frame* with *column* relocated to position 0."""
    values = frame[column]
    frame = frame.drop(labels=[column], axis=1)
    frame.insert(0, column, values)
    return frame


def _read_breast_cancer():
    """Load 'wdbc.data' and drop the 'ptid' column.

    After the drop, 'diagnosis' is the first column, followed by the 30
    measurement columns (31 columns in total).
    """
    # NOTE(review): three names below carry a trailing space
    # ('mean_symmetry ', 'std_err_symmetry ', 'worst_symmetry '). They are
    # kept exactly as found because other code may look the columns up by
    # these exact names -- confirm whether the spaces are intentional.
    bc_columns = [
        'ptid', 'diagnosis', 'mean_radius', 'mean_texture',
        'mean_perimeter', 'mean_area',
        'mean_smoothness', 'mean_compactness', 'mean_concavity',
        'mean_concave_pts', 'mean_symmetry ',
        'mean_fractal_dim', 'std_err_radius', 'std_err_texture',
        'std_err_perimeter', 'std_err_area',
        'std_err_smoothness', 'std_err_compactness',
        'std_err_concavity', 'std_err_concave_pts',
        'std_err_symmetry ', 'std_err_fractal_dim', 'worst_radius',
        'worst_texture', 'worst_perimeter',
        'worst_area', 'worst_smoothness', 'worst_compactness',
        'worst_concavity', 'worst_concave_pts',
        'worst_symmetry ', 'worst_fractal_dim',
    ]

    data = pd.read_csv('wdbc.data', skipinitialspace=True, names=bc_columns)
    data = data.drop(labels=['ptid'], axis=1)

    # The result is not used in this function; the call is retained in case
    # the helper has side effects -- TODO confirm and remove if it is pure.
    get_class_list_dict(data['diagnosis'])
    return data


def _read_hyperthyroidism():
    """Load and merge 'allhyper.data' and 'allhyper.test'.

    The raw 'diag_class' field is split on the pattern ``.\\|``; only the
    part before the separator is kept, and it is moved to the first column.
    """
    ht_columns = [
        'age', 'Gender', 'on thyroxine', 'query on thyroxine',
        'on antithyroid medication', 'sick',
        'pregnant', 'thyroid surgery', 'I131 treatment',
        'query hypothyroid', 'query hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH measured', 'TSH', 'T3 measured',
        'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
        'FTI measured', 'FTI', 'TBG measured', 'TBG',
        'referral source', 'diag_class',
    ]
    numeric_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']

    # tab delimited, no header
    data1 = pd.read_csv('allhyper.data', sep='\t', skipinitialspace=True,
                        names=ht_columns)
    # comma delimited, no header
    data2 = pd.read_csv('allhyper.test', skipinitialspace=True,
                        names=ht_columns)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    data = pd.concat([data1, data2], ignore_index=True)
    data = data.replace(to_replace='?', value=float('nan'))

    # Raw string keeps the regex identical while avoiding the invalid
    # '\|' escape warning. Piece 0 is the class label; the remainder (the
    # id the original code named 'ptid') was dropped immediately anyway.
    pieces = data['diag_class'].str.split(pat=r'.\|', expand=True)
    data = data.drop(labels=['diag_class'], axis=1)
    data.insert(0, 'diag_class', pieces[0])

    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
    return data


def _read_cervical_cancer():
    """Load 'risk_factors_cervical_cancer.csv' with 'Biopsy' as column 0.

    The file's own header row is consumed by ``read_csv`` and then replaced
    with the short names in ``cc_columns``. The other diagnosis/target
    columns are dropped.
    """
    cc_columns = (
        'Age', 'Num_sex_partners', 'First_sex_intercourse',
        'Num_pregnancies',
        'Smokes', 'Smokes_years', 'Smokes_packs_year',
        'Hormonal_Contraceps',
        'Hormonal_Contraceps_years', 'IUD', 'IUD_years', 'STD',
        'STD_number',
        'STD_condylomatosis', 'STDscervical_condylomatosis',
        'STD_vaginal_condylomatosis',
        'STD_vulvo_perin_condylomatosis', 'STD_syphilis',
        'STD_pelvic_inflam_disease',
        'STD_genital_herpes', 'STD_molluscum_contagiosum',
        'STD_AIDS', 'STD_HIV', 'STD_HepB',
        'STD_HPV', 'STD_Num_diagnosis',
        'STD_Time_since_first_diag', 'STDs_Time_since_last_diag',
        'Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann', 'Schiller',
        'Citology', 'Biopsy',
    )
    dropped_columns = ['Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann',
                       'Schiller', 'Citology', 'Biopsy']
    numeric_columns = [
        'Num_sex_partners', 'First_sex_intercourse', 'Num_pregnancies',
        'Smokes_years', 'Smokes_packs_year',
        'Hormonal_Contraceps_years', 'IUD_years',
        'STD_number', 'STD_Time_since_first_diag',
        'STDs_Time_since_last_diag',
    ]

    data = pd.read_csv('risk_factors_cervical_cancer.csv',
                       skipinitialspace=True)
    data.columns = cc_columns
    data = data.replace(to_replace='?', value=float('nan'))

    biopsy_class = data['Biopsy']
    data = data.drop(labels=dropped_columns, axis=1)
    data.insert(0, 'Biopsy', biopsy_class)

    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
    return data


def _read_liver_cancer():
    """Load the ILPD file, abbreviate 'Gender', and put 'Selector' first."""
    ld_columns = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot',
                  'TP', 'ALB', 'A/G Ratio', 'Selector']

    # comma delimited, no header
    data = pd.read_csv('Indian Liver Patient Dataset (ILPD).csv',
                       skipinitialspace=True, names=ld_columns)

    data.loc[data['Gender'] == 'Male', 'Gender'] = 'M'
    data.loc[data['Gender'] == 'Female', 'Gender'] = 'F'

    data = _move_column_to_front(data, 'Selector')
    return data.reset_index(drop=True)


# Maps each supported dataset_id to the loader that builds its DataFrame.
_DATASET_READERS = {
    'breast_cancer': _read_breast_cancer,
    'hyperthyroidism': _read_hyperthyroidism,
    'cervical_cancer': _read_cervical_cancer,
    'liver_cancer': _read_liver_cancer,
}


def read_data(dataset_id):
    """Load and clean one of the supported disease datasets.

    Supported ids are 'breast_cancer', 'hyperthyroidism',
    'cervical_cancer' and 'liver_cancer'. Data files are read from the
    current working directory. In every returned frame the class-label
    column is the first column, so the column count is the number of
    features plus one.

    Args:
        dataset_id: Name of the dataset to load.

    Returns:
        The cleaned ``pandas.DataFrame``, or ``None`` when *dataset_id* is
        not one of the supported ids.

    Side effects:
        Prints the first 20 rows of the loaded frame.
    """
    reader = _DATASET_READERS.get(dataset_id)
    if reader is None:
        # Nothing was loaded, so there is nothing to preview; calling
        # .head() on None would raise AttributeError.
        return None

    data = reader()
    print(data.head(20))
    return data


def dimensions(dataset_id, dataset):
    """Return ``(num_instances, num_columns)`` for *dataset*.

    The column count covers every column of the frame, so a class-label
    column that is still present is included in the second value.

    Args:
        dataset_id: Unused; kept so existing callers keep working.
        dataset: A ``pandas.DataFrame``.

    Returns:
        A 2-tuple ``(number of rows, number of columns)``.
    """
    # ``shape`` also works for an empty frame, where ``iloc[0]`` would
    # raise IndexError.
    num_inst, num_feat = dataset.shape
    return (num_inst, num_feat)

如果你想从 DataFrame 中删除一列,你可以这样做。

如果要删除单列:

df.drop(['column_name'], axis = 1) 

如果要删除多列:

df.drop(['Column1', 'Column2'], axis = 1) 

如果您想根据列名以外的其他条件来删除列,可以在下方发表评论,我会相应地更新答案。希望能帮到你。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM