首页 > 解决方案 > 在 python 中删除一个 DataFrame 列

问题描述

我迫切需要帮助。我正在尝试获取数据框的维度,但总是得到 31 列而不是 30 列(期望值是 30,实际得到 31)。我尝试过 reset_index(drop=True),但仍然得到同样的错误。非常感谢任何帮助。请多保重。

def _read_breast_cancer():
    """Load ``wdbc.data`` and return it without the ``ptid`` column.

    The file is read with 32 explicit column names; after ``ptid`` is
    dropped, 31 named columns remain (``diagnosis`` plus 30 measurements).
    """
    datafile = 'wdbc.data'

    # NOTE(review): three names carry a trailing space ('mean_symmetry ',
    # 'std_err_symmetry ', 'worst_symmetry '). They are kept exactly as they
    # were so existing column lookups keep working -- confirm whether the
    # spaces are intentional.
    bc_columns = [
        'ptid', 'diagnosis', 'mean_radius', 'mean_texture',
        'mean_perimeter', 'mean_area',
        'mean_smoothness', 'mean_compactness', 'mean_concavity',
        'mean_concave_pts', 'mean_symmetry ',
        'mean_fractal_dim', 'std_err_radius', 'std_err_texture',
        'std_err_perimeter', 'std_err_area',
        'std_err_smoothness', 'std_err_compactness',
        'std_err_concavity', 'std_err_concave_pts',
        'std_err_symmetry ', 'std_err_fractal_dim', 'worst_radius',
        'worst_texture', 'worst_perimeter',
        'worst_area', 'worst_smoothness', 'worst_compactness',
        'worst_concavity', 'worst_concave_pts',
        'worst_symmetry ', 'worst_fractal_dim',
    ]

    data = pd.read_csv(datafile, skipinitialspace=True, names=bc_columns)
    data.drop(labels=['ptid'], axis=1, inplace=True)

    # The helper is defined elsewhere in the project; its result was never
    # used here, but the call is kept in case it has side effects.
    get_class_list_dict(data['diagnosis'])

    return data


def _read_hyperthyroidism():
    """Load and merge ``allhyper.data`` and ``allhyper.test``.

    '?' markers become NaN, the trailing part of ``diag_class`` (split off
    into a temporary ``ptid`` column) is discarded, ``diag_class`` is moved
    to the first position, and the measurement columns are made numeric.
    """
    datafile1 = 'allhyper.data'  # tab delimited, no header
    datafile2 = 'allhyper.test'  # comma delimited, no header

    ht_columns = [
        'age', 'Gender', 'on thyroxine', 'query on thyroxine',
        'on antithyroid medication', 'sick',
        'pregnant', 'thyroid surgery', 'I131 treatment',
        'query hypothyroid', 'query hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH measured', 'TSH', 'T3 measured',
        'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
        'FTI measured', 'FTI', 'TBG measured', 'TBG',
        'referral source', 'diag_class',
    ]

    data1 = pd.read_csv(datafile1, sep='\t', skipinitialspace=True,
                        names=ht_columns)
    data2 = pd.read_csv(datafile2, skipinitialspace=True, names=ht_columns)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    data = pd.concat([data1, data2], ignore_index=True)

    data = data.replace(to_replace='?', value=float('nan'))

    # Raw string avoids the invalid-escape warning; the pattern value is
    # unchanged (any one character followed by a literal '|').
    data[['diag_class', 'ptid']] = data['diag_class'].str.split(
        pat=r'.\|', expand=True)

    # Move the class label to the front and discard the split-off id.
    diag_class = data['diag_class']
    data.drop(labels=['diag_class', 'ptid'], axis=1, inplace=True)
    data.insert(0, 'diag_class', diag_class)

    numeric_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

    return data


def _read_cervical_cancer():
    """Load ``risk_factors_cervical_cancer.csv`` with renamed columns.

    '?' markers become NaN, the eight diagnosis/test columns are dropped,
    ``Biopsy`` is re-inserted as the first column, and the count/duration
    columns are made numeric.
    """
    datafile = 'risk_factors_cervical_cancer.csv'

    cc_columns = (
        'Age', 'Num_sex_partners', 'First_sex_intercourse',
        'Num_pregnancies',
        'Smokes', 'Smokes_years', 'Smokes_packs_year',
        'Hormonal_Contraceps',
        'Hormonal_Contraceps_years', 'IUD', 'IUD_years', 'STD',
        'STD_number',
        'STD_condylomatosis', 'STDscervical_condylomatosis',
        'STD_vaginal_condylomatosis',
        'STD_vulvo_perin_condylomatosis', 'STD_syphilis',
        'STD_pelvic_inflam_disease',
        'STD_genital_herpes', 'STD_molluscum_contagiosum',
        'STD_AIDS', 'STD_HIV', 'STD_HepB',
        'STD_HPV', 'STD_Num_diagnosis',
        'STD_Time_since_first_diag', 'STDs_Time_since_last_diag',
        'Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann', 'Schiller',
        'Citology', 'Biopsy',
    )

    # The file's own header row is read and then overwritten with cc_columns.
    data = pd.read_csv(datafile, skipinitialspace=True)
    data.columns = cc_columns
    data = data.replace(to_replace='?', value=float('nan'))

    biopsy_class = data['Biopsy']
    data.drop(labels=['Dx_Cancer', 'Dx_CIN', 'Dx_HPV', 'Dx', 'Hinselmann',
                      'Schiller', 'Citology', 'Biopsy'],
              axis=1, inplace=True)
    data.insert(0, 'Biopsy', biopsy_class)

    numeric_columns = [
        'Num_sex_partners', 'First_sex_intercourse', 'Num_pregnancies',
        'Smokes_years', 'Smokes_packs_year',
        'Hormonal_Contraceps_years', 'IUD_years',
        'STD_number', 'STD_Time_since_first_diag',
        'STDs_Time_since_last_diag',
    ]
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

    return data


def _read_liver_cancer():
    """Load the ILPD liver file, shorten ``Gender`` and move ``Selector`` first."""
    # comma delimited, no header
    datafile = 'Indian Liver Patient Dataset (ILPD).csv'
    ld_columns = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot',
                  'TP', 'ALB', 'A/G Ratio', 'Selector']

    data = pd.read_csv(datafile, skipinitialspace=True, names=ld_columns)

    data.loc[data['Gender'] == 'Male', 'Gender'] = 'M'
    data.loc[data['Gender'] == 'Female', 'Gender'] = 'F'

    selector_class = data['Selector']
    data.drop(labels=['Selector'], axis=1, inplace=True)

    data.insert(0, 'Selector', selector_class)
    data.reset_index(drop=True, inplace=True)

    return data


def read_data(dataset_id):
    """Read one of the known disease datasets into a DataFrame.

    Args:
        dataset_id: One of 'breast_cancer', 'hyperthyroidism',
            'cervical_cancer' or 'liver_cancer'.

    Returns:
        The loaded DataFrame, or None when ``dataset_id`` is not one of the
        known identifiers. For every dataset except 'breast_cancer' the
        class-label column is moved to position 0.

    Side effects:
        Reads the dataset file(s) from the current working directory and
        prints the first 20 rows of the result.
    """
    data = None
    # Begin CODE
    if dataset_id == 'breast_cancer':
        data = _read_breast_cancer()
    elif dataset_id == 'hyperthyroidism':
        data = _read_hyperthyroidism()
    elif dataset_id == 'cervical_cancer':
        data = _read_cervical_cancer()
    elif dataset_id == 'liver_cancer':
        data = _read_liver_cancer()
    # End CODE

    # Unknown ids leave data as None; skip the preview instead of crashing.
    if data is not None:
        print(data.head(20))
    return data


def dimensions(dataset_id, dataset):
    """Return the size of *dataset* as ``(rows, columns)``.

    Args:
        dataset_id: Identifier of the dataset; accepted but not used here.
        dataset: The pandas DataFrame to measure.

    Returns:
        A tuple ``(num_inst, num_feat)``. ``num_feat`` counts every column
        in the frame, whatever it holds.
    """
    # shape gives the same counts as inspecting the first row, and also
    # works when the frame has no rows (iloc[0] raised IndexError there).
    num_inst, num_feat = dataset.shape
    return (num_inst, num_feat)

标签: python pandas

解决方案


如果你想从 DataFrame 中删除一列,你可以这样做。

如果要删除单列:

# drop() returns a new DataFrame; assign the result (or pass inplace=True) to keep the change.
df.drop(['column_name'], axis = 1) 

如果要删除多列:

# drop() returns a new DataFrame; assign the result (or pass inplace=True) to keep the change.
df.drop(['Column1', 'Column2'], axis = 1) 

如果您想根据列名以外的其他条件删除列,可以在下方发表评论,我会相应地更新答案。希望能帮助到你!


推荐阅读