將文本列拆分為 Pandas DataFrame 中的兩列，用於不同的數據幀

Question

我有六個不同的 DataFrame，其中一些含有“NaN”值。 我先嘗試了不帶 if 語句的版本，但它只適用於不含“NaN”值的 DataFrame（在其他 DataFrame 上運行時會出現此錯誤：“ValueError: Columns must be same length as key”）。 我想做的是創建一個函數，把 df 中的每個文本列拆分成兩列（空氣質量數值和單位）。

def formatting(df):
    """Split "<value> <unit>" text columns into a numeric and a unit column.

    Every measurement column (for example ``'1.3 µg/m3'``) is split on its
    first space.  The leading part replaces the original column as a float
    and the remainder is stored in ``<name>_UNIT``.  Cells that cannot be
    parsed as a number (``'None'``, ``NaN``, empty text, ...) become ``NaN``
    and get a missing unit, so frames whose columns are partly or entirely
    empty are processed instead of failing with
    "Columns must be same length as key".

    Side effects:
        * ``df`` is modified in place: ``'PM2.5'`` is renamed to ``'PM25'``
          and the value/unit columns are written back to it.
        * The global pandas option ``display.float_format`` is set to two
          decimals.

    Args:
        df: DataFrame with a ``'TIMESTAMP'`` column and the measurement
            columns stored as text.  Measurement columns that are absent are
            created and filled with missing values so the returned layout is
            always the same.

    Returns:
        A DataFrame containing ``'TIMESTAMP'`` followed by each value/unit
        column pair in a fixed order.

    Raises:
        KeyError: if ``df`` has no ``'TIMESTAMP'`` column.
    """
    # setting all floats to 2 digits in general
    pd.options.display.float_format = "{:.2f}".format

    # Normalise the one column name that contains a dot; a no-op when the
    # column is absent or was already renamed.
    df.rename(columns={'PM2.5': 'PM25'}, inplace=True)

    # Order matters: it defines the column order of the returned frame.
    measures = ('NO2', 'SO2', 'O3', 'PM10', 'PM25', 'CO', 'TEMP', 'HUM',
                'AIRPRES', 'WS', 'WD', 'NO', 'BENZENE')
    # Columns whose numbers may carry a thousands separator ("1,013.25").
    thousands_sep_cols = {'AIRPRES'}

    ordered = ['TIMESTAMP']
    for col in measures:
        unit_col = col + '_UNIT'
        if col in df.columns:
            # n=1 keeps everything after the first space as the unit;
            # reindex guarantees both parts exist even when no cell in the
            # column contains a space (e.g. a column of only 'None').
            parts = (df[col].astype(str)
                     .str.split(' ', n=1, expand=True)
                     .reindex(columns=[0, 1]))
            values, units = parts[0], parts[1]
            if col in thousands_sep_cols:
                values = values.astype(str).str.replace(',', '', regex=False)
            # On an already formatted frame the split finds no unit text;
            # keep the units stored earlier rather than blanking them.
            if units.isna().all() and unit_col in df.columns:
                units = df[unit_col]
        else:
            values = pd.Series(float('nan'), index=df.index)
            units = pd.Series(None, index=df.index, dtype=object)

        # errors='coerce' turns unparsable text such as 'None' into NaN.
        df[col] = pd.to_numeric(values, downcast="float", errors='coerce')
        df[unit_col] = units
        ordered += [col, unit_col]

    # order columns
    return df[ordered]

然後我打算把所有 df 放進一個列表，再用 for 循環對每個 df 調用這個函數。

下面是其中一個 DataFrame 的列名和前三行數據：

print(gharb.head(3).to_dict())
{'TIMESTAMP': {0: '26/01/2022 14:00', 1: '26/01/2022 13:00', 2: '26/01/2022 12:00'}, 
'NO2': {0: '1.3 µg/m3', 1: '1.41 µg/m3', 2: '2.11 µg/m3'}, 
'SO2': {0: '0.78 µg/m3', 1: '0.81 µg/m3', 2: '0.89 µg/m3'}, 
'O3': {0: '90.05 µg/m3', 1: '88.33 µg/m3', 2: '86.41 µg/m3'}, 
'PM10': {0: '1.9 µg/m3', 1: '2.18 µg/m3', 2: '3.28 µg/m3'}, 
'CO': {0: '0.19 mg/m3', 1: '0.19 mg/m3', 2: '0.19 mg/m3'}, 
'TEMP': {0: '10.1 °C', 1: '9.99 °C', 2: '9.79 °C'}, 
'HUM': {0: '64.98 %', 1: '63.59 %', 2: '64.63 %'}, 
'WS': {0: '4.92 m/s', 1: '5.24 m/s', 2: '5.37 m/s'}, 
'WD': {0: '249.15 Deg', 1: '232.48 Deg', 2: '238.07 Deg'}, 
'NO': {0: '0.12 µg/m3', 1: '0.14 µg/m3', 2: '0.31 µg/m3'}, 
'PM2.5': {0: 'None', 1: 'None', 2: 'None'}, 
'AIRPRES': {0: 'None', 1: 'None', 2: 'None'}, 
'BENZENE': {0: 'None', 1: 'None', 2: 'None'}}

Answer 1

這是一種適用於您的輸入數據的方法：

def formatting(df):
    """Split "<value> <unit>" text columns into a numeric and a unit column.

    For each known measurement column present in ``df`` the text is split on
    its first space.  When the split yields two parts, the first replaces the
    column and the second is stored in ``<name>_UNIT``.  When no cell
    contains a unit (for example a column holding only ``'None'``, or a
    column that is already numeric), a message is printed and the column is
    left without a new unit column.  In every case the column is finally
    converted to float, with unparsable cells becoming ``NaN``.

    Side effects:
        * ``df`` is modified in place (value/unit columns are written to it).
        * The global pandas option ``display.float_format`` is set to two
          decimals.
        * Messages are printed for columns that are missing or have no unit.

    Args:
        df: DataFrame with a ``'TIMESTAMP'`` column plus any of the
            measurement columns listed in ``cols``.

    Returns:
        A DataFrame with ``'TIMESTAMP'`` followed by each processed column
        and, where available, its ``_UNIT`` column.

    Raises:
        KeyError: if ``df`` has no ``'TIMESTAMP'`` column.
    """
    # setting all floats to 2 digits in general
    pd.options.display.float_format = "{:.2f}".format

    # define all the columns to perform the split
    # could also be an input of the function
    cols = ['NO2', 'SO2', 'O3', 'PM10', 'CO', 'TEMP', 'HUM', 'WS',
            'WD', 'NO', 'PM2.5', 'AIRPRES', 'BENZENE']

    # columns to select for the result, in output order
    res_cols = ['TIMESTAMP']
    for col in cols:
        # A frame may lack some measurements entirely; skip them instead of
        # failing with a KeyError.
        if col not in df.columns:
            print(f'Column {col} not found, skipped')
            continue

        res_cols.append(col)
        unit_col = col + '_UNIT'

        # split once only, so a unit containing spaces stays in one piece
        parts = df[col].astype(str).str.split(' ', expand=True, n=1)
        if parts.shape[1] == 2:
            df[col] = parts[0]
            df[unit_col] = parts[1]
            res_cols.append(unit_col)
        else:
            # No cell had a "<value> <unit>" shape.
            print(f'Error for column {col}')
            # When the frame was already formatted, keep the unit column
            # produced by the earlier run instead of dropping it.
            if unit_col in df.columns:
                res_cols.append(unit_col)

        # from string to float, coerce (aka replace by NaN) if not possible
        df[col] = pd.to_numeric(df[col], downcast="float", errors='coerce')

    # order columns
    return df[res_cols]

現在您會得到如下結果。 如果您不在意這些錯誤提示，可以刪除 except 中的 print。

# Apply the formatter and keep the returned, re-ordered frame.
# `df` is presumably the sample frame from the question -- confirm.
# The three commented lines right after the call reproduce what it prints
# for columns in which no "<value> <unit>" text was found.
df = formatting(df)
# Error for column PM2.5
# Error for column AIRPRES
# Error for column BENZENE
# Display the result; its printed form is reproduced in the comments below.
print(df)
#           TIMESTAMP  NO2 NO2_UNIT  SO2 SO2_UNIT    O3 O3_UNIT  PM10 PM10_UNIT  \
# 0  26/01/2022 14:00 1.30    µg/m3 0.78    µg/m3 90.05   µg/m3  1.90     µg/m3   
# 1  26/01/2022 13:00 1.41    µg/m3 0.81    µg/m3 88.33   µg/m3  2.18     µg/m3   
# 2  26/01/2022 12:00 2.11    µg/m3 0.89    µg/m3 86.41   µg/m3  3.28     µg/m3   

#     CO CO_UNIT  TEMP TEMP_UNIT   HUM HUM_UNIT   WS WS_UNIT     WD WD_UNIT  \
# 0 0.19   mg/m3 10.10        °C 64.98        % 4.92     m/s 249.15     Deg   
# 1 0.19   mg/m3  9.99        °C 63.59        % 5.24     m/s 232.48     Deg   
# 2 0.19   mg/m3  9.79        °C 64.63        % 5.37     m/s 238.07     Deg   

#     NO NO_UNIT  PM2.5  AIRPRES  BENZENE  
# 0 0.12   µg/m3    NaN      NaN      NaN  
# 1 0.14   µg/m3    NaN      NaN      NaN  
# 2 0.31   µg/m3    NaN      NaN      NaN

請注意，如果您對已經處理過的 df 再次運行該函數，每一列都會打印一條錯誤提示，但結果仍然正確。

將文本列拆分為 Pandas DataFrame 中的兩列，用於不同的數據幀

問題描述

1 個解決方案

解決方案1
0 2022-01-26 14:40:56

將文本列拆分為 Pandas DataFrame 中的兩列，用於不同的數據幀

問題描述

1 個解決方案

解決方案1 0 2022-01-26 14:40:56

解決方案1
0 2022-01-26 14:40:56