简体   繁体   中英

How to create 3 tables using join in pandas/Python?

I need help / guidance with my code below to see what I am doing wrong or what I need to add. I am trying to create three tables using joins in pandas. Can anyone help me out with my code below? Right now it raises an error, `ValueError: All arrays must be of the same length`, at the last VARCHAR2 data type of table1. Thanks for the help. Here is my code.

   #folder_location = '/content/0_Test/' #You need to mount google drive and change this directory string
folder_location = '/content/drive/MyDrive/ITA HRP Data Conversion Materials/Compensation Extractor/0_Test/'

# Columns of the compensation worksheet that the rest of the script reads;
# every other column is dropped right after loading.
worksheet_columns = [
    'Row ID',
    'Existing P Level',
    'Existing P Code',
    'Existing P Variation Code',
    'Considered "Adds to Rate"? (Y/N)',
    'Calc Method',
    'Calculation Sequence',
    'MOU',
    'Frequency',
    'Rate or Flat Amount',
    'Percent',
    'FMS Dept.',
    'Class Code',
    'Reference ID',
    'Comp Mapping Tab',
]

# Load the workbook from the configured folder, then keep only the columns above.
df = pandas.read_excel(folder_location + 'Compensation Worksheet.xlsx')
df = df[worksheet_columns]
# The main filter: drop every row whose Reference ID is blank or contains "N/A".
df = df.dropna(subset=['Reference ID'])

# The boolean mask has to be assigned back to df; the previous statement built
# the mask, reduced it with .any() and discarded the result, so "N/A" rows were
# never removed. astype(str) keeps the .str accessor usable (and the mask free
# of NaN) when the column also holds non-text values such as numbers.
df = df[~df['Reference ID'].astype(str).str.contains('N/A', case=False)]
df = df.fillna("")  # make sure NaNs are simply empty strings

def basic_formatting(x):
  """Return ``x`` as a trimmed string with list separators unified to ';'.

  The value is converted with ``str()``, stripped of surrounding whitespace,
  and then run through a fixed sequence of literal substitutions.
  """
  # Order matters: newlines and commas must become ';' before the ';;' pass
  # runs. Each entry is a single str.replace pass, so ';;' is collapsed once.
  substitutions = (
    (' IN ', ''),
    ("\n", ";"),
    (',', ';'),
    (';;', ';'),
    ('‒', '-'),
  )
  text = str(x).strip()
  for old, new in substitutions:
    text = text.replace(old, new)
  return text
  
def specific_formatting(x):
  """Upper-case ``x`` and normalise its MOU/department list wording.

  The value is converted with ``str()``, stripped and upper-cased. Then
  'AND' and '&' become ';', the words 'MOUS' and 'MOU' are removed, and a
  leading 'ALL EXCEPT' (plus one following whitespace character) is replaced
  by the marker 'NOT'.

  Returns the normalised string.
  """
  # One upper() is enough: none of the replacements introduces lower case.
  str_x = str(x).strip().upper()
  # NOTE(review): these are plain substring replacements, so 'AND' or 'MOU'
  # inside a longer word is affected as well - confirm that is acceptable.
  str_x = str_x.replace('AND', ';').replace('&', ';')
  # 'MOUS' must be removed before 'MOU', otherwise a stray 'S' is left behind.
  str_x = str_x.replace('MOUS', '').replace('MOU', '')
  # Raw string so that \s reaches the regex engine instead of being an
  # invalid string escape sequence.
  return re.sub(r'^ALL\s*EXCEPT\s', 'NOT', str_x)

def class_formatting(x):
  """Return ``x`` as a trimmed string with every em dash (U+2014) removed.

  The previous pattern ``u'\\2014'`` was parsed as the octal escape ``\\201``
  followed by the digit ``4``, so em dashes were never actually matched.
  """
  str_x = str(x).strip()
  # A literal replacement is sufficient here; no regular expression is needed.
  return str_x.replace('\u2014', '')

# Formatting passes, applied cell by cell and strictly in this order:
# basic clean-up first, then the class-code pass, then the list-wording pass.
for _columns, _formatter in (
    (['MOU','FMS Dept.','Class Code','Existing P Variation Code','Existing P Code','Existing P Level'], basic_formatting),
    (['Class Code'], class_formatting),
    (['MOU','FMS Dept.','Class Code','Existing P Variation Code'], specific_formatting),
):
  df[_columns] = df[_columns].applymap(_formatter)

# Output tables, each stored as a list of rows whose first row is the header.
# Data rows are appended to these lists later in the script.
wd_m_compensation_information = [[
    'Reference ID',
    'Row ID',
    'Comp Mapping Tab',
    'Considered "Adds to Rate"? (Y/N)',
    'Calc Method',
    'Calculation Sequence',
    'Frequency',
    'Rate or Flat Amount',
    'Percent',
]]

wd_m_compensation_p_codes_and_levels = [
    ['Reference ID', 'Row ID', 'P Bonus Code', 'P Level'],
]
wd_m_compensation_p_varcodes = [
    ['Reference ID', 'Row ID', 'P Varcode'],
]

# Inclusion / exclusion pairs share a header but are separate list objects.
wd_m_compensation_mou_inclusions, wd_m_compensation_mou_exclusions = (
    [['Reference ID', 'Row ID', 'MOU']],
    [['Reference ID', 'Row ID', 'MOU']],
)
wd_m_compensation_fms_inclusions, wd_m_compensation_fms_exclusions = (
    [['Reference ID', 'Row ID', 'FMS']],
    [['Reference ID', 'Row ID', 'FMS']],
)
wd_m_compensation_class_inclusions, wd_m_compensation_class_exclusions = (
    [['Reference ID', 'Row ID', 'CLASS']],
    [['Reference ID', 'Row ID', 'CLASS']],
)

# Explode every worksheet row into the output tables: one information row per
# worksheet row, plus one row per ';'-separated entry in the code, level,
# variation-code, MOU, FMS and class-code cells.
for index, row in df.iterrows():
  ref_id = row['Reference ID']
  row_id = row['Row ID']

  # Comp information table
  wd_m_compensation_information.append([
      ref_id,
      row_id,
      row['Comp Mapping Tab'],
      row['Considered "Adds to Rate"? (Y/N)'],
      row['Calc Method'],
      row['Calculation Sequence'],
      row['Frequency'],
      row['Rate or Flat Amount'],
      row['Percent'],
  ])

  # P codes x P levels: every non-empty code is paired with every non-empty
  # level. The column names use a capital 'P'; the lower-case spelling used
  # before does not exist among the selected columns and raised KeyError.
  l_p_codes = row['Existing P Code'].split(';')
  l_p_levels = row['Existing P Level'].split(';')
  for code in l_p_codes:
    for level in l_p_levels:
      if code and level:
        wd_m_compensation_p_codes_and_levels.append([ref_id, row_id, code, level])

  # Varcodes
  for varcode in row['Existing P Variation Code'].split(';'):
    varcode = varcode.strip()
    if varcode:
      wd_m_compensation_p_varcodes.append([ref_id, row_id, varcode])

  # MOUs: an entry containing 'NOT' is an exclusion. Only the digits of the
  # entry are stored, which also removes the 'NOT' marker itself.
  for mou in row['MOU'].split(';'):
    mou = mou.strip()
    if not mou:
      continue
    mou_digits = re.sub(r"[^\d]+", '', mou)
    if 'NOT' in mou:
      wd_m_compensation_mou_exclusions.append([ref_id, row_id, mou_digits])
    else:
      wd_m_compensation_mou_inclusions.append([ref_id, row_id, mou_digits])

  # FMSs: same inclusion / exclusion rule and digit-only storage as MOUs.
  for fms in row['FMS Dept.'].split(';'):
    fms = fms.strip()
    if not fms:
      continue
    fms_digits = re.sub(r"[^\d]+", '', fms)
    if 'NOT' in fms:
      wd_m_compensation_fms_exclusions.append([ref_id, row_id, fms_digits])
    else:
      wd_m_compensation_fms_inclusions.append([ref_id, row_id, fms_digits])

  # Class Codes: stored as text (no digit filtering); only the 'NOT' marker
  # is removed from exclusions.
  for cCode in row['Class Code'].split(';'):
    cCode = cCode.strip()
    if not cCode:
      continue
    if 'NOT' in cCode:
      wd_m_compensation_class_exclusions.append([ref_id, row_id, cCode.replace('NOT', '')])
    else:
      wd_m_compensation_class_inclusions.append([ref_id, row_id, cCode])


 
import pandas as pd


# Column / data-type listing for WD_W_F41_BONUS_MAPPING.
data1 = {
    'COLUMN_NAME': [
        'P_CODE', 'NOT_JOB_CLASS', 'MOU', 'NOT_MOU', 'FMS',
        'NOT_FMS', 'CLASS_CODE', 'JOB_CLASS', 'COMP_PLAN_REF_ID', 'BONUS_PERCENT',
        'BONUS_AMOUNT', 'CALC_METHOD', 'FREQUENCY', 'NOT_CLASS_CODE', 'P_LEVEL',
    ],
    # Both lists must have the same length (15 entries each).
    'DATA_TYPE': ['VARCHAR2'] * 10 + ['NUMBER'] * 2 + ['VARCHAR2'] * 3,
}
table1 = pd.DataFrame(data1, columns=['COLUMN_NAME', 'DATA_TYPE']).assign(
    TABLE_NAME='WD_W_F41_BONUS_MAPPING'
)

# Column / data-type listing for WD_W_F41_LEGACY_PLAN.
data2 = {
    'COLUMN_NAME': [
        'CALC_METHOD', 'CALC_SEQUENCE', 'COMP_PLAN_REF_ID', 'ADD_RATE_OR_PAY', 'FREQUENCY',
    ],
    'DATA_TYPE': ['VARCHAR2', 'NUMBER', 'VARCHAR2', 'VARCHAR2', 'VARCHAR2'],
}
table2 = pd.DataFrame(data2, columns=['COLUMN_NAME', 'DATA_TYPE']).assign(
    TABLE_NAME='WD_W_F41_LEGACY_PLAN'
)

# Column / data-type listing for WD_W_VAR_CODE_REF_ID.
data3 = {
    'COLUMN_NAME': [
        'CALC_METHOD', 'CALC_SEQUENCE', 'COMP_PLAN_REF_ID', 'ADD_RATE_OR_PAY',
    ] + ['FREQUENCY'] * 7,
    'DATA_TYPE': ['VARCHAR2', 'NUMBER', 'NUMBER'] + ['VARCHAR2'] * 8,
}
table3 = pd.DataFrame(data3, columns=['COLUMN_NAME', 'DATA_TYPE']).assign(
    TABLE_NAME='WD_W_VAR_CODE_REF_ID'
)

# Stack the three listings; each table keeps its own 0-based row index.
df = pd.concat([table1, table2, table3])

Output:

print(df)
         COLUMN_NAME DATA_TYPE              TABLE_NAME
0             P_CODE  VARCHAR2  WD_W_F41_BONUS_MAPPING
1      NOT_JOB_CLASS  VARCHAR2  WD_W_F41_BONUS_MAPPING
2                MOU  VARCHAR2  WD_W_F41_BONUS_MAPPING
3            NOT_MOU  VARCHAR2  WD_W_F41_BONUS_MAPPING
4                FMS  VARCHAR2  WD_W_F41_BONUS_MAPPING
5            NOT_FMS  VARCHAR2  WD_W_F41_BONUS_MAPPING
6         CLASS_CODE  VARCHAR2  WD_W_F41_BONUS_MAPPING
7          JOB_CLASS  VARCHAR2  WD_W_F41_BONUS_MAPPING
8   COMP_PLAN_REF_ID  VARCHAR2  WD_W_F41_BONUS_MAPPING
9      BONUS_PERCENT  VARCHAR2  WD_W_F41_BONUS_MAPPING
10      BONUS_AMOUNT    NUMBER  WD_W_F41_BONUS_MAPPING
11       CALC_METHOD    NUMBER  WD_W_F41_BONUS_MAPPING
12         FREQUENCY  VARCHAR2  WD_W_F41_BONUS_MAPPING
13    NOT_CLASS_CODE  VARCHAR2  WD_W_F41_BONUS_MAPPING
14           P_LEVEL  VARCHAR2  WD_W_F41_BONUS_MAPPING
0        CALC_METHOD  VARCHAR2    WD_W_F41_LEGACY_PLAN
1      CALC_SEQUENCE    NUMBER    WD_W_F41_LEGACY_PLAN
2   COMP_PLAN_REF_ID  VARCHAR2    WD_W_F41_LEGACY_PLAN
3    ADD_RATE_OR_PAY  VARCHAR2    WD_W_F41_LEGACY_PLAN
4          FREQUENCY  VARCHAR2    WD_W_F41_LEGACY_PLAN
0        CALC_METHOD  VARCHAR2    WD_W_VAR_CODE_REF_ID
1      CALC_SEQUENCE    NUMBER    WD_W_VAR_CODE_REF_ID
2   COMP_PLAN_REF_ID    NUMBER    WD_W_VAR_CODE_REF_ID
3    ADD_RATE_OR_PAY  VARCHAR2    WD_W_VAR_CODE_REF_ID
4          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
5          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
6          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
7          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
8          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
9          FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID
10         FREQUENCY  VARCHAR2    WD_W_VAR_CODE_REF_ID

Because of "TABLE_NAME", the dataframes aren't the same size. I don't know what your data structure should look like, and I'm not sure whether this is the solution to your question, but maybe my answer will at least help you:

import pandas as pd

# Header shared by the three column / data-type listings below.
_schema_columns = ['COLUMN_NAME', 'DATA_TYPE']

# Each listing is written as (column name, data type) pairs so that a name
# and its type can never get out of step.
table1 = pd.DataFrame(
    [
        ('P_CODE', 'VARCHAR2'),
        ('NOT_JOB_CLASS', 'VARCHAR2'),
        ('MOU', 'VARCHAR2'),
        ('NOT_MOU', 'VARCHAR2'),
        ('FMS', 'VARCHAR2'),
        ('NOT_FMS', 'VARCHAR2'),
        ('CLASS_CODE', 'VARCHAR2'),
        ('JOB_CLASS', 'VARCHAR2'),
        ('COMP_PLAN_REF_ID', 'VARCHAR2'),
        ('BONUS_PERCENT', 'VARCHAR2'),
        ('BONUS_AMOUNT', 'NUMBER'),
        ('CALC_METHOD', 'NUMBER'),
        ('FREQUENCY', 'VARCHAR2'),
        ('NOT_CLASS_CODE', 'VARCHAR2'),
        ('P_LEVEL', 'VARCHAR2'),
    ],
    columns=_schema_columns,
)

table2 = pd.DataFrame(
    [
        ('CALC_METHOD', 'VARCHAR2'),
        ('CALC_SEQUENCE', 'NUMBER'),
        ('COMP_PLAN_REF_ID', 'VARCHAR2'),
        ('ADD_RATE_OR_PAY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
    ],
    columns=_schema_columns,
)

table3 = pd.DataFrame(
    [
        ('CALC_METHOD', 'VARCHAR2'),
        ('CALC_SEQUENCE', 'NUMBER'),
        ('COMP_PLAN_REF_ID', 'NUMBER'),
        ('ADD_RATE_OR_PAY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
    ],
    columns=_schema_columns,
)

# Stacked view of the three listings (shown as the cell result in a notebook).
pd.concat([table1, table2, table3])

That's the result, did you mean it?

这是结果

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM