I need help/guidance with my code below to see whether I am doing something wrong or what I need to add. I am trying to create three tables using joins in pandas. Can anyone help me out with the code below? Right now it raises a `ValueError: All arrays must be of the same length` for table1, at the last VARCHAR2 data type entry. Thanks for the help. Here is my code.
# Folder holding 'Compensation Worksheet.xlsx'. On Google Colab you need to
# mount Google Drive first and change this directory string.
#folder_location = '/content/0_Test/'
folder_location = '/content/drive/MyDrive/ITA HRP Data Conversion Materials/Compensation Extractor/0_Test/'
df = pandas.read_excel(folder_location + 'Compensation Worksheet.xlsx')  # ,dtype=str)

# Keep only the columns the extraction below reads.
df = df[[
    'Row ID',
    'Existing P Level',
    'Existing P Code',
    'Existing P Variation Code',
    'Considered "Adds to Rate"? (Y/N)',
    'Calc Method',
    'Calculation Sequence',
    'MOU',
    'Frequency',
    'Rate or Flat Amount',
    'Percent',
    'FMS Dept.',
    'Class Code',
    'Reference ID',
    'Comp Mapping Tab',
]]

# The main filter is to avoid all blanks and N/A in the Reference ID column.
df = df.dropna(subset=['Reference ID'])
# The mask must be assigned back to df; evaluating it without assignment
# leaves the N/A rows in place. astype(str) keeps the .str accessor usable
# when the column holds non-string values.
reference_is_na = df['Reference ID'].astype(str).str.contains('N/A', case=False)
df = df[~reference_is_na]
df = df.fillna("")  # make sure NaNs are simply empty strings
def basic_formatting(x):
    """Normalize a raw cell value into a ';'-separated string.

    The value is converted with ``str()``, surrounding whitespace is
    stripped, every ' IN ' is removed, newlines and commas become ';',
    runs of ';' are collapsed to a single ';', and the dash variant
    '‒' is replaced by a plain hyphen.

    Args:
        x: Any value; it is stringified before processing.

    Returns:
        The cleaned string.
    """
    str_x = str(x).strip()
    str_x = str_x.replace(' IN ', '')
    str_x = str_x.replace("\n", ";")
    str_x = str_x.replace(',', ';')
    # A single replace pass turns ';;;' into ';;'; repeat until no doubled
    # separator is left so every run collapses to exactly one ';'.
    while ';;' in str_x:
        str_x = str_x.replace(';;', ';')
    str_x = str_x.replace('‒', '-')
    return str_x
def specific_formatting(x):
    """Upper-case a cell value and rewrite its list/exclusion keywords.

    After stripping and upper-casing: 'AND' and '&' become ';', the words
    'MOUS' and 'MOU' are removed, and a leading 'ALL EXCEPT ' is rewritten
    to the marker 'NOT'.

    Args:
        x: Any value; it is stringified before processing.

    Returns:
        The rewritten, upper-cased string.
    """
    str_x = str(x).strip().upper()
    # NOTE(review): this replaces 'AND' anywhere, including inside longer
    # words (e.g. 'LAND' -> 'L;'); confirm the data never contains such
    # values before relying on it.
    str_x = str_x.replace('AND', ';')
    str_x = str_x.replace('&', ';')
    # 'MOUS' must be removed before 'MOU', otherwise a stray 'S' is left.
    str_x = str_x.replace('MOUS', '')
    str_x = str_x.replace('MOU', '')
    # Raw string so that \s is a regex class, not a string escape.
    str_x = re.sub(r'^ALL\s*EXCEPT\s', 'NOT', str_x)
    return str_x
def class_formatting(x):
    """Strip a cell value and remove every em dash (U+2014) from it.

    Args:
        x: Any value; it is stringified before processing.

    Returns:
        The stripped string with em dashes removed.
    """
    str_x = str(x).strip()
    # '\2014' is an octal escape ('\x81' followed by '4'), not the em dash;
    # the code point has to be written as '\u2014'.
    return str_x.replace('\u2014', '')
# Cleanup passes, applied in order: basic formatting on every list-like
# column, em-dash removal on 'Class Code', then keyword rewriting on the
# columns that can carry inclusion/exclusion wording.
# Series.map is used per column because DataFrame.applymap is deprecated in
# recent pandas releases; the element-wise result is the same.
_basic_columns = ['MOU', 'FMS Dept.', 'Class Code', 'Existing P Variation Code', 'Existing P Code', 'Existing P Level']
_specific_columns = ['MOU', 'FMS Dept.', 'Class Code', 'Existing P Variation Code']

for _column in _basic_columns:
    df[_column] = df[_column].map(basic_formatting)
df['Class Code'] = df['Class Code'].map(class_formatting)
for _column in _specific_columns:
    df[_column] = df[_column].map(specific_formatting)
def _header_only(*extra_columns):
    """Return a new table (list of rows) holding just its header row.

    Every table is keyed by 'Reference ID' and 'Row ID'; ``extra_columns``
    are the remaining header names. A fresh list is built on each call so
    the tables never share row objects.
    """
    return [['Reference ID', 'Row ID', *extra_columns]]


# One row per worksheet row with the scalar compensation attributes.
wd_m_compensation_information = _header_only(
    'Comp Mapping Tab',
    'Considered "Adds to Rate"? (Y/N)',
    'Calc Method',
    'Calculation Sequence',
    'Frequency',
    'Rate or Flat Amount',
    'Percent',
)

# Child tables: data rows are appended below each header.
wd_m_compensation_p_codes_and_levels = _header_only('P Bonus Code', 'P Level')
wd_m_compensation_p_varcodes = _header_only('P Varcode')
wd_m_compensation_mou_inclusions = _header_only('MOU')
wd_m_compensation_mou_exclusions = _header_only('MOU')
wd_m_compensation_fms_inclusions = _header_only('FMS')
wd_m_compensation_fms_exclusions = _header_only('FMS')
wd_m_compensation_class_inclusions = _header_only('CLASS')
wd_m_compensation_class_exclusions = _header_only('CLASS')
def _route_entries(raw_value, ref_id, row_id, inclusions, exclusions, digits_only):
    """Split a ';'-separated cell and append each entry to the right table.

    Empty entries are skipped. An entry containing the marker 'NOT' has the
    marker removed and goes to ``exclusions``; every other entry goes to
    ``inclusions``. With ``digits_only`` set, all non-digit characters are
    dropped from the entry before it is stored.

    Args:
        raw_value: The ';'-separated string to split.
        ref_id: Value stored in the 'Reference ID' position of each new row.
        row_id: Value stored in the 'Row ID' position of each new row.
        inclusions: List that receives rows for entries without 'NOT'.
        exclusions: List that receives rows for entries with 'NOT'.
        digits_only: Whether to strip every non-digit character.
    """
    for entry in raw_value.split(';'):
        entry = entry.strip()
        if not entry:
            continue
        if 'NOT' in entry:
            target = exclusions
            entry = entry.replace('NOT', '')
        else:
            target = inclusions
        if digits_only:
            entry = re.sub(r"[^\d]+", '', entry)
        target.append([ref_id, row_id, entry])


for index, row in df.iterrows():
    ref_id = row['Reference ID']
    row_id = row['Row ID']

    # Comp information table
    wd_m_compensation_information.append([
        ref_id,
        row_id,
        row['Comp Mapping Tab'],
        row['Considered "Adds to Rate"? (Y/N)'],
        row['Calc Method'],
        row['Calculation Sequence'],
        row['Frequency'],
        row['Rate or Flat Amount'],
        row['Percent'],
    ])

    # P codes x P levels: every non-empty code is paired with every
    # non-empty level. The column names use a capital 'P'; the lowercase
    # spelling does not exist in the frame and raises KeyError.
    l_p_codes = row['Existing P Code'].split(';')
    l_p_levels = row['Existing P Level'].split(';')
    for code in l_p_codes:
        for level in l_p_levels:
            if code and level:
                wd_m_compensation_p_codes_and_levels.append([ref_id, row_id, code, level])

    # Varcodes
    for varcode in row['Existing P Variation Code'].split(';'):
        varcode = varcode.strip()
        if varcode:
            wd_m_compensation_p_varcodes.append([ref_id, row_id, varcode])

    # MOUs (stored as digits only)
    _route_entries(row['MOU'], ref_id, row_id,
                   wd_m_compensation_mou_inclusions,
                   wd_m_compensation_mou_exclusions,
                   digits_only=True)

    # FMSs (stored as digits only)
    _route_entries(row['FMS Dept.'], ref_id, row_id,
                   wd_m_compensation_fms_inclusions,
                   wd_m_compensation_fms_exclusions,
                   digits_only=True)

    # Class Codes (kept as written, apart from the 'NOT' marker)
    _route_entries(row['Class Code'], ref_id, row_id,
                   wd_m_compensation_class_inclusions,
                   wd_m_compensation_class_exclusions,
                   digits_only=False)
import pandas as pd
# (column name, data type) pairs for each target table, listed in output order.
_bonus_mapping_columns = [
    ('P_CODE', 'VARCHAR2'),
    ('NOT_JOB_CLASS', 'VARCHAR2'),
    ('MOU', 'VARCHAR2'),
    ('NOT_MOU', 'VARCHAR2'),
    ('FMS', 'VARCHAR2'),
    ('NOT_FMS', 'VARCHAR2'),
    ('CLASS_CODE', 'VARCHAR2'),
    ('JOB_CLASS', 'VARCHAR2'),
    ('COMP_PLAN_REF_ID', 'VARCHAR2'),
    ('BONUS_PERCENT', 'VARCHAR2'),
    ('BONUS_AMOUNT', 'NUMBER'),
    ('CALC_METHOD', 'NUMBER'),
    ('FREQUENCY', 'VARCHAR2'),
    ('NOT_CLASS_CODE', 'VARCHAR2'),
    ('P_LEVEL', 'VARCHAR2'),
]
_legacy_plan_columns = [
    ('CALC_METHOD', 'VARCHAR2'),
    ('CALC_SEQUENCE', 'NUMBER'),
    ('COMP_PLAN_REF_ID', 'VARCHAR2'),
    ('ADD_RATE_OR_PAY', 'VARCHAR2'),
    ('FREQUENCY', 'VARCHAR2'),
]
# The last table lists FREQUENCY seven times.
_var_code_columns = [
    ('CALC_METHOD', 'VARCHAR2'),
    ('CALC_SEQUENCE', 'NUMBER'),
    ('COMP_PLAN_REF_ID', 'NUMBER'),
    ('ADD_RATE_OR_PAY', 'VARCHAR2'),
] + [('FREQUENCY', 'VARCHAR2')] * 7

_schema_columns = ['COLUMN_NAME', 'DATA_TYPE']

# Split each pair list into the two parallel lists the DataFrame is built
# from, then tag every row with the table it describes.
data1 = {'COLUMN_NAME': [name for name, _ in _bonus_mapping_columns],
         'DATA_TYPE': [dtype for _, dtype in _bonus_mapping_columns]}
table1 = pd.DataFrame(data1, columns=_schema_columns).assign(
    TABLE_NAME='WD_W_F41_BONUS_MAPPING')

data2 = {'COLUMN_NAME': [name for name, _ in _legacy_plan_columns],
         'DATA_TYPE': [dtype for _, dtype in _legacy_plan_columns]}
table2 = pd.DataFrame(data2, columns=_schema_columns).assign(
    TABLE_NAME='WD_W_F41_LEGACY_PLAN')

data3 = {'COLUMN_NAME': [name for name, _ in _var_code_columns],
         'DATA_TYPE': [dtype for _, dtype in _var_code_columns]}
table3 = pd.DataFrame(data3, columns=_schema_columns).assign(
    TABLE_NAME='WD_W_VAR_CODE_REF_ID')

# Stack the three tables; each keeps its own 0-based index labels.
df = pd.concat([table1, table2, table3])
Output:
print(df)
COLUMN_NAME DATA_TYPE TABLE_NAME
0 P_CODE VARCHAR2 WD_W_F41_BONUS_MAPPING
1 NOT_JOB_CLASS VARCHAR2 WD_W_F41_BONUS_MAPPING
2 MOU VARCHAR2 WD_W_F41_BONUS_MAPPING
3 NOT_MOU VARCHAR2 WD_W_F41_BONUS_MAPPING
4 FMS VARCHAR2 WD_W_F41_BONUS_MAPPING
5 NOT_FMS VARCHAR2 WD_W_F41_BONUS_MAPPING
6 CLASS_CODE VARCHAR2 WD_W_F41_BONUS_MAPPING
7 JOB_CLASS VARCHAR2 WD_W_F41_BONUS_MAPPING
8 COMP_PLAN_REF_ID VARCHAR2 WD_W_F41_BONUS_MAPPING
9 BONUS_PERCENT VARCHAR2 WD_W_F41_BONUS_MAPPING
10 BONUS_AMOUNT NUMBER WD_W_F41_BONUS_MAPPING
11 CALC_METHOD NUMBER WD_W_F41_BONUS_MAPPING
12 FREQUENCY VARCHAR2 WD_W_F41_BONUS_MAPPING
13 NOT_CLASS_CODE VARCHAR2 WD_W_F41_BONUS_MAPPING
14 P_LEVEL VARCHAR2 WD_W_F41_BONUS_MAPPING
0 CALC_METHOD VARCHAR2 WD_W_F41_LEGACY_PLAN
1 CALC_SEQUENCE NUMBER WD_W_F41_LEGACY_PLAN
2 COMP_PLAN_REF_ID VARCHAR2 WD_W_F41_LEGACY_PLAN
3 ADD_RATE_OR_PAY VARCHAR2 WD_W_F41_LEGACY_PLAN
4 FREQUENCY VARCHAR2 WD_W_F41_LEGACY_PLAN
0 CALC_METHOD VARCHAR2 WD_W_VAR_CODE_REF_ID
1 CALC_SEQUENCE NUMBER WD_W_VAR_CODE_REF_ID
2 COMP_PLAN_REF_ID NUMBER WD_W_VAR_CODE_REF_ID
3 ADD_RATE_OR_PAY VARCHAR2 WD_W_VAR_CODE_REF_ID
4 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
5 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
6 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
7 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
8 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
9 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
10 FREQUENCY VARCHAR2 WD_W_VAR_CODE_REF_ID
Because of the "TABLE_NAME" column, the dataframes aren't the same size. I don't know what your data structure should look like, and I'm not sure this is the solution to your question, but maybe my answer will at least help you:
import pandas as pd
# Column labels shared by the three schema tables.
_answer_columns = ['COLUMN_NAME', 'DATA_TYPE']

# Each table is built from (column name, data type) rows.
table1 = pd.DataFrame(
    [
        ('P_CODE', 'VARCHAR2'),
        ('NOT_JOB_CLASS', 'VARCHAR2'),
        ('MOU', 'VARCHAR2'),
        ('NOT_MOU', 'VARCHAR2'),
        ('FMS', 'VARCHAR2'),
        ('NOT_FMS', 'VARCHAR2'),
        ('CLASS_CODE', 'VARCHAR2'),
        ('JOB_CLASS', 'VARCHAR2'),
        ('COMP_PLAN_REF_ID', 'VARCHAR2'),
        ('BONUS_PERCENT', 'VARCHAR2'),
        ('BONUS_AMOUNT', 'NUMBER'),
        ('CALC_METHOD', 'NUMBER'),
        ('FREQUENCY', 'VARCHAR2'),
        ('NOT_CLASS_CODE', 'VARCHAR2'),
        ('P_LEVEL', 'VARCHAR2'),
    ],
    columns=_answer_columns,
)

table2 = pd.DataFrame(
    [
        ('CALC_METHOD', 'VARCHAR2'),
        ('CALC_SEQUENCE', 'NUMBER'),
        ('COMP_PLAN_REF_ID', 'VARCHAR2'),
        ('ADD_RATE_OR_PAY', 'VARCHAR2'),
        ('FREQUENCY', 'VARCHAR2'),
    ],
    columns=_answer_columns,
)

# FREQUENCY appears seven times in this table.
table3 = pd.DataFrame(
    [
        ('CALC_METHOD', 'VARCHAR2'),
        ('CALC_SEQUENCE', 'NUMBER'),
        ('COMP_PLAN_REF_ID', 'NUMBER'),
        ('ADD_RATE_OR_PAY', 'VARCHAR2'),
    ] + [('FREQUENCY', 'VARCHAR2')] * 7,
    columns=_answer_columns,
)

# Bare expression: shows the stacked frame when run in a notebook cell.
pd.concat([table1, table2, table3])
That's the result. Is this what you meant?
The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.