[英]Pandas - Merge two DataFrame with partial match
對於下面所示的兩個 DataFrame,我想先依據 ['A','B','C'] 和 ['X','Y','Z'] 進行合並,
然后每次減少一列、逐步尋找匹配,即先用 ['A','B'] 和 ['X','Y'],
最后用 ['A'] 和 ['X'],
並且不重復結果中的行。在下面的示例中,a,y,y,v3 被忽略,因為 a,d,d 已經匹配。
到目前為止,我的代碼匹配所有 3 列:
# Sample data: df1 holds the lookup keys (A, B, C); df2 holds candidate rows
# keyed by (X, Y, Z) with a payload column V.
df1 = pd.DataFrame({"A": list("abc"), "B": list("def"), "C": list("def")})
df2 = pd.DataFrame(
    {
        "X": list("abac"),
        "Y": list("deyz"),
        "Z": list("dxyz"),
        "V": [f"v{n}" for n in range(1, 5)],
    }
)

# Left-join on all three key columns, then keep a single row per (A, B, C).
# Only exact three-column matches receive values from df2; every other row
# of df1 is kept with NaN in the df2 columns.
merged = df1.merge(
    df2,
    how="left",
    left_on=["A", "B", "C"],
    right_on=["X", "Y", "Z"],
).drop_duplicates(subset=["A", "B", "C"])
merged.head()
我怎樣才能實現我的目標?
一個想法是在循環中進行多次 merge,並在每次合並前對第二個 DataFrame 使用 DataFrame.drop_duplicates,
以避免在最終 DataFrame 中出現重復行:
from functools import reduce

# Progressive partial-match merge of df1 against df2.
#
# L[0] lists the key columns of df1 and L[1] the corresponding key columns
# of df2. The loop merges on all keys first, then on one key fewer each time
# (['A','B','C'] -> ['A','B'] -> ['A']), collecting one left-merge result per
# key prefix. The results are then folded together so that values from a
# more specific match take precedence over less specific ones.
dfs = []
L = [['A', 'B', 'C'], ['X', 'Y', 'Z']]
for i in range(len(L[0]), 0, -1):
    # De-duplicate df2 on the current key prefix so that every row of df1
    # matches at most one row of df2; each merge result therefore has
    # exactly one row per row of df1, in the same order.
    df22 = df2.drop_duplicates(L[1][:i])
    df = pd.merge(df1, df22, left_on=L[0][:i], right_on=L[1][:i], how='left')
    dfs.append(df)

# dfs is ordered from most to least specific match. fillna() with a DataFrame
# argument fills only cells that are still NaN, aligned on index and column.
# NOTE(review): the fill is cell-wise, so a NaN payload value inside an
# otherwise matched row would also be filled from a less specific match.
df = reduce(lambda best, fallback: best.fillna(fallback), dfs)
print(df)
A B C X Y Z V
0 a d d a d d v1
1 b e e b e x v2
2 c f f c z z v4
其工作方式等同於下面展開的寫法:
# Unrolled form of the progressive merge: one left merge per key prefix,
# each against df2 de-duplicated on that same prefix so a row of df1 can
# match at most one row of df2.
merged1 = df1.merge(
    df2.drop_duplicates(subset=['X', 'Y', 'Z']),
    how='left',
    left_on=['A', 'B', 'C'],
    right_on=['X', 'Y', 'Z'],
)
merged2 = df1.merge(
    df2.drop_duplicates(subset=['X', 'Y']),
    how='left',
    left_on=['A', 'B'],
    right_on=['X', 'Y'],
)
merged3 = df1.merge(
    df2.drop_duplicates(subset='X'),
    how='left',
    left_on=['A'],
    right_on=['X'],
)

# Prefer the three-column match, then fall back to the two-column match and
# finally to the one-column match for cells that are still NaN.
df = merged1.fillna(merged2)
df = df.fillna(merged3)
print(df)
A B C X Y Z V
0 a d d a d d v1
1 b e e b e x v2
2 c f f c z z v4
那這個呢:
# Progressive partial-match merge of df1 against df2, done in place on `df`.
#
# matches[0] lists the key columns of df1, matches[1] the corresponding key
# columns of df2. Each pass merges on one key column fewer than the previous
# pass and copies df2's values only into rows that have not been matched yet.
matches = [['A', 'B', 'C'], ['X', 'Y', 'Z']]
df = df1.copy()
df2_cols = df2.columns.tolist()

# Make sure columns from df2 exist in df.
# The placeholders are created once, up front (after the first pass the
# columns always exist), and they inherit df2's dtypes instead of float64 so
# that later writing df2's values into them needs no dtype upcast.
placeholders = df2.iloc[:0].reindex(index=df.index)
for col in df2_cols:
    if col not in df.columns:
        df[col] = placeholders[col]

for k in range(len(matches[0])):
    # Get your left/right keys right at each iteration:
    # k == 0 uses every key, then one trailing key is dropped per pass.
    left, right = matches
    left = left if k == 0 else left[:-k]
    right = right if k == 0 else right[:-k]

    # Merge dataframes. df2 is de-duplicated on the current keys so that a
    # row of df can match at most one row of df2 and is never multiplied.
    # Every df2 column already exists in df, so the merge suffixes them:
    # "<col>_x" is the value already in df, "<col>_y" the candidate from df2.
    df = df.merge(
        df2.drop_duplicates(right),
        left_on=left,
        right_on=right,
        how='left',
    )

    # Find which rows of df's "left" columns (previously initialised) are empty
    ix_left_part = np.all([df[x + "_x"].isnull() for x in right], axis=0)
    # Find which rows of df's "right" columns are not empty
    ix_right_part = np.all([df[x + "_y"].notnull() for x in right], axis=0)
    # Combine both to get indexes: still unmatched AND matched in this pass
    ix = df[ix_left_part & ix_right_part].index

    # Complete values on "left" with those from "right"
    for x in df2_cols:
        df.loc[ix, x + "_x"] = df.loc[ix, x + '_y']

    # Drop values from "right"
    df.drop([x + "_y" for x in df2_cols], axis=1, inplace=True)
    # Rename "left" columns to stick with original names from df2
    df.rename({x + "_x": x for x in df2_cols}, axis=1, inplace=True)
    # Drop eventual duplicates
    df.drop_duplicates(keep="first", inplace=True)
print(df)
編輯
我修正了循環;這個版本應該更節省內存:
import pandas as pd
import numpy as np

# Progressive partial-match merge of df1 against df2.
#
# df1 is matched against df2 on all key columns first, then on one key column
# fewer per pass. Only rows of df1 that are still unmatched take part in a
# pass, which keeps the intermediate merges small.
df1 = pd.DataFrame({"A": ['a', 'b', 'c'], "B": ['d', 'e', 'f'], "C": ['d', 'e', 'f']})
df2 = pd.DataFrame({
    "X": ['a', 'b', 'a', 'c'],
    "Y": ['d', 'e', 'y', 'z'],
    "Z": ['d', 'x', 'y', 'z'],
    "V": ['v1', 'v2', 'v3', 'v4'],
})
# matches[0]: key columns of df1; matches[1]: corresponding key columns of df2.
matches = [['A', 'B', 'C'], ['X', 'Y', 'Z']]

df = df1.copy()
df2_cols = df2.columns.tolist()

# Make sure columns of df2 exist in df.
# The placeholders inherit df2's dtypes instead of float64 so that later
# writing df2's values into them needs no dtype upcast.
placeholders = df2.iloc[:0].reindex(index=df.index)
for col in df2_cols:
    df[col] = placeholders[col]

# Column names produced by merge() for the overlapping df2 columns:
# "<col>_x" is the value already in df, "<col>_y" the candidate from df2.
cols_left = [x + '_x' for x in df2_cols]
cols_right = [x + '_y' for x in df2_cols]
rename = dict(zip(cols_left, df2_cols))

for k in range(len(matches[0])):
    # Get your left/right keys right at each iteration:
    # k == 0 uses every key, then one trailing key is dropped per pass.
    left, right = matches
    left = left if k == 0 else left[:-k]
    right = right if k == 0 else right[:-k]

    # Recreate dataframe of (potentially) usable data in df2.
    # A row counts as unmatched while its V is null, so this relies on df2.V
    # holding no missing values.
    ix = df[df.V.isnull()].index
    temp = (
        df.loc[ix, left]
        .rename(dict(zip(left, right)), axis=1)
        .drop_duplicates()
    )
    # De-duplicate df2 on the current keys so temp holds one candidate per
    # key combination; otherwise the merge below would multiply rows of df.
    temp = temp.merge(df2.drop_duplicates(right), on=right, how="inner")

    # Merge dataframes
    df = df.merge(temp, left_on=left, right_on=right, how='left')

    # Combine both to get indexes: still unmatched AND matched in this pass
    ix = df[(df['V_x'].isnull()) & (df['V_y'].notnull())].index

    # Complete values on "left" with those from "right"
    if len(ix):
        df.loc[ix, cols_left] = df.loc[ix, cols_right].to_numpy()

    # Drop values from "right"
    df.drop(cols_right, axis=1, inplace=True)
    # Rename "left" columns to stick with original names from df2
    df.rename(rename, axis=1, inplace=True)
print(df)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.