[英]How to join tables by multiple columns for each id
我不得不刪除之前的提問,因為當時問題描述得不夠清楚,而且例子有點復雜,所以這里改用一個簡單的例子。
我有2個數據幀:
# Client table: every client (client_code) belongs to a client group (id1)
# and has its own value1/value2 for some of the id2 keys.
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
> DF1
id1 client_code id2 value1 value2
1 1 x1 a 0.1 1.1
2 1 x1 b 0.2 1.2
3 1 x1 c 0.3 1.3
4 1 x2 d 0.4 1.4
5 1 x2 e 0.5 1.5
6 2 x3 y 0.6 1.6
# Group-level table: one row per (id1, id2) with the group's value1/value2.
DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27)
)
> DF2
id1 id2 value1 value2
1 1 a 10 20
2 1 b 11 21
3 1 c 12 22
4 1 d 13 23
5 1 e 14 24
6 1 f 15 25
7 2 x 16 26
8 2 y 17 27
每個客戶端屬於一組客戶端,由列(id1)標識
我要做的是把DF2中存在、但其id2在DF1中不存在的行補充到DF1中。 對DF1中的每個客戶端(client_code),都應在其所屬的客戶端組(id1)內執行此過程。
(我不知道自己是否足夠清楚)
所需的輸出:
# Expected result: each client keeps its own rows and gains the rows of its
# id1 group that it lacks, with value1/value2 of those rows taken from DF2.
output <- data.frame(
  id1 = rep(c(1, 2), times = c(12, 2)),
  client_code = rep(c("x1", "x2", "x3"), times = c(6, 6, 2)),
  id2 = c(
    "a", "b", "c", "d", "e", "f",
    "d", "e", "a", "b", "c", "f",
    "y", "x"
  ),
  value1 = c(0.1, 0.2, 0.3, 13, 14, 15, 0.4, 0.5, 10, 11, 12, 15, 0.6, 16),
  value2 = c(1.1, 1.2, 1.3, 23, 24, 25, 1.4, 1.5, 20, 21, 22, 25, 1.6, 26)
)
> output
id1 client_code id2 value1 value2
1 1 x1 a 0.1 1.1
2 1 x1 b 0.2 1.2
3 1 x1 c 0.3 1.3
4 1 x1 d 13.0 23.0
5 1 x1 e 14.0 24.0
6 1 x1 f 15.0 25.0
7 1 x2 d 0.4 1.4
8 1 x2 e 0.5 1.5
9 1 x2 a 10.0 20.0
10 1 x2 b 11.0 21.0
11 1 x2 c 12.0 22.0
12 1 x2 f 15.0 25.0
13 2 x3 y 0.6 1.6
14 2 x3 x 16.0 26.0
提前致謝。
首先,創建一個表格,列出每個客戶端應該存在的id2值:
library(tidyverse)
# Pair every distinct (client_code, id1) of df1 with all df2 rows of the same
# id1 group: the full set of id2 rows each client should end up with.
client_defaults <- left_join(
  distinct(df1, client_code, id1),
  df2,
  by = "id1"
)
client_defaults
#> client_code id1 id2 value1 value2
#> 1 x1 1 a 10 20
#> 2 x1 1 b 11 21
#> 3 x1 1 c 12 22
#> 4 x1 1 d 13 23
#> 5 x1 1 e 14 24
#> 6 x1 1 f 15 25
#> 7 x2 1 a 10 20
#> 8 x2 1 b 11 21
#> 9 x2 1 c 12 22
#> 10 x2 1 d 13 23
#> 11 x2 1 e 14 24
#> 12 x2 1 f 15 25
#> 13 x3 2 x 16 26
#> 14 x3 2 y 17 27
然后,去掉df1中已存在的行,並把其余的行添加到df1:
# Keep only the default rows whose (client_code, id2) pair is absent from df1.
client_missing <- anti_join(
  client_defaults,
  df1,
  by = c("client_code", "id2")
)
# Append them to df1 and sort by client so each client's rows sit together.
arrange(bind_rows(df1, client_missing), client_code)
#> client_code id1 id2 value1 value2
#> 1 x1 1 a 0.1 1.1
#> 2 x1 1 b 0.2 1.2
#> 3 x1 1 c 0.3 1.3
#> 4 x1 1 d 13.0 23.0
#> 5 x1 1 e 14.0 24.0
#> 6 x1 1 f 15.0 25.0
#> 7 x2 1 d 0.4 1.4
#> 8 x2 1 e 0.5 1.5
#> 9 x2 1 a 10.0 20.0
#> 10 x2 1 b 11.0 21.0
#> 11 x2 1 c 12.0 22.0
#> 12 x2 1 f 15.0 25.0
#> 13 x3 2 y 0.6 1.6
#> 14 x3 2 x 16.0 26.0
數據:
# Client table: one row per (client_code, id2) with the client's own values.
df1 <- data.frame(
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id1 = rep(c(1, 2), times = c(5, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6),
  stringsAsFactors = FALSE
)
# Group table: one row per (id1, id2) with the group's values.
df2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27),
  stringsAsFactors = FALSE
)
由reprex包創建於2019-07-01(v0.2.1)
這是一個可能的data.table解決方案,它更簡潔一些。 基本上它執行兩個步驟:
1. 在id1和client_code的每個組上連接DF1和DF2,來構造已完成的data.table;
2. 用DF1的值更新value1和value2列。
library(data.table)
# Convert both data frames to data.tables by reference.
setDT(DF1)
setDT(DF2)
# For each (id1, client_code) group of DF1, pull the DF2 rows sharing that
# group's id1, keeping columns id2 through value2.
DF <- DF1[,
  DF2[.BY, .SD, on = "id1", .SDcols = id2:value2],
  by = .(id1, client_code)
]
# Update join: where DF1 has a row for the same (id1, client_code, id2),
# overwrite value1/value2 with DF1's values (the i.-prefixed columns).
DF[
  DF1,
  c("value1", "value2") := .(i.value1, i.value2),
  on = c("id1", "client_code", "id2")
]
DF
#> id1 client_code id2 value1 value2
#> 1: 1 x1 a 0.1 1.1
#> 2: 1 x1 b 0.2 1.2
#> 3: 1 x1 c 0.3 1.3
#> 4: 1 x1 d 13.0 23.0
#> 5: 1 x1 e 14.0 24.0
#> 6: 1 x1 f 15.0 25.0
#> 7: 1 x2 a 10.0 20.0
#> 8: 1 x2 b 11.0 21.0
#> 9: 1 x2 c 12.0 22.0
#> 10: 1 x2 d 0.4 1.4
#> 11: 1 x2 e 0.5 1.5
#> 12: 1 x2 f 15.0 25.0
#> 13: 2 x3 x 16.0 26.0
#> 14: 2 x3 y 0.6 1.6
由reprex包創建於2019-07-01(v0.3.0)
這是使用sql查詢的解決方案。
library(sqldf)
# For every client in df1 (a), list all id2 rows of its id1 group from df2 (b).
# value1/value2 come from the client's own df1 row (d) when one exists and
# fall back to the group's df2 values otherwise. df2 is joined only once: the
# fallback values are read straight from b, since a second join of df2 on the
# same (id1, id2) keys yields the same rows. ORDER BY makes the row order
# explicit instead of relying on the engine's scan order.
sqldf('
select distinct
  a.id1
  , a.client_code
  , b.id2
  , coalesce(d.value1, b.value1) as value1
  , coalesce(d.value2, b.value2) as value2
from df1 a
left join df2 b
  on a.id1 = b.id1
left join df1 d
  on a.id1 = d.id1
  and b.id2 = d.id2
  and a.client_code = d.client_code
order by a.id1, a.client_code, b.id2
')
# id1 client_code id2 value1 value2
# 1 1 x1 a 0.1 1.1
# 2 1 x1 b 0.2 1.2
# 3 1 x1 c 0.3 1.3
# 4 1 x1 d 13.0 23.0
# 5 1 x1 e 14.0 24.0
# 6 1 x1 f 15.0 25.0
# 7 1 x2 a 10.0 20.0
# 8 1 x2 b 11.0 21.0
# 9 1 x2 c 12.0 22.0
# 10 1 x2 d 0.4 1.4
# 11 1 x2 e 0.5 1.5
# 12 1 x2 f 15.0 25.0
# 13 2 x3 x 16.0 26.0
# 14 2 x3 y 0.6 1.6
另一種選擇是檢查DF2中的每一行。
我們重新創建你的表,並將id2列轉換為字符類型:
library(data.table)
# Rebuild the example tables as plain data frames; id2 is converted to
# character afterwards so it is compared as text in the loop that follows.
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
DF1[["id2"]] <- as.character(DF1[["id2"]])
DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27)
)
DF2[["id2"]] <- as.character(DF2[["id2"]])
然后我們保存DF1的列順序(稍后將需要它)
# Remember DF1's column layout so rows taken from DF2 can be aligned to it.
column_order <- names(DF1)
現在我們將每個客戶代碼分開
# Distinct client codes, as character, in order of first appearance in DF1.
client_codes <- unique(as.character(DF1[["client_code"]]))
我們定義了將包含最終結果作為數據框的New表
# Result accumulator: starts as an empty data frame and receives each
# client's completed rows.
New_Table <- data.frame()
現在我們創建一個嵌套的for循環,以考慮不同的客戶端代碼,並檢查DF2中的每一行。
# Complete each client's rows: for every client code, append the DF2 rows of
# the client's id1 group whose (id1, id2) pair the client lacks in DF1.
# Reads DF1, DF2, client_codes and column_order; appends to New_Table.
per_client <- vector("list", length(client_codes))
for (k in seq_along(client_codes)) {
  code <- client_codes[[k]]
  # Rows of DF1 belonging to this client, and the DF2 rows of its id1 group.
  client_rows <- DF1[DF1$client_code == code, ]
  group_rows <- DF2[DF2$id1 %in% unique(client_rows$id1), ]
  # TRUE for each DF2 row with no (id1, id2) match among the client's rows.
  # seq_len() keeps this safe when the group has no rows in DF2 at all
  # (1:nrow() would iterate over c(1, 0) and fail on the empty row).
  is_missing <- vapply(
    seq_len(nrow(group_rows)),
    function(j) {
      hit <- client_rows$id1 == group_rows$id1[j] &
        client_rows$id2 == group_rows$id2[j]
      # An NA comparison counts as a match, so such a row is not added.
      !any(hit | is.na(hit))
    },
    logical(1)
  )
  missing_rows <- group_rows[is_missing, , drop = FALSE]
  missing_rows$client_code <- rep(code, nrow(missing_rows))
  # Align the columns with DF1 before stacking (base indexing, so the loop
  # does not need data.table's setcolorder()).
  per_client[[k]] <- rbind(
    client_rows,
    missing_rows[, column_order, drop = FALSE]
  )
}
# Bind once at the end instead of growing New_Table inside the loop.
New_Table <- rbind(New_Table, do.call(rbind, per_client))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.