简体   繁体   中英

How to join tables by multiple columns for each id

I was forced to delete my previous topic because it wasn't well asked and the example was a little bit complex, so here it is with a simple one.

I have 2 dataframes :

# Clients (client_code) with their group id (id1) and the id2 rows they own.
DF1 <- data.frame(
  id1         = c(rep(1, 5), 2),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2         = c("a", "b", "c", "d", "e", "y"),
  value1      = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2      = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)

> DF1
  id1 client_code id2 value1 value2
1   1          x1   a    0.1    1.1
2   1          x1   b    0.2    1.2
3   1          x1   c    0.3    1.3
4   1          x2   d    0.4    1.4
5   1          x2   e    0.5    1.5
6   2          x3   y    0.6    1.6

# Reference rows: every id2 (with its values) that exists for each group id1.
DF2 <- data.frame(
  id1    = rep(c(1, 2), times = c(6, 2)),
  id2    = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27)
)

> DF2
  id1 id2 value1 value2
1   1   a     10     20
2   1   b     11     21
3   1   c     12     22
4   1   d     13     23
5   1   e     14     24
6   1   f     15     25
7   2   x     16     26
8   2   y     17     27

Each client belongs to a group of clients which is identified by column (id1)

What I'm trying to do is add to DF1 the rows from DF2 whose id2 is not already present in DF1. This should be done separately for each client (client_code), using only the DF2 rows that belong to the same group of clients (id1) as that client.

(I don't know if I made myself clear enough)

The desired output :

# Expected result: each client keeps its own rows and gains the DF2 rows of
# its group whose id2 it did not have.
output <- data.frame(
  id1         = rep(c(1, 2), times = c(12, 2)),
  client_code = rep(c("x1", "x2", "x3"), times = c(6, 6, 2)),
  id2         = c(
    "a", "b", "c", "d", "e", "f",
    "d", "e", "a", "b", "c", "f",
    "y", "x"
  ),
  value1      = c(
    0.1, 0.2, 0.3, 13, 14, 15,
    0.4, 0.5, 10, 11, 12, 15,
    0.6, 16
  ),
  value2      = c(
    1.1, 1.2, 1.3, 23, 24, 25,
    1.4, 1.5, 20, 21, 22, 25,
    1.6, 26
  )
)

> output
   id1 client_code id2 value1 value2
1    1          x1   a    0.1    1.1
2    1          x1   b    0.2    1.2
3    1          x1   c    0.3    1.3
4    1          x1   d   13.0   23.0
5    1          x1   e   14.0   24.0
6    1          x1   f   15.0   25.0
7    1          x2   d    0.4    1.4
8    1          x2   e    0.5    1.5
9    1          x2   a   10.0   20.0
10   1          x2   b   11.0   21.0
11   1          x2   c   12.0   22.0
12   1          x2   f   15.0   25.0
13   2          x3   y    0.6    1.6
14   2          x3   x   16.0   26.0

Thanks in advance.

First, create a table that shows for each client, what values of id2 should be present:

library(tidyverse)

# Pair every distinct (client_code, id1) with all df2 rows of that id1: the
# rows each client would have if it took every value from df2.
client_defaults <- left_join(
  distinct(df1, client_code, id1),
  df2,
  by = "id1"
)

client_defaults
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a     10     20
#> 2           x1   1   b     11     21
#> 3           x1   1   c     12     22
#> 4           x1   1   d     13     23
#> 5           x1   1   e     14     24
#> 6           x1   1   f     15     25
#> 7           x2   1   a     10     20
#> 8           x2   1   b     11     21
#> 9           x2   1   c     12     22
#> 10          x2   1   d     13     23
#> 11          x2   1   e     14     24
#> 12          x2   1   f     15     25
#> 13          x3   2   x     16     26
#> 14          x3   2   y     17     27

Then, drop the default rows whose (client_code, id2) pair is already present in df1, and append the remaining rows to df1:

# Default rows whose (client_code, id2) pair does not occur in df1.
client_missing <- anti_join(
  client_defaults,
  df1,
  by = c("client_code", "id2")
)

# Stack the original and the missing rows, sorted by client.
arrange(bind_rows(df1, client_missing), client_code)
#>    client_code id1 id2 value1 value2
#> 1           x1   1   a    0.1    1.1
#> 2           x1   1   b    0.2    1.2
#> 3           x1   1   c    0.3    1.3
#> 4           x1   1   d   13.0   23.0
#> 5           x1   1   e   14.0   24.0
#> 6           x1   1   f   15.0   25.0
#> 7           x2   1   d    0.4    1.4
#> 8           x2   1   e    0.5    1.5
#> 9           x2   1   a   10.0   20.0
#> 10          x2   1   b   11.0   21.0
#> 11          x2   1   c   12.0   22.0
#> 12          x2   1   f   15.0   25.0
#> 13          x3   2   y    0.6    1.6
#> 14          x3   2   x   16.0   26.0

Data:

# Client rows: client code, group id (id1), row id (id2) and two values.
df1 <- data.frame(
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id1 = c(rep(1, 5), 2),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6),
  stringsAsFactors = FALSE
)

# Reference rows per group id (id1).
df2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27),
  stringsAsFactors = FALSE
)

Created on 2019-07-01 by the reprex package (v0.2.1)

Here is a possible data.table solution, which is a bit more concise. Essentially it performs two steps:

  1. Construct a completed data.table by joining DF1 and DF2 on each group in id1 and client_code
  2. Update the value1 and value2 columns by their appropriate DF1 values
library(data.table)

# Convert both data frames to data.tables by reference.
setDT(DF1); setDT(DF2)

# Step 1: for each (id1, client_code) group of DF1, join DF2 on the group's
# id1 (.BY holds the current group's values) and keep columns id2..value2, so
# every client receives all DF2 rows of its group.
DF <- DF1[, DF2[.BY, .SD, on = "id1", .SDcols = id2:value2], by = .(id1, client_code)]
# Step 2: update join. Where DF1 has a matching (id1, client_code, id2) row,
# overwrite value1/value2 in DF by reference with DF1's values (i.-prefixed).
DF[DF1, `:=`(value1 = i.value1, value2 = i.value2), on = c("id1", "client_code", "id2")]

DF
#>     id1 client_code id2 value1 value2
#>  1:   1          x1   a    0.1    1.1
#>  2:   1          x1   b    0.2    1.2
#>  3:   1          x1   c    0.3    1.3
#>  4:   1          x1   d   13.0   23.0
#>  5:   1          x1   e   14.0   24.0
#>  6:   1          x1   f   15.0   25.0
#>  7:   1          x2   a   10.0   20.0
#>  8:   1          x2   b   11.0   21.0
#>  9:   1          x2   c   12.0   22.0
#> 10:   1          x2   d    0.4    1.4
#> 11:   1          x2   e    0.5    1.5
#> 12:   1          x2   f   15.0   25.0
#> 13:   2          x3   x   16.0   26.0
#> 14:   2          x3   y    0.6    1.6

Created on 2019-07-01 by the reprex package (v0.3.0)

Here's a solution using a sql query.

library(sqldf)

# Build the completed table in a single query. Table aliases:
#   a, b : each df1 row paired with all df2 rows of the same id1, i.e. the
#          (id1, client_code, id2) combinations that should exist;
#   d    : the client's own df1 row for that id2, when it has one;
#   e    : the df2 row for that (id1, id2), used as the fallback.
# coalesce() keeps the client's own value when present, otherwise the df2 one;
# `distinct` collapses the duplicates caused by a client having several df1 rows.
# NOTE(review): `e` appears to select the same df2 row that `b` already
# provides, so this join may be redundant -- confirm before simplifying.
sqldf('
select  distinct
        a.id1
        , a.client_code
        , b.id2
        , coalesce(d.value1, e.value1) as value1
        , coalesce(d.value2, e.value2) as value2
from    df1 a
        left join df2 b
          on  a.id1 = b.id1
        left join df1 d
          on  a.id1 = d.id1
              and b.id2 = d.id2
              and a.client_code = d.client_code
        left join df2 e
          on  a.id1 = e.id1
              and b.id2 = e.id2
')

#    id1 client_code id2 value1 value2
# 1    1          x1   a    0.1    1.1
# 2    1          x1   b    0.2    1.2
# 3    1          x1   c    0.3    1.3
# 4    1          x1   d   13.0   23.0
# 5    1          x1   e   14.0   24.0
# 6    1          x1   f   15.0   25.0
# 7    1          x2   a   10.0   20.0
# 8    1          x2   b   11.0   21.0
# 9    1          x2   c   12.0   22.0
# 10   1          x2   d    0.4    1.4
# 11   1          x2   e    0.5    1.5
# 12   1          x2   f   15.0   25.0
# 13   2          x3   x   16.0   26.0
# 14   2          x3   y    0.6    1.6

Another option is to check each row in DF2.

We recreate your tables and make some of the columns as.character:

library(data.table)

# Rebuild the question's client table.
DF1 <- data.frame(
  id1         = c(rep(1, 5), 2),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2         = c("a", "b", "c", "d", "e", "y"),
  value1      = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2      = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
# Make sure id2 is character (data.frame() creates factors before R 4.0).
DF1[["id2"]] <- as.character(DF1[["id2"]])

# Rebuild the question's reference table.
DF2 <- data.frame(
  id1    = rep(c(1, 2), times = c(6, 2)),
  id2    = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = c(10, 11, 12, 13, 14, 15, 16, 17),
  value2 = c(20, 21, 22, 23, 24, 25, 26, 27)
)
DF2[["id2"]] <- as.character(DF2[["id2"]])

Then we save the column order of DF1 (will need this later)

column_order <- colnames(DF1)

Next, we extract the distinct client codes

client_codes <- as.character(unique(DF1$client_code))

And we define the New table that will contain the final results as a data frame

New_Table <- data.frame()

And now we create a nested for loop to take into account the different client codes and also check each row in DF2.

# For every client, append the DF2 rows of the client's group whose id2 the
# client does not have yet, then stack the per-client result onto New_Table.
for (i in client_codes) {
  # Rows of DF1 that belong to the current client code.
  New_DF1 <- DF1[DF1$client_code == i, ]

  # Candidate rows: everything DF2 holds for the client's group id(s).
  temp_id <- unique(New_DF1$id1)
  New_DF2 <- DF2[DF2$id1 %in% temp_id, ]

  # Missing rows are collected in a preallocated list and bound once after
  # the inner loop, instead of growing a data frame on every iteration.
  missing_rows <- vector("list", nrow(New_DF2))

  # Check each candidate row against the client's existing rows.
  # seq_len() gives an empty sequence when New_DF2 has no rows, whereas
  # 1:nrow(New_DF2) would iterate over c(1, 0) and fail on the 0-row subset.
  for (j in seq_len(nrow(New_DF2))) {
    temp_row <- New_DF2[j, ]
    already_present <- New_DF1$id1 == temp_row$id1 &
      New_DF1$id2 == temp_row$id2

    if (nrow(New_DF1[already_present, ]) == 0) {
      # The client lacks this id2: tag the row with the client code and put
      # its columns in DF1's order so it can be row-bound to New_DF1.
      temp_row$client_code <- i
      setcolorder(temp_row, column_order)
      missing_rows[[j]] <- temp_row
    }
  }

  # Entries left NULL (rows the client already had) are ignored by rbind;
  # temp_df is NULL when nothing was missing, which rbind also ignores.
  temp_df <- do.call(rbind, missing_rows)
  New_DF1 <- rbind(New_DF1, temp_df)

  New_Table <- rbind(New_Table, New_DF1)
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM