I had to delete my previous question because it was poorly worded and its example was rather complex, so here it is again with a simpler one.
I have 2 dataframes :
# Client-level table: one row per (client_code, id2) with the client's own
# value1/value2; id1 identifies the group of clients the client belongs to.
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
> DF1
id1 client_code id2 value1 value2
1 1 x1 a 0.1 1.1
2 1 x1 b 0.2 1.2
3 1 x1 c 0.3 1.3
4 1 x2 d 0.4 1.4
5 1 x2 e 0.5 1.5
6 2 x3 y 0.6 1.6
# Group-level table: the full set of id2 keys (with default value1/value2)
# that exists for each id1 group.
DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27)
)
> DF2
id1 id2 value1 value2
1 1 a 10 20
2 1 b 11 21
3 1 c 12 22
4 1 d 13 23
5 1 e 14 24
6 1 f 15 25
7 2 x 16 26
8 2 y 17 27
Each client belongs to a group of clients which is identified by column (id1)
What I'm trying to do is add the rows of DF2 whose id2 is not present in DF1. This should be done for each client (client_code), using only the DF2 rows that belong to the same group of clients (id1) as that client in DF1.
(I don't know if I made myself clear enough)
The desired output :
# Desired result: each client's own rows first, followed by the rows of its
# id1 group that the client was missing (values taken from DF2).
output <- data.frame(
  id1 = rep(c(1, 2), times = c(12, 2)),
  client_code = rep(c("x1", "x2", "x3"), times = c(6, 6, 2)),
  id2 = c(
    "a", "b", "c", "d", "e", "f",
    "d", "e", "a", "b", "c", "f",
    "y", "x"
  ),
  value1 = c(
    0.1, 0.2, 0.3, 13, 14, 15,
    0.4, 0.5, 10, 11, 12, 15,
    0.6, 16
  ),
  value2 = c(
    1.1, 1.2, 1.3, 23, 24, 25,
    1.4, 1.5, 20, 21, 22, 25,
    1.6, 26
  )
)
> output
id1 client_code id2 value1 value2
1 1 x1 a 0.1 1.1
2 1 x1 b 0.2 1.2
3 1 x1 c 0.3 1.3
4 1 x1 d 13.0 23.0
5 1 x1 e 14.0 24.0
6 1 x1 f 15.0 25.0
7 1 x2 d 0.4 1.4
8 1 x2 e 0.5 1.5
9 1 x2 a 10.0 20.0
10 1 x2 b 11.0 21.0
11 1 x2 c 12.0 22.0
12 1 x2 f 15.0 25.0
13 2 x3 y 0.6 1.6
14 2 x3 x 16.0 26.0
Thanks in advance.
First, create a table that shows for each client, what values of id2
should be present:
library(tidyverse)
# Expected rows per client: take each distinct (client_code, id1) pair, then
# attach every df2 row (id2 and its default values) of that id1 group.
client_groups <- distinct(df1, client_code, id1)
client_defaults <- left_join(client_groups, df2, by = "id1")
client_defaults
#> client_code id1 id2 value1 value2
#> 1 x1 1 a 10 20
#> 2 x1 1 b 11 21
#> 3 x1 1 c 12 22
#> 4 x1 1 d 13 23
#> 5 x1 1 e 14 24
#> 6 x1 1 f 15 25
#> 7 x2 1 a 10 20
#> 8 x2 1 b 11 21
#> 9 x2 1 c 12 22
#> 10 x2 1 d 13 23
#> 11 x2 1 e 14 24
#> 12 x2 1 f 15 25
#> 13 x3 2 x 16 26
#> 14 x3 2 y 17 27
Then, remove the rows that are already present in df1, and add the rest:
# Default rows whose (client_code, id2) pair does not occur in df1.
client_missing <- anti_join(client_defaults, df1, by = c("client_code", "id2"))
# Stack them under the clients' own rows and sort the result by client.
arrange(bind_rows(df1, client_missing), client_code)
#> client_code id1 id2 value1 value2
#> 1 x1 1 a 0.1 1.1
#> 2 x1 1 b 0.2 1.2
#> 3 x1 1 c 0.3 1.3
#> 4 x1 1 d 13.0 23.0
#> 5 x1 1 e 14.0 24.0
#> 6 x1 1 f 15.0 25.0
#> 7 x2 1 d 0.4 1.4
#> 8 x2 1 e 0.5 1.5
#> 9 x2 1 a 10.0 20.0
#> 10 x2 1 b 11.0 21.0
#> 11 x2 1 c 12.0 22.0
#> 12 x2 1 f 15.0 25.0
#> 13 x3 2 y 0.6 1.6
#> 14 x3 2 x 16.0 26.0
Data:
# Client-level input: the values each client already has.
df1 <- data.frame(
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id1 = rep(c(1, 2), times = c(5, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6),
  stringsAsFactors = FALSE
)
# Group-level input: every id2 (with default values) defined per id1 group.
df2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27),
  stringsAsFactors = FALSE
)
Created on 2019-07-01 by the reprex package (v0.2.1)
Here is a possible data.table solution, which is a bit more concise. Essentially it performs two steps:
1. Join DF1 and DF2 on each group in id1 and client_code
2. Replace the value1 and value2 columns by their appropriate DF1 values

library(data.table)
# Turn both inputs into data.tables so the join syntax below can be used.
setDT(DF1); setDT(DF2)
# Step 1: for every (id1, client_code) group of DF1, look up the DF2 rows with
# the same id1 (.BY carries the current group's values) and keep the columns
# id2 through value2. DF then holds one DF2-valued row per client and id2.
DF <- DF1[, DF2[.BY, .SD, on = "id1", .SDcols = id2:value2], by = .(id1, client_code)]
# Step 2: update join. Wherever DF1 has a row with the same id1, client_code
# and id2, overwrite value1/value2 in DF with DF1's values (the i. prefix
# refers to the columns of DF1, the table given in the i position).
DF[DF1, `:=`(value1 = i.value1, value2 = i.value2), on = c("id1", "client_code", "id2")]
DF
#> id1 client_code id2 value1 value2
#> 1: 1 x1 a 0.1 1.1
#> 2: 1 x1 b 0.2 1.2
#> 3: 1 x1 c 0.3 1.3
#> 4: 1 x1 d 13.0 23.0
#> 5: 1 x1 e 14.0 24.0
#> 6: 1 x1 f 15.0 25.0
#> 7: 1 x2 a 10.0 20.0
#> 8: 1 x2 b 11.0 21.0
#> 9: 1 x2 c 12.0 22.0
#> 10: 1 x2 d 0.4 1.4
#> 11: 1 x2 e 0.5 1.5
#> 12: 1 x2 f 15.0 25.0
#> 13: 2 x3 x 16.0 26.0
#> 14: 2 x3 y 0.6 1.6
Created on 2019-07-01 by the reprex package (v0.3.0)
Here's a solution using a sql query.
library(sqldf)
# Build the completed table in a single SQL statement. Table aliases:
#   a (df1): supplies every (id1, client_code) pair;
#   b (df2): expands each pair to all id2 values of its id1 group;
#   d (df1): the client's own row for that id2, when one exists;
#   e (df2): the df2 row for the same id1/id2, used as the fallback.
# coalesce() prefers the client's own values (d) over the df2 values (e).
# `select distinct` removes the repeats that arise because a has several
# rows per client.
# NOTE(review): e appears to match the same df2 row as b whenever (id1, id2)
# is unique in df2 -- confirm before simplifying the query.
sqldf('
select distinct
a.id1
, a.client_code
, b.id2
, coalesce(d.value1, e.value1) as value1
, coalesce(d.value2, e.value2) as value2
from df1 a
left join df2 b
on a.id1 = b.id1
left join df1 d
on a.id1 = d.id1
and b.id2 = d.id2
and a.client_code = d.client_code
left join df2 e
on a.id1 = e.id1
and b.id2 = e.id2
')
# id1 client_code id2 value1 value2
# 1 1 x1 a 0.1 1.1
# 2 1 x1 b 0.2 1.2
# 3 1 x1 c 0.3 1.3
# 4 1 x1 d 13.0 23.0
# 5 1 x1 e 14.0 24.0
# 6 1 x1 f 15.0 25.0
# 7 1 x2 a 10.0 20.0
# 8 1 x2 b 11.0 21.0
# 9 1 x2 c 12.0 22.0
# 10 1 x2 d 0.4 1.4
# 11 1 x2 e 0.5 1.5
# 12 1 x2 f 15.0 25.0
# 13 2 x3 x 16.0 26.0
# 14 2 x3 y 0.6 1.6
Another option is to check each row in DF2.
We recreate your tables and make some of the columns as.character:
library(data.table)
# Recreate the question's inputs, then force id2 to character in both tables
# so the id2 comparison between DF1 and DF2 is a plain string comparison.
DF1 <- data.frame(
  id1 = rep(c(1, 2), times = c(5, 1)),
  client_code = rep(c("x1", "x2", "x3"), times = c(3, 2, 1)),
  id2 = c("a", "b", "c", "d", "e", "y"),
  value1 = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
  value2 = c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6)
)
DF1[["id2"]] <- as.character(DF1[["id2"]])

DF2 <- data.frame(
  id1 = rep(c(1, 2), times = c(6, 2)),
  id2 = c("a", "b", "c", "d", "e", "f", "x", "y"),
  value1 = as.numeric(10:17),
  value2 = as.numeric(20:27)
)
DF2[["id2"]] <- as.character(DF2[["id2"]])
Then we save the column order of DF1 (will need this later)
# Remember DF1's column layout so rows taken from DF2 can be aligned to it.
column_order <- names(DF1)
And now we collect the distinct client codes
# Distinct client codes as plain strings, in order of first appearance in DF1.
client_codes <- unique(as.character(DF1[["client_code"]]))
And we define the New table that will contain the final results as a data frame
# Start from an empty data frame; the per-client results are appended to it.
New_Table <- data.frame()
And now we create a nested for loop to take into account the different client codes and also check each row in DF2.
# Build New_Table: for every client in DF1, keep the client's own rows and
# append the DF2 rows of the same id1 group(s) whose (id1, id2) pair the
# client does not have yet.
#
# Reads:  DF1, DF2, client_codes, column_order and New_Table (defined above).
# Writes: New_Table, extended by one completed block of rows per client code.
#
# Compared with a row-by-row loop, this version
#   * does not fail when a client's group has no rows in DF2 (a `1:nrow()`
#     loop runs over c(1, 0) in that case and then errors on the zero-row
#     assignment of client_code),
#   * finds the missing rows with one vectorised key comparison, and
#   * binds the per-client pieces once instead of growing data frames
#     inside nested loops.
New_Table <- do.call(
  rbind,
  c(
    list(New_Table),
    lapply(client_codes, function(code) {
      # Rows the client already has, and every row defined for its group(s).
      own_rows <- DF1[DF1$client_code == code, , drop = FALSE]
      group_rows <- DF2[DF2$id1 %in% unique(own_rows$id1), , drop = FALSE]

      # Composite (id1, id2) keys; "\r" is assumed not to occur inside an id.
      own_keys <- paste(own_rows$id1, own_rows$id2, sep = "\r")
      group_keys <- paste(group_rows$id1, group_rows$id2, sep = "\r")
      missing_rows <- group_rows[!(group_keys %in% own_keys), , drop = FALSE]

      # rep() keeps the assignment valid when there is nothing to add.
      missing_rows$client_code <- rep(code, nrow(missing_rows))

      # Align the borrowed rows with DF1's column layout before stacking.
      rbind(own_rows, missing_rows[, column_order, drop = FALSE])
    })
  )
)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.