简体   繁体   中英

How to merge different dataframes of the same observation with different names in R?

I have three data frames. The first data frame, df_1, is the baseline; the other data frames contain information that I want to add to the matching observations of df_1. The problem is that the observations don't have the same names. I have a fourth data frame that lists the corresponding name used in each data frame. I want to use it to recognize the matching observations in the other data frames and get a single data frame with all the observations.

set.seed(123)
# Baseline data frame: six players, one season, three measurements.
# The random draws are made in the order a, b, c so the seed yields the
# same values as before.
a <- rnorm(n = 6, mean = 3, sd = 2)
b <- rnorm(n = 6, mean = 1, sd = 3)
c <- rpois(n = 6, lambda = 3)
year <- rep(2014, times = 6)
player <- c(
  "Aaron Badaley",
  "Andrew Loupe",
  "Ben Crane",
  "Ben Curtis",
  "Ben Martin",
  "Brendon de Jonge"
)

df_1 <- data.frame(player, year, a, b, c)

# Second data frame: three players with measurements d, e, f.
# Note that "Andrew Loupe IV" is spelled differently from df_1.
d <- rnorm(n = 3, mean = 3, sd = 2)
e <- rnorm(n = 3, mean = 1, sd = 3)
f <- rpois(n = 3, lambda = 3)
year <- rep(2014, times = 3)
player <- c(
  "Andrew Loupe IV",
  "Ben Crane",
  "Brendon de Jonge"
)

df_2 <- data.frame(player, year, d, e, f)

# Third data frame: four players with measurements g, h, i.
# Several names here are spelled differently from df_1.
g <- rnorm(n = 4, mean = 3, sd = 2)
h <- rnorm(n = 4, mean = 1, sd = 3)
i <- rpois(n = 4, lambda = 3)
year <- rep(2014, times = 4)
player <- c(
  "Aron Badelay",
  "Ben Crane 3",
  "Brendon de Jonge",
  "Ben Curt"
)

df_3 <- data.frame(player, year, g, h, i)

# Name dictionary: each row is one player, each column holds the spelling
# used by one of the data frames (a_to_c -> df_1, d_to_f -> df_2,
# g_to_i -> df_3).
a_to_c <- c(
  "Aaron Badaley", "CT", "Andrew Loupe", "Ben Crane",
  "Brendon de Jonge", "Ben Curtis", "Ben Martin"
)
d_to_f <- c(
  "Aron Badaley", "C.T.", "Andrew Loupe", "Ben Crane",
  "Brendon de Jonge", "Ben Curt", "Ben"
)
g_to_i <- c(
  "Aron Badelay", "CT", "Andrew Loupe", "Ben Crane 3",
  "Brendon de Jonge", "Ben Curt", "Ben Martin"
)

names_palyer <- data.frame(a_to_c, d_to_f, g_to_i)

These are the three data frames. They are of different lengths. Each contains names, years, and variables. For this example there is only one year, but in reality there are more years and I have thousands of observations. The main problem is that not all observations contained in df_1 have a matching observation in df_2 and/or df_3, but all observations in df_2 and df_3 should be in df_1. The matching data frame has more observations than any of the data frames, since it works like a dictionary for names.

This is what I tried to do:

# Attempt: fold df_2, df_3, ... into a copy of df_1 one at a time,
# translating player names through adjacent pairs of dictionary columns.
out <- data.table::copy(df_1)
for(i in 1:(ncol(names_palyer)-1)) {
  # Fetch the data frame named df_<i + 1> by name.
  tmp <- data.table::copy(get(paste0('df_', i + 1)))
  # Only dictionary columns i and i + 1 are used in this pass.
  keydat <- names_palyer[c(i, i + 1)]
  # Keep the dictionary rows whose column-(i + 1) spelling occurs in tmp.
  keydat <- keydat[keydat[[2]] %in% tmp$player,, drop = FALSE]
  i1 <- match(keydat[[2]], tmp$player, nomatch = 0) 
  
  # Rename the matched rows of tmp to the column-i spelling.
  tmp$player[i1] <- keydat[[1]]
  print(tmp)
  # Left join: all.x = TRUE keeps every row of out.
  out <- merge(out, tmp, by = c('player', 'year'), all.x = TRUE)
  i2 <- match(keydat[[1]], out$player, nomatch = 0)
  
  # Rename the matched rows of out to the column-(i + 1) spelling.
  # NOTE(review): players absent from df_<i + 1> keep their old spelling,
  # so the next pass presumably cannot match them -- this looks like the
  # cause of the missing df_3 values described below; confirm.
  out$player[i2] <- keydat[[2]][keydat[[1]] %in% out$player] 
}

library(dplyr)
library(purrr)
# Group the value columns (everything after the two key columns) by the
# part of their name before the first ".", collapse each group to a single
# column with coalesce(), and put the key columns back in front.
bind_cols(
  out[1:2],
  map_dfc(
    split.default(out[-(1:2)], sub("\\..*", "", names(out)[-(1:2)])),
    reduce,
    coalesce
  )
)

This output doesn't work, since it does not recognize the observations that are present in df_1 and df_3 but not in df_2.

This is the output I need:

head(out,1)
         player year        a        b c  d  e  f     g       h i
1 Aaron Badaley 2014 1.879049 2.382749 4 NA NA NA 0.829 -2.4966 1

If we change the code so that names are always mapped back to the first column of the dictionary (instead of the previous column), it seems to work:

# Merge df_2, df_3, ... onto a copy of df_1. For each data frame, the player
# spellings found in dictionary column i + 1 are translated back to the
# baseline spelling in dictionary column 1 before a left join on
# player and year.
out <- data.table::copy(df_1)
for (i in seq_len(ncol(names_palyer) - 1)) {
  tmp <- data.table::copy(get(paste0("df_", i + 1)))

  # Look every row of tmp up in the dictionary (rather than the other way
  # round) so that a player occurring on several rows, e.g. once per year,
  # is renamed on all of them.
  hit <- match(tmp$player, names_palyer[[i + 1]])
  known <- !is.na(hit)
  tmp$player[known] <- names_palyer[[1]][hit[known]]

  # Left join keeps every baseline row; unmatched rows get NA values.
  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)
}

Checking the result:

out[1, ]
#         player year        a        b c  d  e  f         g         h i
#1 Aaron Badaley 2014 1.879049 2.382749 4 NA NA NA 0.8286017 -2.496635 1

Or maybe we could bind the rows together, change the 'player' name, and then, grouped by 'player', replace the column values with the first non-NA value:

# Stack the three data frames, translate alias spellings to the baseline
# spelling (first for the df_2 aliases, then for the df_3 aliases), and
# after each translation fill the value columns with the first non-missing
# value in the player/year group. Duplicate rows are dropped at the end.
bind_rows(df_1, df_2, df_3) %>%
  mutate(
    player = recode(player, !!!setNames(names_palyer[[1]], names_palyer[[2]]))
  ) %>%
  group_by(player, year) %>%
  mutate(across(a:f, function(col) col[complete.cases(col)][1])) %>%
  mutate(
    player = recode(player, !!!setNames(names_palyer[[1]], names_palyer[[3]]))
  ) %>%
  group_by(player, year) %>%
  mutate(across(a:i, function(col) col[complete.cases(col)][1])) %>%
  ungroup() %>%
  distinct()
# A tibble: 7 x 11
#  player            year     a      b     c      d      e     f      g       h     i
#  <chr>            <dbl> <dbl>  <dbl> <int>  <dbl>  <dbl> <int>  <dbl>   <dbl> <int>
#1 Aaron Badaley     2014  1.88  2.38      4 NA     NA        NA  0.829 -2.50       1
#2 Andrew Loupe      2014  2.54 -2.80      4 NA     NA        NA NA     NA         NA
#3 Ben Crane         2014  6.12 -1.06      3  4.00  -0.418     2  2.83  -1.46       2
#4 Ben Curtis        2014  3.14 -0.337     3 NA     NA        NA  2.71   0.0398     4
#5 Ben Martin        2014  3.26  4.67      2 NA     NA        NA NA     NA         NA
#6 Brendon de Jonge  2014  6.43  2.08      1 -0.933 -2.20      1  5.14   3.05       2
#7 Andrew Loupe IV   2014 NA    NA        NA  6.57   3.10      2 NA     NA         NA

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM