如何根據特定條件刪除行並連接兩個數據框？

Question

這是一組 2 個數據框。

id <- c(1,2,3,4)
id2 <- c(5,6,7,8)
list <- c("list1","list2","list3","list4")
progress <- c("A", "A", "B", "C")
grade <- c("A", NA, "B", "C")
df1 <- data.frame(id, id2, list, progress, grade)
df1

id <- c(1,2,3,5)
id2 <- c(5,6,7,9)
list <- c("list1","list2","list5","list6")
progress <- c("B", "B", "A", "D")
grade2 <- c("B", NA, "B", "D")
df2 <- data.frame(id, id2, list, progress, grade2)
df2

我希望以這樣的方式組合df1和df2 ，

a) 對於列list ，如果id和id2的值重復，則list對應的值也應該匹配。 否則該值應返回NA 。 此條件不適用於id和id2的唯一值。

b) progress列如果id和id2有重復的值，則取第一次出現的值。

c) 對於列grade和grade2 ，如果id和id2的值重復，則在這種情況下必須刪除NA 。

預期的 output 如下：-

   #id id2  list   progress grade grade2
   #1   5   list1        A     A   B
   #2   6   list2        A    NA   NA
   #3   7   NA           B     B   B
   #4   8   list4        C     C   NA
   #5   9   list6        D    NA   D

Answer 1

由於您的初始數據結構，這個答案相當復雜，但這是我使用dplyr中的工具的解決方案：

library(dplyr)
# Bind the rows of the two dataframes together
bind_rows(df1, df2) %>%
    # a) For each pair of id and id2...
    group_by(id, id2) %>% 
    # ...when there is more than one list, set to NA, otherwise, take the value
    mutate(list = case_when(length(unique(list)) > 1 ~ NA_character_, 
                            TRUE ~ unique(list))) %>% 
    # b) Take the first occurring progress value (still for each id, id2 pair)
    mutate(progress = progress[1]) %>% 
    ungroup() %>% 
    # Keep distinct pairs
    distinct(id, id2, list, progress) %>% 
    # c)
    # Create a smaller data set of the non-NA grade for the id, id2 pairs
    # Joint it onto the larger data set
    left_join(
        bind_rows(df1, df2) %>% 
        select(id, id2, grade) %>% 
        na.omit(),
        by = c("id", "id2")
    ) %>% 
    # c continued)
    # Create a smaller data set of the non-NA grade2 for the id, id2 pairs
    # Joint it onto the larger data set
    left_join(
        bind_rows(df1, df2) %>% 
        select(id, id2, grade2) %>% 
        na.omit(),
        by = c("id", "id2")
    )

Answer 2

“第一”困擾着我，但這似乎符合您想要的 output：

library(tidyverse)

bind_rows(
  left_join(df1, df2, by = c('id', 'id2', 'list', 'progress'), ),
  anti_join(df2, df1, by = c('id', 'id2', 'list', 'progress'))
  ) %>%
  group_by(id, id2) %>%
  mutate(
    list     = ifelse(n_distinct(list) > 1, NA, list),
    progress = first(progress),
    grade    = first(grade),
    grade2   = first(na.omit(grade2))
  ) %>%
  ungroup() %>%
  distinct()

Output：

# # A tibble: 5 x 6
#      id   id2 list  progress grade grade2
#   <dbl> <dbl> <chr> <chr>    <chr> <chr> 
# 1     1     5 list1 A        A     B     
# 2     2     6 list2 A        NA    NA    
# 3     3     7 NA    B        B     B     
# 4     4     8 list4 C        C     NA    
# 5     5     9 list6 D        NA    D

數據：

df1 <- data.frame(
  id       = 1:4,
  id2      = 5:8,
  list     = paste0('list', 1:4),
  progress = c('A', 'A', 'B', 'C'),
  grade    = c('A', NA, 'B', 'C'),
  stringsAsFactors = FALSE
)

df2 <- data.frame(
  id       = c(1, 2, 3, 5),
  id2      = c(5, 6, 7, 9),
  list     = paste0('list', c(1, 2, 5, 6)),
  progress = c('B', 'B', 'A', 'D'),
  grade2   = c('B', NA, 'B', 'D'),
  stringsAsFactors = FALSE
)

Answer 3

這是dplyr和purrr package 的另一種選擇。

library(purrr)
library(dplyr)

x <- list(df1, df2) %>% reduce(full_join, by = c("id","id2"))

x$progress.x <- as.character(x$progress.x)    
x$progress.y <- as.character(x$progress.y)

coalesce_unless <- function(x, y){
    
    cxy <- coalesce(x, y)
    cyx <- coalesce(y, x)
    
    cxy[cxy != cyx] <- NA
    
    cxy
    
}

x %>% 
    mutate(list = coalesce_unless(list.x, list.y)) %>%
    mutate(progress = ifelse(is.na(x$progress.x), x$progress.y, x$progress.x))%>% 
    select(id, id2, list, progress, grade, grade2)

#   id id2 list progress grade grade2
#1  1   5 list1        A     A      B
#2  2   6 list2        A  <NA>   <NA>
#3  3   7  <NA>        B     B      B
#4  4   8 list4        C     C   <NA>
#5  5   9 list6        D  <NA>      D

如何根據特定條件刪除行並連接兩個數據框？

問題描述

3 個解決方案

解決方案1
2 已采納 2020-07-29 07:43:47

解決方案2
2 2020-07-29 08:33:23

解決方案3
1 2020-07-29 10:57:13

如何根據特定條件刪除行並連接兩個數據框？

問題描述

3 個解決方案

解決方案1 2 已采納 2020-07-29 07:43:47

解決方案2 2 2020-07-29 08:33:23

解決方案3 1 2020-07-29 10:57:13

解決方案1
2 已采納 2020-07-29 07:43:47

解決方案2
2 2020-07-29 08:33:23

解決方案3
1 2020-07-29 10:57:13