简体   繁体   中英

How to combine columns in R, setting values to NA when not equal

Take this as a sample input:

library(tidyverse)

id <- c(1,2,3,4)
id2 <- c(5,6,7,8)
area <- c("Area1","Area2","Area3","Area4")
test <- c("A",NA,"B","C")
info1 <- c("N","N","H","H")
df1 <- data.frame(id,id2,area,test,info1)


id <- c(1,2,3,5)
id2 <- c(5,6,7,9)
area <- c("Area1","Area2","Area5","Area6")
test <- c("B",NA,"B","D")
info2 <- c("B","C","B","C")
df2 <- data.frame(id,id2,area,test,info2)

x <- list(df1, df2) %>% reduce(full_join, by = c("id","id2"))

Joining these produces area.x, area.y, test.x, test.y. How is it possible to combine these columns into area and test. Set to NA when the values don't match, or if one value is NA then set the value that is not NA.

Desired output:

id|id2|area |test|info1|info2
1 |5  |Area1|NA  |N    |B
2 |6  |Area2|NA  |N    |C
3 |7  |NA   |B   |H    |B
4 |8  |Area4|C   |H    |NA
5 |9  |Area6|D   |NA   |C
x$area = apply(with(x, data.frame(area.x, area.y)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x$test = apply(with(x, data.frame(test.x, test.y)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x$info = apply(with(x, data.frame(info1, info2)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x = within(x, rm(area.x, area.y, test.x, test.y, info1, info2))
 library(tidyverse)  
  compare <- function(d){
      x <- select(d,ends_with("X"))
      y <- select(d,ends_with("y"))
      a <- map2_dfc(x,y,coalesce)
      a[x != y & !is.na(x)& !is.na(y)] <- NA
      d[names(y)] <- NULL
      d[names(x)] <- a
      rename_with(d,~sub("\\.x$","",.x))
    }

compare(x)
 id id2  area test info1 info2
1  1   5 Area1 <NA>     N     B
2  2   6 Area2 <NA>     N     C
3  3   7  <NA>    B     H     B
4  4   8 Area4    C     H  <NA>
5  5   9 Area6    D  <NA>     C

This can be written in a function:

join_dfs <- function(x, y, by){
  x %>%
    full_join(y,by = by)%>%
    cbind(map2_dfc(select(.,ends_with("x")),
                   select(.,ends_with("y")),
                   ~`is.na<-`(coalesce(.x,.y),.x!=.y))%>%
            rename_with(~sub("\\.x$","",.x)))%>%
    select(-matches("\\.(x|y)$"))
}

list(df1, df2) %>% reduce(join_dfs, by = c("id","id2"))
  id id2 info1 info2  area test
1  1   5     N     B Area1 <NA>
2  2   6     N     C Area2 <NA>
3  3   7     H     B  <NA>    B
4  4   8     H  <NA> Area4    C
5  5   9  <NA>     C Area6    D

Your data:

library(dplyr)

id <- c(1,2,3,4)
id2 <- c(5,6,7,8)
area <- c("Area1","Area2","Area3","Area4")
test <- c("A",NA,"B","C")
info1 <- c("N","N","H","H")
df1 <- data.frame(id,id2,area,test,info1)


id <- c(1,2,3,5)
id2 <- c(5,6,7,9)
area <- c("Area1","Area2","Area5","Area6")
test <- c("B",NA,"B","D")
info2 <- c("B","C","B","C")
df2 <- data.frame(id,id2,area,test,info2)


x <- full_join(df1, df2, by = c("id","id2"))

Solution:

coalesce_unless <- function(x, y){
  
  cxy <- coalesce(x, y)
  cyx <- coalesce(y, x)
  
  cxy[cxy != cyx] <- NA
  
  cxy
  
}

x %>% 
  mutate(area = coalesce_unless(area.x, area.y),
         test = coalesce_unless(test.x, test.y)) %>% 
  select(id, id2, area, test, info1, info2)


#   id id2  area test info1 info2
# 1  1   5 Area1 <NA>     N     B
# 2  2   6 Area2 <NA>     N     C
# 3  3   7  <NA>    B     H     B
# 4  4   8 Area4    C     H  <NA>
# 5  5   9 Area6    D  <NA>     C

What do you think? If you have dplyr version 1 I think it can be improved with across()

Here is another long alternative from dplyr package.

df1$area <- as.character(df1$area)
df2$area <- as.character(df2$area)
df1$test <- as.character(df1$test)
df2$test <- as.character(df2$test)

library(dplyr)
bind_rows(df1, df2) %>% 
    group_by(id, id2) %>% 
    mutate(area = case_when(
        length(unique(area)) > 1 ~ NA_character_,
        TRUE ~ unique(area))) %>% 
    mutate(test = case_when(
        length(unique(test)) > 1 ~ NA_character_,
        TRUE ~ unique(test))) %>% 
    ungroup() %>% 
    distinct(id, id2, area, test) %>%
    left_join(
        bind_rows(df1, df2) %>% 
        select(id, id2, info1) %>% 
        na.omit(), 
        by = c("id", "id2")
    ) %>% 
    left_join(
        bind_rows(df1, df2) %>% 
        select("id", "id2", info2) %>% 
        na.omit(),
        by = c("id", "id2")
    )


#     id   id2 area  test  info1 info2
#  <dbl> <dbl> <chr> <chr> <fct> <fct>
#1     1     5 Area1 NA    N     B    
#2     2     6 Area2 NA    N     C    
#3     3     7 NA    B     H     B    
#4     4     8 Area4 C     H     NA   
#5     5     9 Area6 D     NA    C   

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM