How to combine columns in R, setting values to NA when not equal

Question

Take this as a sample input:

library(tidyverse)

id <- c(1,2,3,4)
id2 <- c(5,6,7,8)
area <- c("Area1","Area2","Area3","Area4")
test <- c("A",NA,"B","C")
info1 <- c("N","N","H","H")
df1 <- data.frame(id,id2,area,test,info1)


id <- c(1,2,3,5)
id2 <- c(5,6,7,9)
area <- c("Area1","Area2","Area5","Area6")
test <- c("B",NA,"B","D")
info2 <- c("B","C","B","C")
df2 <- data.frame(id,id2,area,test,info2)

x <- list(df1, df2) %>% reduce(full_join, by = c("id","id2"))

Joining these produces area.x, area.y, test.x, test.y. How is it possible to combine these columns into area and test. Set to NA when the values don't match, or if one value is NA then set the value that is not NA.

Desired output:

id|id2|area |test|info1|info2
1 |5  |Area1|NA  |N    |B
2 |6  |Area2|NA  |N    |C
3 |7  |NA   |B   |H    |B
4 |8  |Area4|C   |H    |NA
5 |9  |Area6|D   |NA   |C

Answer 1

x$area = apply(with(x, data.frame(area.x, area.y)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x$test = apply(with(x, data.frame(test.x, test.y)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x$info = apply(with(x, data.frame(info1, info2)), 1, function(x) {
    if (is.na(x[1]) & is.na(x[2])) {
        return(NA)
    } else {
        if (!is.na(x[1]) & is.na(x[2])) return(x[1])
        if (is.na(x[1]) & !is.na(x[2])) return(x[2])        
    }
    if (x[1] == x[2]) {
        return(x[1])
    } else {
        return(NA)
    }
})

x = within(x, rm(area.x, area.y, test.x, test.y, info1, info2))

Answer 2

 library(tidyverse)  
  compare <- function(d){
      x <- select(d,ends_with("X"))
      y <- select(d,ends_with("y"))
      a <- map2_dfc(x,y,coalesce)
      a[x != y & !is.na(x)& !is.na(y)] <- NA
      d[names(y)] <- NULL
      d[names(x)] <- a
      rename_with(d,~sub("\\.x$","",.x))
    }

compare(x)
 id id2  area test info1 info2
1  1   5 Area1 <NA>     N     B
2  2   6 Area2 <NA>     N     C
3  3   7  <NA>    B     H     B
4  4   8 Area4    C     H  <NA>
5  5   9 Area6    D  <NA>     C

This can be written in a function:

join_dfs <- function(x, y, by){
  x %>%
    full_join(y,by = by)%>%
    cbind(map2_dfc(select(.,ends_with("x")),
                   select(.,ends_with("y")),
                   ~`is.na<-`(coalesce(.x,.y),.x!=.y))%>%
            rename_with(~sub("\\.x$","",.x)))%>%
    select(-matches("\\.(x|y)$"))
}

list(df1, df2) %>% reduce(join_dfs, by = c("id","id2"))
  id id2 info1 info2  area test
1  1   5     N     B Area1 <NA>
2  2   6     N     C Area2 <NA>
3  3   7     H     B  <NA>    B
4  4   8     H  <NA> Area4    C
5  5   9  <NA>     C Area6    D

Answer 3

Your data:

library(dplyr)

id <- c(1,2,3,4)
id2 <- c(5,6,7,8)
area <- c("Area1","Area2","Area3","Area4")
test <- c("A",NA,"B","C")
info1 <- c("N","N","H","H")
df1 <- data.frame(id,id2,area,test,info1)


id <- c(1,2,3,5)
id2 <- c(5,6,7,9)
area <- c("Area1","Area2","Area5","Area6")
test <- c("B",NA,"B","D")
info2 <- c("B","C","B","C")
df2 <- data.frame(id,id2,area,test,info2)


x <- full_join(df1, df2, by = c("id","id2"))

Solution:

coalesce_unless <- function(x, y){
  
  cxy <- coalesce(x, y)
  cyx <- coalesce(y, x)
  
  cxy[cxy != cyx] <- NA
  
  cxy
  
}

x %>% 
  mutate(area = coalesce_unless(area.x, area.y),
         test = coalesce_unless(test.x, test.y)) %>% 
  select(id, id2, area, test, info1, info2)


#   id id2  area test info1 info2
# 1  1   5 Area1 <NA>     N     B
# 2  2   6 Area2 <NA>     N     C
# 3  3   7  <NA>    B     H     B
# 4  4   8 Area4    C     H  <NA>
# 5  5   9 Area6    D  <NA>     C

What do you think? If you have dplyr version 1 I think it can be improved with across()

Answer 4

Here is another long alternative from dplyr package.

df1$area <- as.character(df1$area)
df2$area <- as.character(df2$area)
df1$test <- as.character(df1$test)
df2$test <- as.character(df2$test)

library(dplyr)
bind_rows(df1, df2) %>% 
    group_by(id, id2) %>% 
    mutate(area = case_when(
        length(unique(area)) > 1 ~ NA_character_,
        TRUE ~ unique(area))) %>% 
    mutate(test = case_when(
        length(unique(test)) > 1 ~ NA_character_,
        TRUE ~ unique(test))) %>% 
    ungroup() %>% 
    distinct(id, id2, area, test) %>%
    left_join(
        bind_rows(df1, df2) %>% 
        select(id, id2, info1) %>% 
        na.omit(), 
        by = c("id", "id2")
    ) %>% 
    left_join(
        bind_rows(df1, df2) %>% 
        select("id", "id2", info2) %>% 
        na.omit(),
        by = c("id", "id2")
    )


#     id   id2 area  test  info1 info2
#  <dbl> <dbl> <chr> <chr> <fct> <fct>
#1     1     5 Area1 NA    N     B    
#2     2     6 Area2 NA    N     C    
#3     3     7 NA    B     H     B    
#4     4     8 Area4 C     H     NA   
#5     5     9 Area6 D     NA    C

How to combine columns in R, setting values to NA when not equal

Question

4 answers

solution1
0 2020-07-28 17:16:20

solution2
0 2020-07-28 17:39:37

solution3
0 2020-07-28 17:40:26

solution4
0 2020-07-29 10:06:13

How to combine columns in R, setting values to NA when not equal

Question

4 answers

solution1 0 2020-07-28 17:16:20

solution2 0 2020-07-28 17:39:37

solution3 0 2020-07-28 17:40:26

solution4 0 2020-07-29 10:06:13

solution1
0 2020-07-28 17:16:20

solution2
0 2020-07-28 17:39:37

solution3
0 2020-07-28 17:40:26

solution4
0 2020-07-29 10:06:13