简体   繁体   中英

Matching values in multiple columns in R based on condition

Say I have a datafame df

resident    faculty    submittedBy    match    caseID    phase

george      sally      george         1        george_1  pre
george      sally      sally          0        george_1  pre
george      sally      george         1        george_1  intra
jane        carl       jane           1        jane_1    pre
jane        carl       carl           0        jane_1    pre
jane        carl       carl           0        jane_1    intra

and I want to add a column df$response to this dataframe according to the following parameters (I'm thinking I need a set of nested ifelses, but I'm struggling to execute it correctly):

For a given row X, if df$match = 1,

print "1" in df$response if:

any row in df$match where df$match = 0 has the same contents in df$caseID , df$faculty , and df$phase as row X. Otherwise print "0".

So the output should be this:

response

1
0
0
1
0
0

because only the first and fourth rows contain values for which there are matches in df$caseID , df$faculty , and df$phase for both a row where df$match = 1 and a row where df$match = 0.

We can use data.table methods. Convert the 'data.frame' to 'data.table' ( setDT(df1) ), grouped by 'caseID', 'faculty', 'phase', get the length of unique elements of match check if it is equal to 2 and create a binary column ('response'), For values where 'match' is 0', assign the 'response' to 0

library(data.table)
setDT(df1)[, response := +((uniqueN(match) == 2) & match != 0), 
                  .(caseID, faculty, phase)][]
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre        1
#2:   george   sally       sally     0 george_1   pre        0
#3:   george   sally      george     1 george_1 intra        0
#4:     jane    carl        jane     1   jane_1   pre        1
#5:     jane    carl        carl     0   jane_1   pre        0
#6:     jane    carl        carl     0   jane_1 intra        0

Or using base R with ave

with(df1,+( match != 0 & ave(match, caseID, faculty, phase, 
         FUN = function(x) length(unique(x))) == 2))
#[1] 1 0 0 1 0 0

data

df1 <- structure(list(resident = structure(c(1L, 1L, 1L, 2L, 2L, 2L), 
.Label = c("george", 
"jane"), class = "factor"), faculty = structure(c(2L, 2L, 2L, 
1L, 1L, 1L), .Label = c("carl", "sally"), class = "factor"), 
    submittedBy = structure(c(2L, 4L, 2L, 3L, 1L, 1L), .Label = c("carl", 
    "george", "jane", "sally"), class = "factor"), match = c(1L, 
    0L, 1L, 1L, 0L, 0L), caseID = structure(c(1L, 1L, 1L, 2L, 
    2L, 2L), .Label = c("george_1", "jane_1"), class = "factor"), 
    phase = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("intra", 
    "pre"), class = "factor")), class = "data.frame", row.names = c(NA, 
-6L))

Here is how I'd do it

# read the data
test <- read.table(text = 'resident    faculty    submittedBy    match    caseID    phase
                   george      sally      george         1        george_1  pre
                   george      sally      sally          0        george_1  pre
                   george      sally      george         1        george_1  intra
                   jane        carl       jane           1        jane_1    pre
                   jane        carl       carl           0        jane_1    pre
                   jane        carl       carl           0        jane_1    intra', header=T)

# create the response
resp <- logical(0)

# iterate over each loop
for (rr in 1:nrow(test)){
  if (test$match[rr] == 0){
    resp[rr] <- 0
  }
  else{
    tmp <- rbind(test[-rr, c('faculty', 'caseID', 'phase')],  # add the onto the end
                 test[rr, c('faculty', 'caseID', 'phase')])   # test if line is duplicated
    resp[rr] <- ifelse(duplicated(tmp)[nrow(tmp)], 1, 0)
  }
}

Indexing using [] is much faster, and less costly on your machine

df <- data.frame(
  "resident" = c("george","george","george","jane","jane","jane"),
  "faculty" = c("sally","sally","sally","carl","carl","carl"),
  "submittedBy" = c("george","sally","george","jane","carl","carl"),
  "match" = c(1,0,1,1,0,0),
  "caseID" = c("george_1","george_1","george_1","jane_1","jane_1","jane_1"),
  "phase" = c("pre","pre","intra","pre","pre","intra"),
  stringsAsFactors = FALSE
  )

response <- NULL

for (i in 1:nrow(df)) {
  response[i] <- ifelse(
    df$match[i] == 0, 0,
    ifelse(
      any(paste(df$caseID,df$faculty,df$phase,sep="")[df$match == 0] == 
            paste(df$caseID,df$faculty,df$phase,sep="")[i]),
      1, 0
    )
  )
}

response
[1] 1 0 0 1 0 0

Another data.table approach. Join on the key variables and check if the values are not in the match==0 set:

library(data.table)
setDT(dat)

dat[, response := match==1]
dat[!dat[match==0], on=c("caseID","faculty","phase"), response := FALSE]

dat
#   resident faculty submittedBy match   caseID phase response
#1:   george   sally      george     1 george_1   pre     TRUE
#2:   george   sally       sally     0 george_1   pre    FALSE
#3:   george   sally      george     1 george_1 intra    FALSE
#4:     jane    carl        jane     1   jane_1   pre     TRUE
#5:     jane    carl        carl     0   jane_1   pre    FALSE
#6:     jane    carl        carl     0   jane_1 intra    FALSE

Assuming you have only 1 and 0 values in match , one way with dplyr would be to check for every caseID , faculty and phase if there is two distinct values in match (1 and 0) and replace the response to 0 where match is 0.

library(dplyr)
df %>%
  group_by(caseID, faculty, phase) %>%
  mutate(response = as.integer(n_distinct(match) == 2),
         response = replace(response, match == 0, 0))

#  resident faculty submittedBy match caseID   phase response
#  <chr>    <chr>   <chr>       <dbl> <chr>    <chr>    <dbl>
#1 george   sally   george          1 george_1 pre          1
#2 george   sally   sally           0 george_1 pre          0
#3 george   sally   george          1 george_1 intra        0
#4 jane     carl    jane            1 jane_1   pre          1
#5 jane     carl    carl            0 jane_1   pre          0
#6 jane     carl    carl            0 jane_1   intra        0

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM