Say I have a dataframe df
resident faculty submittedBy match caseID phase
george sally george 1 george_1 pre
george sally sally 0 george_1 pre
george sally george 1 george_1 intra
jane carl jane 1 jane_1 pre
jane carl carl 0 jane_1 pre
jane carl carl 0 jane_1 intra
and I want to add a column df$response
to this dataframe according to the following parameters (I'm thinking I need a set of nested ifelses, but I'm struggling to execute it correctly):
For a given row X where df$match = 1, print "1" in df$response if any row
where df$match = 0 has the same contents in df$caseID, df$faculty, and
df$phase as row X. Otherwise (including every row where df$match = 0) print "0".
So the output should be this:
response
1
0
0
1
0
0
because only the first and fourth rows contain values for which there are matches in df$caseID
, df$faculty
, and df$phase
for both a row where df$match
= 1 and a row where df$match
= 0.
We can use data.table methods. Convert the 'data.frame' to a 'data.table'
(setDT(df1)); then, grouped by 'caseID', 'faculty' and 'phase', get the number
of unique elements of match, check whether it is equal to 2, and create a
binary column ('response'). For rows where 'match' is 0, set 'response' to 0.
library(data.table)
# Turn df1 into a data.table by reference.
setDT(df1)
# Within each (caseID, faculty, phase) group, response is 1 when the group
# holds exactly two distinct `match` values and the row's own `match` is
# non-zero; otherwise it is 0. The trailing [] prints the updated table.
df1[
  ,
  response := as.integer(uniqueN(match) == 2 & match != 0),
  by = .(caseID, faculty, phase)
][]
# resident faculty submittedBy match caseID phase response
#1: george sally george 1 george_1 pre 1
#2: george sally sally 0 george_1 pre 0
#3: george sally george 1 george_1 intra 0
#4: jane carl jane 1 jane_1 pre 1
#5: jane carl carl 0 jane_1 pre 0
#6: jane carl carl 0 jane_1 intra 0
Or using base R
with ave
# Evaluate against the columns of df1: count the distinct `match` values in
# every (caseID, faculty, phase) group, then flag the rows whose group has two
# distinct values and whose own `match` is non-zero.
with(df1, {
  distinct_per_group <- ave(
    match, caseID, faculty, phase,
    FUN = function(v) length(unique(v))
  )
  as.integer(distinct_per_group == 2 & match != 0)
})
#[1] 1 0 0 1 0 0
# Example data: five factor columns plus the integer `match` column, six rows.
# factor() sorts its levels alphabetically, which reproduces the level order
# (and therefore the integer codes) of the original dput() output.
df1 <- data.frame(
  resident = factor(c("george", "george", "george", "jane", "jane", "jane")),
  faculty = factor(c("sally", "sally", "sally", "carl", "carl", "carl")),
  submittedBy = factor(
    c("george", "sally", "george", "jane", "carl", "carl")
  ),
  match = c(1L, 0L, 1L, 1L, 0L, 0L),
  caseID = factor(
    c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1")
  ),
  phase = factor(c("pre", "pre", "intra", "pre", "pre", "intra"))
)
Here is how I'd do it
# Read the example data.
test <- read.table(text = "resident faculty submittedBy match caseID phase
george sally george 1 george_1 pre
george sally sally 0 george_1 pre
george sally george 1 george_1 intra
jane carl jane 1 jane_1 pre
jane carl carl 0 jane_1 pre
jane carl carl 0 jane_1 intra", header = TRUE)

# For every row, record whether any row sharing its faculty, caseID and phase
# has match == 0. Looking for an actual match == 0 partner (rather than any
# duplicated key) keeps two match != 0 rows with equal keys from flagging each
# other.
group_has_zero <- ave(
  test$match == 0, test$faculty, test$caseID, test$phase,
  FUN = any
)

# Create the response: 1 for a row with a non-zero match whose group also
# contains a match == 0 row, 0 for everything else. Kept numeric (0/1), as the
# loop-based version produced.
resp <- as.numeric(test$match != 0 & group_has_zero)
Indexing using []
is much faster, and less costly on your machine
# Example data; strings are kept as character vectors.
df <- data.frame(
  resident = c("george", "george", "george", "jane", "jane", "jane"),
  faculty = c("sally", "sally", "sally", "carl", "carl", "carl"),
  submittedBy = c("george", "sally", "george", "jane", "carl", "carl"),
  match = c(1, 0, 1, 1, 0, 0),
  caseID = c("george_1", "george_1", "george_1", "jane_1", "jane_1", "jane_1"),
  phase = c("pre", "pre", "intra", "pre", "pre", "intra"),
  stringsAsFactors = FALSE
)

# One composite key per row, built once. The unit-separator character between
# the parts stops different (caseID, faculty, phase) triples from pasting to
# the same string, which an empty separator allows (e.g. "a" + "bc" vs
# "ab" + "c").
row_key <- paste(df$caseID, df$faculty, df$phase, sep = "\037")

# Keys that occur on at least one match == 0 row.
zero_keys <- row_key[df$match == 0]

# response is 1 for a non-zero match row whose key also appears among the
# match == 0 rows, and 0 otherwise (numeric 0/1).
response <- as.numeric(df$match != 0 & row_key %in% zero_keys)
response
[1] 1 0 0 1 0 0
Another data.table approach. Join on the key variables and check if the values are not in the match==0
set:
library(data.table)
setDT(dat)

# Columns that must agree for two rows to count as the same case.
key_cols <- c("caseID", "faculty", "phase")

# Start with every match == 1 row marked TRUE and every other row FALSE.
dat[, response := match == 1]

# Rows with match == 0; their key values define the groups that qualify.
zero_rows <- dat[match == 0]

# Not-join: rows whose key values do not occur among the match == 0 rows are
# reset to FALSE by reference.
dat[!zero_rows, on = key_cols, response := FALSE]
dat
# resident faculty submittedBy match caseID phase response
#1: george sally george 1 george_1 pre TRUE
#2: george sally sally 0 george_1 pre FALSE
#3: george sally george 1 george_1 intra FALSE
#4: jane carl jane 1 jane_1 pre TRUE
#5: jane carl carl 0 jane_1 pre FALSE
#6: jane carl carl 0 jane_1 intra FALSE
Assuming you have only 1 and 0 values in match, one way with dplyr would be to
check, for every combination of caseID, faculty and phase, whether there are
two distinct values in match (1 and 0), and then set response to 0 where match
is 0.
library(dplyr)
# Per (caseID, faculty, phase) group: response starts as 1 when the group has
# two distinct `match` values, then rows whose own `match` is 0 are set to 0.
# ungroup() drops the grouping afterwards so later verbs on the result are not
# silently applied per group.
df %>%
  group_by(caseID, faculty, phase) %>%
  mutate(
    response = as.integer(n_distinct(match) == 2),
    response = replace(response, match == 0, 0)
  ) %>%
  ungroup()
# resident faculty submittedBy match caseID phase response
# <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
#1 george sally george 1 george_1 pre 1
#2 george sally sally 0 george_1 pre 0
#3 george sally george 1 george_1 intra 0
#4 jane carl jane 1 jane_1 pre 1
#5 jane carl carl 0 jane_1 pre 0
#6 jane carl carl 0 jane_1 intra 0
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.