简体   繁体   中英

data.table update last element in group based on condition

I have a data.table with 3 columns: id, time and status. For each id, I want to find the record with the maximum time - then if for that record, the status is true, I want to set it to false if the time is > 7 (for example). I am doing it in the following manner.

x <- data.table(id=c(1,1,2,2),time=c(5,6,7,8),status=c(FALSE,TRUE,FALSE,TRUE))
setkey(x,id,time)
y <- x[,.SD[.N],by=id]
x[y,status:=status & time > 7]

I have a lot of data I am working with and would like to speed up this operation. Any suggestions would be appreciated!

x[x[,.N, by=id][,cumsum(N)], status := status * time <=7]

If i am not mistaken, this is no join as x[,.N, by=id][,cumsum(N)] returns the row-indices of the last elements per group.

Update: After seeing the speed comparison this one seems the winner and should be listed first

This was my initial attempt which turns out to be the slowest of all suggested solutions

x[,status := c(.SD[-.N, status], .SD[.N, status * time <=7]), by=id]

One data.table approach is

x[ x[order(time), .I[.N], by=id]$V1 , status := ifelse(time > 7, FALSE, TRUE)]

> x
#   id time status
#1:  1    5  FALSE
#2:  1    6   TRUE
#3:  2    7  FALSE
#4:  2    8  FALSE

as x[order(time), .I[.N], by=id]$V1 gives us the row index of the maximum time for each group ( id )

And, borrowing from @Floo0's answer we can simplify it slightly to

x[ x[order(time), .I[.N], by=id]$V1 , status := status * time <= 7]

Speed Comparison

A speed test of the various answers (and keeping the keys on the data)

set.seed(123)
x <- data.table(id=c(rep(seq(1:10000), each=10)),
                time=c(rep(seq(1:10000), 10)),
                status=c(sample(c(TRUE, FALSE), 10000*10, replace=T)))
setkey(x,id,time)
x1 <- copy(x); x2 <- copy(x); x3 <- copy(x); x4 <- copy(x); x5 <- copy(x); x6 <- copy(x)

library(microbenchmark)

microbenchmark(

    Symbolix = {x1[ x1[order(time), .I[.N], by=id]$V1 , status := status * time < 7 ] },

    Floo0_1 = {x2[,status := c(.SD[-.N, status], .SD[.N, status * time > 7]), by=id]},

    Floo0_2 = {x3[x3[,.N, by=id][,cumsum(N)], status := status * time > 7]},

    Original = { 
                y <- x4[,.SD[.N],by=id]
                x4[y,status:=status & time > 7]
               },

    Frank = {
             y <- x5[, .SD[.N, .(time, status)], by=id][time > 7 & status]
             x5[y, status := FALSE]
             },

    thelatemail = {x6[ x6[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE]}
)

Unit: milliseconds
        expr         min          lq        mean      median          uq         max neval cld
    Symbolix    5.419768    5.857477    6.514111    6.222118    6.936000   11.284580   100 a  
     Floo0_1 4550.314775 4710.679867 4787.086279 4776.794072 4850.334011 5097.136148   100   c
     Floo0_2    1.653419    1.792378    1.945203    1.881609    2.014325    4.096006   100 a  
    Original   10.052947   10.986294   12.541595   11.431182   12.391287   89.494783   100 a  
       Frank 4609.115061 4697.687642 4743.886186 4735.086113 4785.212543 4932.270602   100  b 
 thelatemail   10.300864   11.594972   12.421889   12.315852   12.984146   17.630736   100 a  

Another attempt:

x[ x[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE]
x

#   id time status
#1:  1    5  FALSE
#2:  1    6   TRUE
#3:  2    7  FALSE
#4:  2    8  FALSE

Here's another way, similar to the OP's:

y = unique(x[,c("id","time"), with=FALSE], by="id", fromLast=TRUE)
x[y[time > 7], status := FALSE]

Here's another benchmark:

n_id = 1e3; n_col = 100; n_draw  = 5

set.seed(1)
X = data.table(id = 1:n_id)[, .(
    time    = sample(10,n_draw), 
    status  = sample(c(T,F), n_draw, replace=TRUE)
), by=id][, paste0("V",1:n_col) := 0]
setkey(X,id,time)

X1 = copy(X); X2 = copy(X); X3 = copy(X); X4 = copy(X)
X5 = copy(X); X6 = copy(X); X7 = copy(X); X8 = copy(X)

library(microbenchmark)
library(multcomp)

microbenchmark(
unique = {
    Y = unique(X1[,c("id","time"), with=FALSE], by="id", fromLast=TRUE)
    X1[Y[time > 7], status := FALSE]
},
OP = {
    y <- X2[,.SD[.N],by=id]
    X2[y,status:=status & time > 7]
},
Floo0a = X3[,status := c(.SD[-.N, status], .SD[.N, status * time >7]), by=id],
Floo0b = X4[X4[,.N, by=id][,cumsum(N)], status := status * time >7],
tlm = X5[ X5[,.I==.I[which.max(time)], by=id]$V1 & time > 7, status := FALSE],
Symbolix=X6[ X6[order(time), .I[.N], by=id]$V1 , status := ifelse(time > 7, FALSE, TRUE)],
Frank1 = {
    y <- X7[, .SD[.N, .(time, status)], by=id][time > 7 & status]
    X7[y, status := FALSE]
},
Frank2 = {
    y <- X8[, .SD[.N], by=id][time > 7 & status]
    X8[y, status := FALSE]
}, times = 1, unit = "relative")

Result:

     expr        min         lq       mean     median         uq        max neval
   unique   1.348592   1.348592   1.348592   1.348592   1.348592   1.348592     1
       OP  35.048724  35.048724  35.048724  35.048724  35.048724  35.048724     1
   Floo0a 416.175654 416.175654 416.175654 416.175654 416.175654 416.175654     1
   Floo0b   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000     1
      tlm   2.151996   2.151996   2.151996   2.151996   2.151996   2.151996     1
 Symbolix   1.770835   1.770835   1.770835   1.770835   1.770835   1.770835     1
   Frank1 404.045660 404.045660 404.045660 404.045660 404.045660 404.045660     1
   Frank2  36.603303  36.603303  36.603303  36.603303  36.603303  36.603303     1

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM