r中data.table中的组累积地识别新值

Question

如何创建一个新列，通过Year + Month的唯一梳子组累计标识Letter列中的新值外观？

数据样本。

require(data.table)
dt <- data.table(Letter = c(LETTERS[c(5, 1:2, 1:2, 1:4, 3:6)]),
                 Year = 2018,
                 Month = c(rep(5,5), rep(6,4), rep(7,4)))

打印。

    Letter Year Month
 1:      E 2018     5
 2:      A 2018     5
 3:      B 2018     5
 4:      A 2018     5
 5:      B 2018     5
 6:      A 2018     6
 7:      B 2018     6
 8:      C 2018     6
 9:      D 2018     6
10:      C 2018     7
11:      D 2018     7
12:      E 2018     7
13:      F 2018     7

结果我试图得到：

    Letter Year Month   New
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

详细问题：

默认情况下，Group1（“E”，“A”，“B”，“A”，“B”）都为TRUE，无法与之比较。
group1中的哪个字母（“A”，“B”，“C”，“D”）在group1中不重复。
然后，group3中的哪个字母（“C”，“D”，“E”，“F”）在第1组和第2组（“E”，“A”，“B”，“A”，“B”中不重复“， “A B C D”）。

Answer 1

初始化为FALSE; 然后加入每个字母的第一个年月，并更新为TRUE：

dt[, v := FALSE]
dt[unique(dt, by="Letter"), on=.(Letter, Year, Month), v := TRUE][]

    Letter Year Month     v
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

Answer 2

只是：

 # dt[,new := ifelse(Letter %in% dt$Letter[dt$Month<Month],F,T), by="Month"][]

 #   Letter Year Month   new
 #1:      E 2018     5  TRUE
 #2:      A 2018     5  TRUE
 #3:      B 2018     5  TRUE
 #4:      A 2018     5  TRUE
 #5:      B 2018     5  TRUE
 #6:      A 2018     6 FALSE
 #7:      B 2018     6 FALSE
 #8:      C 2018     6  TRUE
 #9:      D 2018     6  TRUE
#10:      C 2018     7 FALSE
#11:      D 2018     7 FALSE
#12:      E 2018     7 FALSE
#13:      F 2018     7  TRUE

有了David A.的非常有效的评论，一个更快，更简洁的版本:( 推荐）

dt[, new := !(Letter %in% dt$Letter[dt$Month<Month]), by=Month][]

Answer 3

另一种可行方法：

dupes <- c()
dt[, New := {
    x <- !Letter %chin% dupes
    dupes <- c(dupes, unique(Letter[x]))
    x
}, by=.(Year, Month)]

一些时间供以下参考：

如果Letter是整数：

library(microbenchmark)
microbenchmark(mtd0=dt0[, New := !(Letter %in% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %in% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

整数时序输出：

Unit: milliseconds
 expr       min       lq      mean    median        uq      max neval
 mtd0 1293.3100 1318.775 1331.7129 1344.2398 1350.9143 1357.589     3
 mtd1  377.1534  391.178  402.4423  405.2026  415.0868  424.971     3
 mtd2 2015.2115 2020.926 2023.7209 2026.6400 2027.9756 2029.311     3

如果Letter是一个字符：

microbenchmark(mtd0=dt0[, New := !(Letter %chin% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %chin% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

定时输出：

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
 mtd0 1658.5806 1689.8941 1765.9329 1721.2076 1819.6090 1918.0105     3
 mtd1  849.2361  851.1807  852.8632  853.1253  854.6768  856.2283     3
 mtd2  420.1013  426.0941  433.9202  432.0869  440.8296  449.5723     3

校验：

> identical(dt2$New, dt1$v)
[1] TRUE
> identical(dt0$New, dt1$v)
[1] FALSE

数据：

set.seed(0L)
nr <- 1e7
dt <- unique(data.table(Letter=sample(nr/1e2, nr, replace=TRUE),
    Year=sample(2014:2018, nr, replace=TRUE),
    Month=sample(1:12, nr, replace=TRUE)))
setorder(dt, Year, Month)#[, Letter := as.character(Letter)]
dt0 <- copy(dt)
dt1 <- copy(dt)
dt2 <- copy(dt)

#for seed=0L, dt has about 4.8mio rows

r中data.table中的组累积地识别新值

问题描述

3 个解决方案

解决方案1
5 已采纳 2018-09-20 14:34:59

解决方案2
3 2018-09-20 11:16:43

解决方案3
2 2018-09-20 12:57:28

r中data.table中的组累积地识别新值

问题描述

3 个解决方案

解决方案1 5 已采纳 2018-09-20 14:34:59

解决方案2 3 2018-09-20 11:16:43

解决方案3 2 2018-09-20 12:57:28

解决方案1
5 已采纳 2018-09-20 14:34:59

解决方案2
3 2018-09-20 11:16:43

解决方案3
2 2018-09-20 12:57:28