繁体   English   中英

如何按组有效地聚合多个 data.table 列,一次 N,其中 N 是可变的

[英]How to efficiently aggregate multiple data.table columns by groups, N-at-a-time, where N is variable

我使用的数据集结构如下:

library("data.table")

   dt <- data.table(Id = c(1,2,3,4,5,6,7,8), "01.01.2005" = c(10,0,0,0,5,7,7,0),
 "02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0), 
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2), 
"06.01.2005" = c(45,15,45,54,5,76,67,10), "07.01.2005" = c(0,0,0,0,466,21,832,54), 
"08.01.2005" = c(2,3,734,2,3,4,5,6))

>dt
       Id 01.01.2005 02.01.2005 03.01.2005 04.01.2005 05.01.2005 06.01.2005 07.01.2005 08.01.2005
    1:  1         10          4          2          0          0         45          0          2
    2:  2          0          5          3          0          1         15          0          3
    3:  3          0          7          3          0          1         45          0        734
    4:  4          0          0          6          0          1         54          0          2
    5:  5          5          0          7          0          1          5        466          3
    6:  6          7          0         77          0          0         76         21          4
    7:  7          7          9         45          0          0         67        832          5
    8:  8          0          9          0          1          2         10         54          6

我需要按某个步骤对一定数量的列进行求和。 即,如果 step = 2,则聚合列 (2,3),(4,5)、(6,7) 和 (8,9) 也是如此。 输出需要如下所示:

>output
       Id 01.01.2005-02.01.2005 03.01.2005-04.01.2005 05.01.2005-06.01.2005 07.01.2005-08.01.2005
    1:  1                    14                     2                    45                     2
    2:  2                     5                     3                    16                     3
    3:  3                     7                     3                    46                   734
    4:  4                     0                     6                    55                     2
    5:  5                     5                     7                     6                   469
    6:  6                     7                    77                    76                    25
    7:  7                    16                    45                    67                   837
    8:  8                     9                     1                    12                    60

为了实现这一点,我使用了一个循环:

output <- dt[, list(Id)]
step = 2
for(i in seq(nrow(dt), 2, by = -step)){
     output <- cbind(output, temp.col = rowSums(dt[, i:(i-step+1), with = F], 
                      na.rm = FALSE, dims = 1))
     setnames(output, "temp.col", "new.name...")
}

但是对于大型数据集,这种方法非常慢。 是否存在可以在没有循环的情况下完成我需要的功能?

此外:“步骤”需要是一个变量输入。

提前致谢

这是另一种可能的方法

step <- 2
temp <- melt(dt, "Id")[, indx := rep(seq_len((ncol(dt)-1L)/step), each = nrow(dt)*step)]
dcast(temp, Id ~ indx, sum, value.var = "value")
#    Id  1  2  3   4
# 1:  1 14  2 45   2
# 2:  2  5  3 16   3
# 3:  3  7  3 46 734
# 4:  4  0  6 55   2
# 5:  5  5  7  6 469
# 6:  6  7 77 76  25
# 7:  7 16 45 67 837
# 8:  8  9  1 12  60

你可以试试

f1 <- function(DT, step){
  m1 <- as.matrix(DT[,-1])
  lst <- lapply(seq(step), function(i) seq(i, ncol(m1), by = step))
  nm1 <- do.call(paste, c(lapply(lst, function(x) colnames(m1)[x]), sep="_"))
  res <- cbind(Id= DT[["Id"]], as.data.table(Reduce(`+`,
                                 lapply(lst, function(i) m1[,i]))))
  setnames(res, 2:ncol(res), nm1)
  res}

f1(dt, 2)  
#   Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1:  1                    14                     2                    45
#2:  2                     5                     3                    16
#3:  3                     7                     3                    46
#4:  4                     0                     6                    55
#5:  5                     5                     7                     6
#6:  6                     7                    77                    76
#7:  7                    16                    45                    67
#8:  8                     9                     1                    12
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

使用其他“步骤”进行测试

dt1 <- data.table(Id = c(1,2,3,4,5,6,7,8),
"01.01.2005" =   c(10,0,0,0,5,7,7,0),
"02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0), 
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2), 
"06.01.2005" = c(45,15,45,54,5,76,67,10),
"07.01.2005" = c(0,0,0,0,466,21,832,54), 
"08.01.2005" = c(2,3,734,2,3,4,5,6),
"09.01.2005"= c(4, 3, 3, 5, 7, 9, 10, 11))

f1(dt1, 3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1:  1                               16                               45
#2:  2                                8                               16
#3:  3                               10                               46
#4:  4                                6                               55
#5:  5                               12                                6
#6:  6                               84                               76
#7:  7                               61                               67
#8:  8                                9                               13
#   07.01.2005_08.01.2005_09.01.2005
#1:                                6
#2:                                6
#3:                              737
#4:                                7
#5:                              476
#6:                               34
#7:                              847
#8:                               71

更新

另一种选择是使用setdata.table 新函数f2将不依赖于数据集中的列数。 如果step <- 3 f1将在 'dt' 上失败。 f2 ,如果要用于计算的列的数目是“8”与步骤为“3”,它获取rowSums第一3, rowSums的4:6,和rowSums的7:8。 在示例中,不清楚 OP 如何在这些情况下sum 同样对于step <- 5 ,我们得到第 1 5 列的rowSums ,然后是6:8

f2 <- function(DT, step){
  indx <- 2:ncol(DT)
  indx1 <-  (seq_along(indx)-1)%/%step +1L
  lst <- split(indx, indx1)
  nm1 <- sapply(lst, function(i) paste(names(DT)[i], collapse="_")) 
  res <- as.data.table(matrix(0, ncol=length(lst),
               nrow=nrow(dt), dimnames=list(NULL, nm1)))

  for(j in seq_along(res)){
   set(res, i=NULL, j=j, value=DT[, lst[[j]], with=FALSE][, Reduce(`+`, .SD)])
   }
  cbind(Id=DT[['Id']], res)
  } 

f2(dt,2)
#   Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1:  1                    14                     2                    45
#2:  2                     5                     3                    16
#3:  3                     7                     3                    46
#4:  4                     0                     6                    55
#5:  5                     5                     7                     6
#6:  6                     7                    77                    76
#7:  7                    16                    45                    67
#8:  8                     9                     1                    12
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

f2(dt,3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1:  1                               16                               45
#2:  2                                8                               16
#3:  3                               10                               46
#4:  4                                6                               55
#5:  5                               12                                6
#6:  6                               84                               76
#7:  7                               61                               67
#8:  8                                9                               13
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

f2(dt,5)
#  Id 01.01.2005_02.01.2005_03.01.2005_04.01.2005_05.01.2005
#1:  1                                                     16
#2:  2                                                      9
#3:  3                                                     11
#4:  4                                                      7
#5:  5                                                     13
#6:  6                                                     84
#7:  7                                                     61
#8:  8                                                     12
#   06.01.2005_07.01.2005_08.01.2005
#1:                               47
#2:                               18
#3:                              779
#4:                               56
#5:                              474
#6:                              101
#7:                              904
#8:                               70

使用稍大的数据集

set.seed(24)
dt <- as.data.table(matrix(sample(0:9, 5000*5000, replace=TRUE), 
     ncol=5000))
dt <- cbind(Id=1:5000, dt)
dim(dt)
#[1] 5000 5001
system.time(f2(dt, 4))
#   user  system elapsed 
#  2.269   0.022   2.291 

基本上与 David Arenburg 的答案相同,但可能更易读一点是首先使用优化的.GRP运算符从data.table获取组号,然后对步骤取模以获取索引。

step <- 2
temp <- melt(dt, "Id")
temp[, group := .GRP + step-1L, variable]
temp[, indx := group %/% step]
dcast(temp, Id ~ indx, sum, value.var = "value")

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM