简体   繁体   中英

Aggregate with combination of multiple columns in R with all possible combination values

I am trying to generate every possible combination of the values in several columns and then aggregate over those combinations. Below is the data set:

# Sample data set: three key columns (A, AB, ABC) and two numeric totals.
# The total columns are named Total_S / Total_B because a bare argument name
# cannot contain a space, and these are the names the update-join and
# benchmark snippets read.
library(data.table)

data <- data.table(
  A       = c("X", "Z", "X"),
  AB      = c("W", "W", "Y"),
  ABC     = c("A", "B", "B"),
  Total_S = c(200, 300, 50),
  Total_B = c(512, 600, 300)
)

I want every combination of the values in columns A, AB and ABC as the first three columns, followed by two aggregate columns: Total S and Total B / Total S.

Below is the needed output.

[Image: screenshot of the expected output — not reproduced here]

Any lead will be highly appreciated.

Here is an option using CJ :

# Cross join (CJ) of the distinct values of A, AB and ABC, with "" added as an
# extra value in every column, then join `data` onto that grid.
# by = .EACHI evaluates j once per grid row; grid rows without a match in
# `data` see NA totals, which nafill() turns into 0.
ans <- data[
  CJ(A = c("", A), AB = c("", AB), ABC = c("", ABC), unique = TRUE),
  on = .(A, AB, ABC),
  by = .EACHI,
  {
    total_s <- nafill(Total_S, type = "const", fill = 0)
    list(
      TotalS   = total_s,
      TotalB_S = total_s + nafill(Total_B, type = "const", fill = 0)
    )
  }
]

# Keep only the rows where fewer than two of the key columns are empty.
# A logical filter is used instead of negative row indices, because
# `ans[-integer(0)]` would return zero rows if nothing had to be dropped.
keep <- ans[, rowSums(.SD == "") < 2L, .SDcols = c("A", "AB", "ABC")]
ans[keep]

output:

    A AB ABC TotalS TotalB_S
 1:    W   A      0        0
 2:    W   B      0        0
 3:    Y   A      0        0
 4:    Y   B      0        0
 5: X      A      0        0
 6: X      B      0        0
 7: X  W          0        0
 8: X  W   A    200      712
 9: X  W   B      0        0
10: X  Y          0        0
11: X  Y   A      0        0
12: X  Y   B     50      350
13: Z      A      0        0
14: Z      B      0        0
15: Z  W          0        0
16: Z  W   A      0        0
17: Z  W   B    300      900
18: Z  Y          0        0
19: Z  Y   A      0        0
20: Z  Y   B      0        0

Another cleaner approach with less typing of column names:

# Key columns are listed once and reused below.
cols <- c("A", "AB", "ABC")

# All combinations of the distinct values of each key column, with "" added
# as an extra value in every column.
combi <- data[,
  do.call(CJ, lapply(.SD, function(x) unique(c("", x)))),
  .SDcols = cols
]

# Keep the combinations with at least two non-empty key columns, then join
# `data` onto them. Combinations without a match in `data` get NA totals,
# which nafill() replaces with 0.
wanted <- combi[rowSums(combi != "") >= 2L]
ans <- data[wanted, on = .NATURAL, {
  total_s <- nafill(Total_S, type = "const", fill = 0)
  c(
    mget(cols),
    list(
      TotalS   = total_s,
      TotalB_S = total_s + nafill(Total_B, type = "const", fill = 0)
    )
  )
}]

Here is almost the same answer as Chinsoon's, but with expand.grid() instead of CJ.

# Create a data.table with every combination of the distinct values of A, AB
# and ABC, including the empty string "" as an extra value in each column.
# Each column is de-duplicated before the cross product is built, so the full
# grid of repeated values is never materialised; the resulting rows and their
# order are the same as de-duplicating the whole grid afterwards.
dt <- setDT(expand.grid(
  A   = unique(c("", data$A)),
  AB  = unique(c("", data$AB)),
  ABC = unique(c("", data$ABC)),
  KEEP.OUT.ATTRS   = FALSE,
  stringsAsFactors = FALSE
))

# Initialise the total columns; combinations that do not occur in `data`
# keep these zeros.
dt[, `:=`(Total_S = 0, Total_B_S = 0)]

# Update join: copy the totals from `data` into the matching combinations.
# NOTE(review): if `data` held several rows with the same A/AB/ABC key, this
# would keep a single row's values rather than summing them; confirm the
# keys are unique.
dt[
  data,
  `:=`(Total_S = i.Total_S, Total_B_S = i.Total_S + i.Total_B),
  on = .(A, AB, ABC)
][]

#     A AB ABC Total_S Total_B_S
#  1:                0         0
#  2: X              0         0
#  3: Z              0         0
#  4:    W           0         0
#  5: X  W           0         0
#  6: Z  W           0         0
#  7:    Y           0         0
#  8: X  Y           0         0
#  9: Z  Y           0         0
# 10:        A       0         0
# 11: X      A       0         0
# 12: Z      A       0         0
# 13:    W   A       0         0
# 14: X  W   A     200       712
# 15: Z  W   A       0         0
# 16:    Y   A       0         0
# 17: X  Y   A       0         0
# 18: Z  Y   A       0         0
# 19:        B       0         0
# 20: X      B       0         0
# 21: Z      B       0         0
# 22:    W   B       0         0
# 23: X  W   B       0         0
# 24: Z  W   B     300       900
# 25:    Y   B       0         0
# 26: X  Y   B      50       350
# 27: Z  Y   B       0         0
#     A AB ABC Total_S Total_B_S     

benchmarks

I am not sure why, but expand.grid() seems to be marginally faster.

# Unit: milliseconds
#     expr    min      lq     mean  median      uq     max neval
#   wimpel 3.2360 3.45340 4.281843 3.60015 4.22865 11.4641   100
# chinsoon 5.2775 5.68705 6.365414 5.80085 6.11475 12.5649   100

# Time the two approaches against the same `data`, 100 runs each.
# Note: the two expressions do not produce the same result. `wimpel` keeps
# every combination, while `chinsoon` is timed without any follow-up
# filtering of rows.
microbenchmark::microbenchmark(
  wimpel = {
# Create all possible combinations, including empty ones.
dt <- setDT( unique( expand.grid( A = c("",data$A), AB = c("",data$AB), ABC = c("",data$ABC ), stringsAsFactors = FALSE ) ))
# Initialise the total columns to 0.
dt[, `:=`( Total_S = 0, Total_B_S = 0 ) ]
# Update join: copy the totals from `data` into the matching combinations.
dt[ data, `:=`( Total_S = i.Total_S, Total_B_S = i.Total_S + i.Total_B ), on = .(A, AB, ABC) ][]
},
chinsoon = {
# Join `data` onto the CJ() grid, evaluating j once per grid row (.EACHI);
# nafill() replaces the NA totals of unmatched rows with 0.
ans <- data[CJ(A=c("", A), AB=c("", AB), ABC=c("", ABC), unique=TRUE), on=.(A, AB, ABC), 
            by=.EACHI, .(
              TotalS={x <- nafill(Total_S, "const", 0)}, 
              TotalB_S=x + nafill(Total_B, "const", 0))]
},
times = 100)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM