I am trying to build every possible combination of the values in several columns and aggregate the totals for each combination. Below is the data set:
library(data.table)

# Example input: three key columns (A, AB, ABC) and two numeric totals.
# The totals are named Total_S / Total_B because a bare column name
# containing a space (`Total S = ...`) is a syntax error, and the join code
# further down refers to these columns as Total_S and Total_B.
data <- data.table(
  A = c("X", "Z", "X"),
  AB = c("W", "W", "Y"),
  ABC = c("A", "B", "B"),
  Total_S = c(200, 300, 50),
  Total_B = c(512, 600, 300)
)
I want every combination of the values in columns A, AB and ABC in the first three columns, followed by two aggregate columns: Total S and Total B / Total S.
Below is the needed output.
Any lead will be highly appreciated.
Here is an option using CJ:
# Cross join every key value plus "" (standing for "no value in this
# column") and look each combination up in `data`. With by = .EACHI, j is
# evaluated once per combination; combinations that have no matching row
# yield NA totals, which nafill() replaces with 0.
# The input columns are Total_S / Total_B (the names used by the rest of the
# code); the output columns keep the names TotalS / TotalB_S.
ans <- data[
  CJ(A = c("", A), AB = c("", AB), ABC = c("", ABC), unique = TRUE),
  on = .(A, AB, ABC),
  by = .EACHI,
  {
    total_s <- nafill(Total_S, "const", 0)
    list(
      TotalS = total_s,
      TotalB_S = total_s + nafill(Total_B, "const", 0)
    )
  }
]
# Keep only the combinations with at most one blank key column. A logical
# filter is used instead of negative row indices because `ans[-idx]` returns
# zero rows when `idx` is empty.
ans[rowSums(ans[, .(A, AB, ABC)] == "") < 2L]
output:
    A AB ABC TotalS TotalB_S
 1:    W   A      0        0
 2:    W   B      0        0
 3:    Y   A      0        0
 4:    Y   B      0        0
 5: X      A      0        0
 6: X      B      0        0
 7: X  W          0        0
 8: X  W   A    200      712
 9: X  W   B      0        0
10: X  Y          0        0
11: X  Y   A      0        0
12: X  Y   B     50      350
13: Z      A      0        0
14: Z      B      0        0
15: Z  W          0        0
16: Z  W   A      0        0
17: Z  W   B    300      900
18: Z  Y          0        0
19: Z  Y   A      0        0
20: Z  Y   B      0        0
Another cleaner approach with less typing of column names:
# Key columns whose values are combined.
cols <- c("A", "AB", "ABC")

# Cross join of the unique values of each key column, with "" added to each
# column as the "no value" level.
combi <- data[
  ,
  do.call(CJ, lapply(.SD, function(x) unique(c("", x)))),
  .SDcols = cols
]

# Join the combinations that have at least two non-blank keys back onto
# `data` (natural join on the shared key columns). Combinations without a
# match get NA totals, which nafill() replaces with 0.
# The input columns are Total_S / Total_B (the names used by the rest of the
# code); the output columns keep the names TotalS / TotalB_S.
ans <- data[
  combi[rowSums(combi != "") >= 2L],
  on = .NATURAL,
  {
    total_s <- nafill(Total_S, "const", 0)
    c(
      mget(cols),
      list(
        TotalS = total_s,
        TotalB_S = total_s + nafill(Total_B, "const", 0)
      )
    )
  }
]
Here is almost the same answer as Chinsoon's, but with expand.grid() instead of CJ.
# Build a data.table holding every distinct (A, AB, ABC) combination, with ""
# added to each column's values as an extra "empty" level.
dt <- setDT(unique(expand.grid(
  A = c("", data$A),
  AB = c("", data$AB),
  ABC = c("", data$ABC),
  stringsAsFactors = FALSE
)))
# Start both total columns at zero.
dt[, c("Total_S", "Total_B_S") := list(0, 0)]
# Update join: for combinations present in `data`, overwrite the zeros with
# that row's Total_S and with Total_S + Total_B; then print the result.
dt[
  data,
  on = c("A", "AB", "ABC"),
  c("Total_S", "Total_B_S") := list(i.Total_S, i.Total_S + i.Total_B)
][]
# A AB ABC Total_S Total_B_S
# 1: 0 0
# 2: X 0 0
# 3: Z 0 0
# 4: W 0 0
# 5: X W 0 0
# 6: Z W 0 0
# 7: Y 0 0
# 8: X Y 0 0
# 9: Z Y 0 0
# 10: A 0 0
# 11: X A 0 0
# 12: Z A 0 0
# 13: W A 0 0
# 14: X W A 200 712
# 15: Z W A 0 0
# 16: Y A 0 0
# 17: X Y A 0 0
# 18: Z Y A 0 0
# 19: B 0 0
# 20: X B 0 0
# 21: Z B 0 0
# 22: W B 0 0
# 23: X W B 0 0
# 24: Z W B 300 900
# 25: Y B 0 0
# 26: X Y B 50 350
# 27: Z Y B 0 0
# A AB ABC Total_S Total_B_S
I have no idea why, but expand.grid seems to be marginally faster:
# Unit: milliseconds
# expr min lq mean median uq max neval
# wimpel 3.2360 3.45340 4.281843 3.60015 4.22865 11.4641 100
# chinsoon 5.2775 5.68705 6.365414 5.80085 6.11475 12.5649 100
# Benchmark the two approaches against each other, evaluating each
# expression 100 times (times = 100).
microbenchmark::microbenchmark(
# expand.grid() approach: build the combinations, zero-initialise the
# totals, then fill them in with an update join.
wimpel = {
# create all possible combinations, including empty ones
dt <- setDT( unique( expand.grid( A = c("",data$A), AB = c("",data$AB), ABC = c("",data$ABC ), stringsAsFactors = FALSE ) ))
# initialise the total columns
dt[, `:=`( Total_S = 0, Total_B_S = 0 ) ]
# update join on the totals
dt[ data, `:=`( Total_S = i.Total_S, Total_B_S = i.Total_S + i.Total_B ), on = .(A, AB, ABC) ][]
},
# CJ() approach: join `data` onto the cross join and compute the totals per
# combination (by = .EACHI), replacing NA with 0 via nafill().
chinsoon = {
ans <- data[CJ(A=c("", A), AB=c("", AB), ABC=c("", ABC), unique=TRUE), on=.(A, AB, ABC),
by=.EACHI, .(
TotalS={x <- nafill(Total_S, "const", 0)},
TotalB_S=x + nafill(Total_B, "const", 0))]
},
times = 100)
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.