I have a large data frame and I want to aggregate it according to two different ids. Different columns have different aggregation rules, and I want to write a compact code to do the aggregation (there are also many useless variables that I don't need in the final dataset). I did a toy example aggregating my data with dplyr::group_by:
n=10
df <- data.frame(id1 = sample(c("a","b"),n,T),id2 = sample(c("c","d"),n,T), # variables with IDs
var_sum1 = rnorm(n,0,1),var_sum2 = rnorm(n,5,1), # variables to sum
var_mean1 = rnorm(n,10,1), var_mean2 = rnorm(n,15,1), # variables to average
var_weighted_mean = rnorm(n,0,1), # vars to weight average
weight = sample(c(1,2),n,T), # weight
var_useless_1 = 1,var_useless_n = 1) # useless variables to throw away
final_dplyr <- df %>%
group_by(id1, id2) %>%
summarise(var_sum1 = sum(var_sum1),
var_sum2 = sum(var_sum2),
var_mean1 = mean(var_mean1),
var_mean2 = mean(var_mean2),
var_weighted_mean = weighted.mean(var_weighted_mean,weight))
Now, I want to define in a vector the variables that will follow each rule:
ids <- c("id1","id2")
summing = c("var_sum1","var_sum2")
averaging = c("var_mean1","var_mean2")
wght_avergage = c("var_weighted_mean")
Each of this vectors will contain the names of more or less 20 variables, so aggregating it "by hand" like I did with the dplyr toy-example would be kind of anoying.
Can I implement it with the data.table package? Other solutions are also welcome, but as I'm learning this package now, I would really appreciate a solution with data.table.
I thought about something like this (but as I am new to data.table, it might be completely wrong):
dt <- as.data.table(df)
# line not working
dt[ , .(summing, averaging, wght_average) := list(lapply(.SD[,.(summing)],sum),
lapply(.SD[,.(averaging)],mean),
lapply(.SD[,.(wght_average)],function(x)weighted.mean(x,weight))),
by = .(ids),
.SDcols = .(summing, averaging, wght_average)]
Thanks for any help!
You can use that general syntax, just a few changes (1) you're creating a new data frame (with columns whose length doesn't equal nrow(df)
), so you don't need the :=
and the part before it (2) you can use mget
to get a list of columns to lapply
over from a character vector (3) use c
to concatenate lists together, rather than list
which creates sublists.
ids <- c("id1","id2")
summing = c("var_sum1","var_sum2")
averaging = c("var_mean1","var_mean2")
wght_average = c("var_weighted_mean")
df[ , c(lapply(mget(summing), sum),
lapply(mget(averaging), mean),
lapply(mget(wght_average), weighted.mean, weight)),
by = c(ids)]
# id1 id2 var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
# 1: a c -0.4091754 19.469144 10.181026 15.29206 0.06766247
# 2: a d -0.9797636 4.884255 8.856079 15.36002 1.43762082
# 3: b c -3.0569705 15.284160 10.021045 14.94577 -0.72186913
# 4: b d -0.4616429 10.076022 8.442672 15.09100 0.13813689
A possible tidyverse solution is to store the rules in a tibble
library(tidyverse)
ids = c("id1","id2")
do_over <-
list(
summing = c("var_sum1","var_sum2"),
averaging = c("var_mean1","var_mean2"),
wght_average = c("var_weighted_mean"))
do_what <-
list(
summing = sum,
averaging = mean,
wght_average = ~weighted.mean(., weight))
todo <- tibble(do_over, do_what)
todo
# # A tibble: 3 x 2
# do_over do_what
# <named list> <named list>
# 1 <chr [2]> <fn>
# 2 <chr [2]> <fn>
# 3 <chr [1]> <formula>
Then pmap over the tibble to get your output
pmap_dfc(todo, ~
df %>%
group_by_at(ids) %>%
summarise_at(.x, .y))
# # A tibble: 3 x 11
# # Groups: id1 [2]
# id1 id2 var_sum1 var_sum2 id11 id21 var_mean1 var_mean2 id12 id22 var_weighted_mean
# <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl>
# 1 a c 0.152 4.90 a c 9.04 15.1 a c 0.294
# 2 a d 2.74 16.0 a d 10.0 14.8 a d -0.486
# 3 b c -0.112 23.6 b c 10.2 14.5 b c 0.421
In dplyr
, you can use the _at
variants which can accept column names as strings so that you don't have to repeat the functions
library(dplyr)
df %>%
group_by_at(ids) %>%
mutate_at(summing, sum) %>%
mutate_at(averaging, mean) %>%
mutate_at(wght_avergage, ~weighted.mean(., weight)) %>%
slice(1L) %>%
select(summing, averaging, wght_avergage)
# id1 id2 var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a c -0.840 9.87 9.76 13.9 0.308
#2 a d 3.27 14.4 9.66 15.8 0.275
#3 b c -0.408 18.5 8.82 14.8 0.0450
#4 b d 1.29 4.85 10.3 15.4 -0.521
This gives same output as final_dplyr
.
final_dplyr
# id1 id2 var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a c -0.840 9.87 9.76 13.9 0.308
#2 a d 3.27 14.4 9.66 15.8 0.275
#3 b c -0.408 18.5 8.82 14.8 0.0450
#4 b d 1.29 4.85 10.3 15.4 -0.521
We can also make use of map2
from purrr
to do this
library(dplyr)
library(purrr)
fns <- list(sum, mean, partial(weighted.mean, weight = weight))
map2(list(df[3:4], df[5:6], df[7:8]), fns,
~ bind_cols(.x, df %>%
select(id1, id2)) %>%
group_by(id1, id2) %>%
summarise_at(vars(-group_cols()), .y)) %>%
reduce(inner_join, by = c('id1', 'id2')) %>%
select(-weight)
# A tibble: 4 x 7
# Groups: id1 [2]
# id1 id2 var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a c -0.840 9.87 9.76 13.9 0.308
#2 a d 3.27 14.4 9.66 15.8 0.511
#3 b c -0.408 18.5 8.82 14.8 0.390
#4 b d 1.29 4.85 10.3 15.4 -0.521
Or using Map
from base R
Reduce(function(...) merge(..., by = c('id1', 'id2')),
Map(function(fn, dat) aggregate(.~ id1 + id2,
cbind(dat, df[c('id1', 'id2')]), fn),
list(sum, mean, weighted.mean), list(df[3:4], df[5:6], df[7:8])))[-8]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.