简体   繁体   中英

Different aggregation rules with data.table in r

I have a large data frame and I want to aggregate it according to two different ids. Different columns have different aggregation rules, and I want to write a compact code to do the aggregation (there are also many useless variables that I don't need in the final dataset). I did a toy example aggregating my data with dplyr::group_by:

n=10
df <- data.frame(id1 = sample(c("a","b"),n,T),id2 = sample(c("c","d"),n,T), # variables with IDs
                 var_sum1 = rnorm(n,0,1),var_sum2 = rnorm(n,5,1),           # variables to sum
                 var_mean1 = rnorm(n,10,1), var_mean2 = rnorm(n,15,1),      # variables to average
                 var_weighted_mean = rnorm(n,0,1),                          # vars to weight average
                 weight = sample(c(1,2),n,T),                               # weight
                 var_useless_1 = 1,var_useless_n = 1)                       # useless variables to throw away


final_dplyr <- df %>%
  group_by(id1, id2) %>%
  summarise(var_sum1 = sum(var_sum1),
            var_sum2 = sum(var_sum2),
            var_mean1 = mean(var_mean1),
            var_mean2 = mean(var_mean2),
            var_weighted_mean = weighted.mean(var_weighted_mean,weight))

Now, I want to define in a vector the variables that will follow each rule:

ids <- c("id1","id2")
summing = c("var_sum1","var_sum2")
averaging = c("var_mean1","var_mean2")
wght_avergage = c("var_weighted_mean")

Each of this vectors will contain the names of more or less 20 variables, so aggregating it "by hand" like I did with the dplyr toy-example would be kind of anoying.

Can I implement it with the data.table package? Other solutions are also welcome, but as I'm learning this package now, I would really appreciate a solution with data.table.

I thought about something like this (but as I am new to data.table, it might be completely wrong):

dt <- as.data.table(df)

# line not working
dt[ , .(summing, averaging, wght_average) := list(lapply(.SD[,.(summing)],sum),
                                               lapply(.SD[,.(averaging)],mean),
                                               lapply(.SD[,.(wght_average)],function(x)weighted.mean(x,weight))), 
    by = .(ids), 
    .SDcols = .(summing, averaging, wght_average)]

Thanks for any help!

You can use that general syntax, just a few changes (1) you're creating a new data frame (with columns whose length doesn't equal nrow(df) ), so you don't need the := and the part before it (2) you can use mget to get a list of columns to lapply over from a character vector (3) use c to concatenate lists together, rather than list which creates sublists.

ids <- c("id1","id2")
summing = c("var_sum1","var_sum2")
averaging = c("var_mean1","var_mean2")
wght_average = c("var_weighted_mean")


df[ ,  c(lapply(mget(summing), sum), 
         lapply(mget(averaging), mean), 
         lapply(mget(wght_average), weighted.mean, weight)), 
    by = c(ids)]

#    id1 id2   var_sum1  var_sum2 var_mean1 var_mean2 var_weighted_mean
# 1:   a   c -0.4091754 19.469144 10.181026  15.29206        0.06766247
# 2:   a   d -0.9797636  4.884255  8.856079  15.36002        1.43762082
# 3:   b   c -3.0569705 15.284160 10.021045  14.94577       -0.72186913
# 4:   b   d -0.4616429 10.076022  8.442672  15.09100        0.13813689

A possible tidyverse solution is to store the rules in a tibble

library(tidyverse)

ids = c("id1","id2")
do_over <- 
  list(
    summing = c("var_sum1","var_sum2"),
    averaging = c("var_mean1","var_mean2"),
    wght_average = c("var_weighted_mean"))
do_what <- 
  list(
    summing = sum,
    averaging = mean,
    wght_average = ~weighted.mean(., weight))

todo <- tibble(do_over, do_what)

todo
# # A tibble: 3 x 2
#   do_over      do_what     
#   <named list> <named list>
# 1 <chr [2]>    <fn>        
# 2 <chr [2]>    <fn>        
# 3 <chr [1]>    <formula>   

Then pmap over the tibble to get your output

pmap_dfc(todo, ~
           df %>% 
            group_by_at(ids) %>% 
            summarise_at(.x, .y))

# # A tibble: 3 x 11
# # Groups:   id1 [2]
#   id1   id2   var_sum1 var_sum2 id11  id21  var_mean1 var_mean2 id12  id22  var_weighted_mean
#   <fct> <fct>    <dbl>    <dbl> <fct> <fct>     <dbl>     <dbl> <fct> <fct>             <dbl>
# 1 a     c        0.152     4.90 a     c          9.04      15.1 a     c                 0.294
# 2 a     d        2.74     16.0  a     d         10.0       14.8 a     d                -0.486
# 3 b     c       -0.112    23.6  b     c         10.2       14.5 b     c                 0.421

In dplyr , you can use the _at variants which can accept column names as strings so that you don't have to repeat the functions

library(dplyr)

df %>%
  group_by_at(ids) %>%
  mutate_at(summing, sum) %>%
  mutate_at(averaging, mean) %>%
  mutate_at(wght_avergage, ~weighted.mean(., weight)) %>%
  slice(1L) %>%
  select(summing, averaging, wght_avergage)

#  id1   id2   var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
#  <fct> <fct>    <dbl>    <dbl>     <dbl>     <dbl>             <dbl>
#1 a     c       -0.840     9.87      9.76      13.9            0.308 
#2 a     d        3.27     14.4       9.66      15.8            0.275 
#3 b     c       -0.408    18.5       8.82      14.8            0.0450
#4 b     d        1.29      4.85     10.3       15.4           -0.521 

This gives same output as final_dplyr .

final_dplyr

#  id1   id2   var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
#  <fct> <fct>    <dbl>    <dbl>     <dbl>     <dbl>             <dbl>
#1 a     c       -0.840     9.87      9.76      13.9            0.308 
#2 a     d        3.27     14.4       9.66      15.8            0.275 
#3 b     c       -0.408    18.5       8.82      14.8            0.0450
#4 b     d        1.29      4.85     10.3       15.4           -0.521 

We can also make use of map2 from purrr to do this

library(dplyr)
library(purrr)
fns <- list(sum, mean, partial(weighted.mean, weight = weight))
map2(list(df[3:4], df[5:6], df[7:8]), fns,
   ~  bind_cols(.x, df %>% 
          select(id1, id2))  %>% 
         group_by(id1, id2) %>%
         summarise_at(vars(-group_cols()), .y)) %>% 
  reduce(inner_join, by = c('id1', 'id2')) %>%
  select(-weight)
# A tibble: 4 x 7
# Groups:   id1 [2]
#  id1   id2   var_sum1 var_sum2 var_mean1 var_mean2 var_weighted_mean
#  <fct> <fct>    <dbl>    <dbl>     <dbl>     <dbl>             <dbl>
#1 a     c       -0.840     9.87      9.76      13.9             0.308
#2 a     d        3.27     14.4       9.66      15.8             0.511
#3 b     c       -0.408    18.5       8.82      14.8             0.390
#4 b     d        1.29      4.85     10.3       15.4            -0.521

Or using Map from base R

Reduce(function(...) merge(..., by = c('id1', 'id2')), 
     Map(function(fn, dat)  aggregate(.~ id1 + id2, 
        cbind(dat, df[c('id1', 'id2')]), fn), 
      list(sum, mean, weighted.mean), list(df[3:4], df[5:6], df[7:8])))[-8]

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM