Efficient functions over specific data.frame columns in a list of data.frames

Question

I have a list of data.frame s. For example

set.seed(1)
my_list <- list()
ids = c("a","b","c","d","e")
for(i in 1:5){
  my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(length(ids)), a = rnorm(length(ids)))
}

What I want is to efficiently compute for each id (in the "id" column) the variance of the "p", "m", "d", and "a" columns over all data frames in the list. Ideally, this would return a data.frame like this (based on the values drawn above):

> result.df
  id     var_p     var_m      var_d     var_a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

Answer 1

Using my_list

library(plyr)
df = do.call(rbind, my_list)
out = ddply(df, .(id), colwise(var, c('p','m','d','a')))

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or base R alternative, using the combination of lapply and apply

df = do.call(rbind, my_list)
df1 = do.call(rbind, 
      lapply(split(df, df$id), 
      function(x) apply(subset(x, select = c(p,m,d,a)), 2, var)))

out = transform(df1, id = row.names(df1))

#> out
#          p         m          d         a id
#a 0.2371569 1.7810729 0.08264279 0.5074250  a
#b 0.1091675 0.2107997 1.15051229 1.1578691  b
#c 0.5385789 0.7650123 0.44215343 0.3137903  c
#d 1.0174542 0.7818498 0.06414317 0.6079849  d
#e 0.7343667 1.2870542 1.41615858 0.7362462  e

Or using doBy

library(doBy)
df = do.call(rbind, my_list)
out = summaryBy( p + m + d + a ~ id , data = df, keep.names=TRUE, FUN = var)

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or using sqldf

library(sqldf)
df = do.call(rbind, my_list)
out = sqldf("select id, variance(p), variance(m), 
             variance(d), variance(a) from df group by id")

#> out
#  id variance(p) variance(m) variance(d) variance(a)
#1  a   0.2371569   1.7810729  0.08264279   0.5074250
#2  b   0.1091675   0.2107997  1.15051229   1.1578691
#3  c   0.5385789   0.7650123  0.44215343   0.3137903
#4  d   1.0174542   0.7818498  0.06414317   0.6079849
#5  e   0.7343667   1.2870542  1.41615858   0.7362462

Answer 2

Here is a base R approach

dat <- do.call(rbind,my_list)
aggregate( cbind(p,m,d,a) ~ id, var, data=dat)

which gives

  id         p         m          d         a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

Answer 3

library(data.table)
rbindlist(my_list)[, lapply(.SD, var), by = id, .SDcols = c("p","m","d","a")]
#    id         p         m          d         a
# 1:  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2:  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3:  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4:  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5:  e 0.7343667 1.2870542 1.41615858 0.7362462

Answer 4

Updated to use bind_rows() (more efficient than do.call(rbind,...) per @hadley suggestion)

library(dplyr)
dat <- bind_rows(dat)[,c("id","p","m","d","a")]
dat %>% group_by(id) %>% summarise_each(funs(var))

#   id         p         m          d         a
# 1  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5  e 0.7343667 1.2870542 1.41615858 0.7362462

Efficient functions over specific data.frame columns in a list of data.frames

Question

4 answers

solution1
3 ACCPTED 2015-06-26 17:21:32

solution2
3 2015-06-26 17:27:56

solution3
3 2015-06-26 17:34:20

solution4
2 2015-06-26 17:27:32

Efficient functions over specific data.frame columns in a list of data.frames

Question

4 answers

solution1 3 ACCPTED 2015-06-26 17:21:32

solution2 3 2015-06-26 17:27:56

solution3 3 2015-06-26 17:34:20

solution4 2 2015-06-26 17:27:32

solution1
3 ACCPTED 2015-06-26 17:21:32

solution2
3 2015-06-26 17:27:56

solution3
3 2015-06-26 17:34:20

solution4
2 2015-06-26 17:27:32