[英]Tidier reshape for list of data.frames (n x 2 data.frames) to single data.frame (n x 3 columns)
[英]Efficient functions over specific data.frame columns in a list of data.frames
我有一个data.frame
的列表。 例如
set.seed(1)
my_list <- list()
ids = c("a","b","c","d","e")
for(i in 1:5){
my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(length(ids)), a = rnorm(length(ids)))
}
我想要的是有效地计算每个id(在“id”列中)列表中所有数据帧上的“p”,“m”,“d”和“a”列的方差。 理想情况下,这将返回一个像这样的data.frame
(基于上面绘制的值):
> result.df
id var_p var_m var_d var_a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
使用my_list
library(plyr)
df = do.call(rbind, my_list)
out = ddply(df, .(id), colwise(var, c('p','m','d','a')))
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或者基础R替代,使用lapply
和apply
的组合
df = do.call(rbind, my_list)
df1 = do.call(rbind,
lapply(split(df, df$id),
function(x) apply(subset(x, select = c(p,m,d,a)), 2, var)))
out = transform(df1, id = row.names(df1))
#> out
# p m d a id
#a 0.2371569 1.7810729 0.08264279 0.5074250 a
#b 0.1091675 0.2107997 1.15051229 1.1578691 b
#c 0.5385789 0.7650123 0.44215343 0.3137903 c
#d 1.0174542 0.7818498 0.06414317 0.6079849 d
#e 0.7343667 1.2870542 1.41615858 0.7362462 e
或者使用doBy
library(doBy)
df = do.call(rbind, my_list)
out = summaryBy( p + m + d + a ~ id , data = df, keep.names=TRUE, FUN = var)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或者使用sqldf
library(sqldf)
df = do.call(rbind, my_list)
out = sqldf("select id, variance(p), variance(m),
variance(d), variance(a) from df group by id")
#> out
# id variance(p) variance(m) variance(d) variance(a)
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
这是一个基础R方法
dat <- do.call(rbind,my_list)
aggregate( cbind(p,m,d,a) ~ id, var, data=dat)
这使
id p m d a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
library(data.table)
rbindlist(my_list)[, lapply(.SD, var), by = id, .SDcols = c("p","m","d","a")]
# id p m d a
# 1: a 0.2371569 1.7810729 0.08264279 0.5074250
# 2: b 0.1091675 0.2107997 1.15051229 1.1578691
# 3: c 0.5385789 0.7650123 0.44215343 0.3137903
# 4: d 1.0174542 0.7818498 0.06414317 0.6079849
# 5: e 0.7343667 1.2870542 1.41615858 0.7362462
更新为使用bind_rows()
(每个@hadley建议比do.call(rbind,...)
更有效)
library(dplyr)
dat <- bind_rows(dat)[,c("id","p","m","d","a")]
dat %>% group_by(id) %>% summarise_each(funs(var))
# id p m d a
# 1 a 0.2371569 1.7810729 0.08264279 0.5074250
# 2 b 0.1091675 0.2107997 1.15051229 1.1578691
# 3 c 0.5385789 0.7650123 0.44215343 0.3137903
# 4 d 1.0174542 0.7818498 0.06414317 0.6079849
# 5 e 0.7343667 1.2870542 1.41615858 0.7362462
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.