I have a data frame of multiple pairs of estimates and variances for several model parameters each within one of a number of sections. Here's a function that generates the illustrative sample:
samplerats <- function(){
set.seed(310366)
d = data.frame(section=c(rep("S1",10),rep("S2",10),rep("S3",5)))
nr = nrow(d)
for(i in 1:5){
d[[paste0("est_v",i)]] = rnorm(nr)
d[[paste0("var_v",i)]] = runif(nr)
}
d
}
and here's the start of what you get:
> d=samplerats()
> head(d)
section est_v1 var_v1 est_v2 var_v2 est_v3 var_v3
1 S1 0.3893008 0.1620882 -1.1915391 0.15439565 0.62022284 0.5487519
2 S1 0.8221099 0.3280630 0.7729817 0.14810283 -1.11337584 0.9947342
3 S1 0.8023230 0.1862810 -1.5285389 0.85648574 -1.74666907 0.4267944
4 S1 -0.2252865 0.5660111 -0.4348341 0.53013027 0.01823185 0.1379821
5 S1 -0.9475335 0.7904085 -1.0882961 0.40567780 1.69607397 0.3450983
6 S1 0.4415259 0.2969032 0.9200723 0.08754107 0.57010457 0.7579002
[with another two variables and 25 rows in total]
The task is to compute the ratio the variance of the estimates for each parameter with the mean of the variance for each parameter, grouped by section.
So for example, for variable v1, crudely just to get the numbers out:
> d %>% group_by(section) %>% summarise(var(est_v1)/mean(var_v1))
Source: local data frame [3 x 2]
section var(est_v1)/mean(var_v1)
1 S1 0.5874458
2 S2 2.4449153
3 S3 2.8621725
That gives us the answer for v1
, we just need to repeat for all the other variables. Note that the column names are est_
or var_
followed by a variable name which might be alpha
or g2
or some other alphanum.
Of course I have a horrendous solution:
ratit <- function(d){
isVAR <- function(s){stringr::str_sub(s,1,4)=="var_"}
spreads = reshape2::melt(d) %>% mutate(isVAR=isVAR(variable), Variable = str_replace(variable,"^.*_",""))
vout = spreads %>% group_by(Variable, section, isVAR) %>% summarise(Z=if(isVAR(variable[1])){mean(value)}else{var(value)})
ratios = vout %>% group_by(section, Variable) %>% summarise(Vratio = Z[1]/Z[2]) %>% dcast(section ~ Variable)
ratios
}
which gives:
> ratit(d)
Using section as id variables
Using Vratio as value column: use value.var to override.
section v1 v2 v3 v4 v5
1 S1 0.5874458 3.504169 3.676488 1.1716684 1.742021
2 S2 2.4449153 1.177326 1.106337 1.0700636 3.263149
3 S3 2.8621725 2.216099 3.846062 0.7777452 2.122726
where you can see the first column is the same as the v1
-only example earlier. But yuck.
If I can melt, cast, dplyr or otherwise tidyr it up into this format:
est var section variable
1 0.3893008 0.1620882 S1 v1
2 0.8221099 0.3280630 S1 v1
3 0.8023230 0.1862810 S1 v1
4 -0.2252865 0.5660111 S1 v1
5 -0.9475335 0.7904085 S1 v1
6 0.4415259 0.2969032 S1 v1
then its trivial - dd %>% group_by(section, variable) %>% summarise(rat=var(est)/mean(var)) %>% spread(variable, rat)
But that step eludes me...
Neat solutions welcome, using anything including base R, dplyr, tidyr, data.table etc.
This should do the trick:
dd <- reshape(d, varying = 2:11, direction = 'long', sep="_", timevar="variable")
head(dd)
# section variable est var id
# 1.v1 S1 v1 0.3893008 0.1620882 1
# 2.v1 S1 v1 0.8221099 0.3280630 2
# 3.v1 S1 v1 0.8023230 0.1862810 3
# 4.v1 S1 v1 -0.2252865 0.5660111 4
# 5.v1 S1 v1 -0.9475335 0.7904085 5
# 6.v1 S1 v1 0.4415259 0.2969032 6
Here is one solution with base R
using mapply
est <- d[grep('^est|section', colnames(d))]
var1 <- d[grep('^var|section', colnames(d))]
lstest <- split(est[-1], est$section)
lstvar <- split(var1[-1], var1$section)
res <- t(mapply(function(x,y) mapply(function(.x, .y)
var(.x)/mean(.y), x, y), lstest, lstvar))
Or using dplyr
est1 <- est %>%
group_by(section) %>%
summarise_each(funs(var)) %>%
data.frame()
var2 <- var1 %>%
group_by(section) %>%
summarise_each(funs(mean)) %>%
data.frame()
est1[-1]/var2[-1]
data
samplerats <- function(){
set.seed(310366)
d <- data.frame(section=sample(paste0("S", 1:20),
1e5, replace=TRUE))
nr <- nrow(d)
for(i in 1:20){
d[[paste0('est_v', i)]] <- rnorm(nr)
d[[paste0('var_v', i)]] <- runif(nr)
}
d
}
d <- samplerats()
Functions
akrun <- function(){
est1 <- d %>%
group_by(section) %>%
summarise_each(funs(var), starts_with('est'))
var1 <- d %>%
group_by(section) %>%
summarise_each(funs(mean), starts_with('var') )
cbind(unique(est1[1]), est1[-1]/var1[-1])
}
#Assuming that the `reshaped` dataset from @Josh O'Brien's method
#is further processed using `spread` from `tidyr`
josh <- function(){
dd <- reshape(d, varying = 2:ncol(d), direction = 'long',
sep="_", timevar="variable")
dd %>%
group_by(section, variable) %>%
summarise(rat=var(est)/mean(var)) %>%
spread(variable, rat)
}
#Using `data.table` for @Henriks' method as the output from
# `merged.stack is `data.table`
henrik <- function(){
dS <- merged.stack(data = getanID(d, "section"),
var.stubs = c("est", "var"), sep = "_")
dcast.data.table(dS[ , list(rat=var(est)/mean(var)),
.(section, .time_1)], section~.time_1, value.var='rat')
}
DMC <- function(){
d %>%
gather(key, value, -section) %>%
separate(key, into = c("type", "var")) %>%
group_by(section, var) %>%
summarise(result = var(value[type == "est"]) / mean(value[type == "var"]))%>%
spread(var, result)
}
benchmarks
library(microbenchmark)
microbenchmark(akrun(), josh(), henrik(), DMC(), unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval
#akrun() 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 20
# josh() 323.57129 335.51929 315.05115 312.02953 293.18614 308.30833 20
#henrik() 30.02737 33.95731 32.15254 34.72281 29.55944 35.26825 20
#DMC() 204.66445 211.82019 207.47286 201.33015 207.10875 231.24048 20
# cld
# a
# d
# b
# c
@alexis_laz's solution came a bit late. Here is the system.time
system.time({cbind(levels(d$section),
aggregate(. ~ section, d[c(1, grep("^est_", names(d)))], var)[-1] /
aggregate(. ~ section, d[c(1, grep("^var_", names(d)))], mean)[-1])}
)
# user system elapsed
# 2.228 0.002 2.229
system.time(akrun())
# user system elapsed
# 0.034 0.000 0.035
An "etc" solution:
library(splitstackshape)
Reshape(data = d, id.vars = "section", var.stubs = c("est", "var"), sep = "_")
# section .id time est var
# 1: S1 1 1 0.3893008 0.16208821
# 2: S1 2 1 0.8221099 0.32806300
# 3: S1 3 1 0.8023230 0.18628100
# 4: S1 4 1 -0.2252865 0.56601106
# 5: S1 5 1 -0.9475335 0.79040846
# ---
# 121: S3 1 5 0.4823552 0.57649912
# 122: S3 2 5 0.6624314 0.27159239
# 123: S3 3 5 -0.7515308 0.09077159
# 124: S3 4 5 -0.4426505 0.81389532
# 125: S3 5 5 1.3881995 0.01433167
# or
merged.stack(data = getanID(d, "section"), var.stubs = c("est", "var"), sep = "_")
# section .id .time_1 est var
# 1: S1 1 v1 0.38930083 0.16208821
# 2: S1 1 v2 -1.19153913 0.15439565
# 3: S1 1 v3 0.62022284 0.54875189
# 4: S1 1 v4 0.07671314 0.71301067
# 5: S1 1 v5 0.53539985 0.86674969
# ---
# 121: S3 5 v1 0.87184287 0.63119596
# 122: S3 5 v2 1.26976583 0.50432276
# 123: S3 5 v3 0.02390527 0.55614582
# 124: S3 5 v4 0.15269326 0.93073954
# 125: S3 5 v5 1.38819949 0.01433167
This pipeline with dplyr
skips the intermediate table.
library(dplyr)
library(tidyr)
d %>%
gather(key, value, est_v1:var_v5) %>%
separate(key, into = c("type", "var")) %>%
group_by(section, var) %>%
summarise(
result = var(value[type == "est"]) / mean(value[type == "var"])
)
Yet another try:
cbind(levels(d$section),
aggregate(. ~ section, d[c(1, grep("^est_", names(d)))], var)[-1] /
aggregate(. ~ section, d[c(1, grep("^var_", names(d)))], mean)[-1])
# levels(d$section) est_v1 est_v2 est_v3 est_v4 est_v5
#1 S1 0.5874458 3.504169 3.676488 1.1716684 1.742021
#2 S2 2.4449153 1.177326 1.106337 1.0700636 3.263149
#3 S3 2.8621725 2.216099 3.846062 0.7777452 2.122726
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.