简体   繁体   中英

Working with pairs of related columns (dplyr, tidyr, data.table)

I have a data frame of multiple pairs of estimates and variances for several model parameters each within one of a number of sections. Here's a function that generates the illustrative sample:

samplerats <- function(){
    set.seed(310366)
    d = data.frame(section=c(rep("S1",10),rep("S2",10),rep("S3",5)))
    nr = nrow(d)
    for(i in 1:5){
        d[[paste0("est_v",i)]] = rnorm(nr)
        d[[paste0("var_v",i)]] = runif(nr)
    }
    d
}

and here's the start of what you get:

> d=samplerats()
> head(d)
  section     est_v1    var_v1     est_v2     var_v2      est_v3    var_v3
1      S1  0.3893008 0.1620882 -1.1915391 0.15439565  0.62022284 0.5487519
2      S1  0.8221099 0.3280630  0.7729817 0.14810283 -1.11337584 0.9947342
3      S1  0.8023230 0.1862810 -1.5285389 0.85648574 -1.74666907 0.4267944
4      S1 -0.2252865 0.5660111 -0.4348341 0.53013027  0.01823185 0.1379821
5      S1 -0.9475335 0.7904085 -1.0882961 0.40567780  1.69607397 0.3450983
6      S1  0.4415259 0.2969032  0.9200723 0.08754107  0.57010457 0.7579002
[with another two variables and 25 rows in total]

The task is to compute the ratio the variance of the estimates for each parameter with the mean of the variance for each parameter, grouped by section.

So for example, for variable v1, crudely just to get the numbers out:

> d %>% group_by(section) %>% summarise(var(est_v1)/mean(var_v1))
Source: local data frame [3 x 2]

  section var(est_v1)/mean(var_v1)
1      S1                0.5874458
2      S2                2.4449153
3      S3                2.8621725

That gives us the answer for v1 , we just need to repeat for all the other variables. Note that the column names are est_ or var_ followed by a variable name which might be alpha or g2 or some other alphanum.

Of course I have a horrendous solution:

ratit <- function(d){
    isVAR <- function(s){stringr::str_sub(s,1,4)=="var_"}

    spreads = reshape2::melt(d) %>% mutate(isVAR=isVAR(variable), Variable = str_replace(variable,"^.*_",""))
    vout = spreads %>% group_by(Variable, section, isVAR) %>% summarise(Z=if(isVAR(variable[1])){mean(value)}else{var(value)})
    ratios = vout %>% group_by(section, Variable) %>% summarise(Vratio = Z[1]/Z[2]) %>% dcast(section ~ Variable)
    ratios
}

which gives:

> ratit(d)
Using section as id variables
Using Vratio as value column: use value.var to override.
  section        v1       v2       v3        v4       v5
1      S1 0.5874458 3.504169 3.676488 1.1716684 1.742021
2      S2 2.4449153 1.177326 1.106337 1.0700636 3.263149
3      S3 2.8621725 2.216099 3.846062 0.7777452 2.122726

where you can see the first column is the same as the v1 -only example earlier. But yuck.

If I can melt, cast, dplyr or otherwise tidyr it up into this format:

         est       var section  variable
1  0.3893008 0.1620882      S1        v1
2  0.8221099 0.3280630      S1        v1
3  0.8023230 0.1862810      S1        v1
4 -0.2252865 0.5660111      S1        v1
5 -0.9475335 0.7904085      S1        v1
6  0.4415259 0.2969032      S1        v1

then its trivial - dd %>% group_by(section, variable) %>% summarise(rat=var(est)/mean(var)) %>% spread(variable, rat)

But that step eludes me...

Neat solutions welcome, using anything including base R, dplyr, tidyr, data.table etc.

This should do the trick:

dd <- reshape(d, varying = 2:11, direction = 'long', sep="_", timevar="variable")

head(dd)
#      section variable        est       var id
# 1.v1      S1       v1  0.3893008 0.1620882  1
# 2.v1      S1       v1  0.8221099 0.3280630  2
# 3.v1      S1       v1  0.8023230 0.1862810  3
# 4.v1      S1       v1 -0.2252865 0.5660111  4
# 5.v1      S1       v1 -0.9475335 0.7904085  5
# 6.v1      S1       v1  0.4415259 0.2969032  6

Here is one solution with base R using mapply

est <- d[grep('^est|section', colnames(d))]
var1 <- d[grep('^var|section', colnames(d))]
lstest <- split(est[-1], est$section)
lstvar <- split(var1[-1], var1$section)

res <- t(mapply(function(x,y) mapply(function(.x, .y) 
          var(.x)/mean(.y), x, y), lstest, lstvar))

Or using dplyr

 est1 <- est %>% 
          group_by(section) %>%
          summarise_each(funs(var)) %>% 
          data.frame()

 var2 <- var1 %>%
            group_by(section) %>% 
            summarise_each(funs(mean)) %>% 
             data.frame()

 est1[-1]/var2[-1]

Benchmarks

data

samplerats <- function(){
  set.seed(310366)
  d <- data.frame(section=sample(paste0("S", 1:20), 
                                    1e5, replace=TRUE))
  nr <- nrow(d)
  for(i in 1:20){
   d[[paste0('est_v', i)]] <- rnorm(nr)
   d[[paste0('var_v', i)]] <- runif(nr)
   }
  d
}
d <- samplerats()    

Functions

akrun <- function(){
     est1 <- d %>% 
               group_by(section) %>%
               summarise_each(funs(var), starts_with('est')) 
     var1 <- d %>%
               group_by(section) %>% 
               summarise_each(funs(mean), starts_with('var') )
      cbind(unique(est1[1]), est1[-1]/var1[-1])
   }

  #Assuming that the `reshaped` dataset from @Josh O'Brien's method 
  #is further processed using `spread` from `tidyr`

  josh <- function(){
     dd <- reshape(d, varying = 2:ncol(d), direction = 'long', 
             sep="_", timevar="variable")
     dd %>%
        group_by(section, variable) %>%
        summarise(rat=var(est)/mean(var)) %>%
        spread(variable, rat)
      }

   #Using `data.table` for @Henriks' method as the output from
   # `merged.stack is `data.table`

  henrik <- function(){
      dS <- merged.stack(data = getanID(d, "section"), 
         var.stubs = c("est", "var"), sep = "_")
       dcast.data.table(dS[ , list(rat=var(est)/mean(var)),
        .(section, .time_1)], section~.time_1, value.var='rat')
        }    

  DMC <- function(){       
           d %>%
           gather(key, value, -section) %>%
           separate(key, into = c("type", "var")) %>%
           group_by(section, var) %>%
           summarise(result = var(value[type == "est"]) / mean(value[type == "var"]))%>%
           spread(var, result) 
          }  

benchmarks

library(microbenchmark)
microbenchmark(akrun(), josh(), henrik(), DMC(), unit='relative', times=20L)
#Unit: relative
# expr       min        lq      mean    median        uq       max neval
#akrun()   1.00000   1.00000   1.00000   1.00000   1.00000   1.00000    20
# josh() 323.57129 335.51929 315.05115 312.02953 293.18614 308.30833    20
#henrik() 30.02737  33.95731  32.15254  34.72281  29.55944  35.26825    20
#DMC()   204.66445 211.82019 207.47286 201.33015 207.10875 231.24048    20
# cld
# a   
# d
# b  
# c 

@alexis_laz's solution came a bit late. Here is the system.time

 system.time({cbind(levels(d$section), 
    aggregate(. ~ section, d[c(1, grep("^est_", names(d)))], var)[-1] / 
   aggregate(. ~ section, d[c(1, grep("^var_", names(d)))], mean)[-1])}
 )
#  user  system elapsed 
# 2.228   0.002   2.229 
system.time(akrun())
#   user  system elapsed 
# 0.034   0.000   0.035 

An "etc" solution:

library(splitstackshape)
Reshape(data = d, id.vars = "section", var.stubs = c("est", "var"), sep = "_")
#      section .id time        est        var
#   1:      S1   1    1  0.3893008 0.16208821
#   2:      S1   2    1  0.8221099 0.32806300
#   3:      S1   3    1  0.8023230 0.18628100
#   4:      S1   4    1 -0.2252865 0.56601106
#   5:      S1   5    1 -0.9475335 0.79040846
# ---                                       
# 121:      S3   1    5  0.4823552 0.57649912
# 122:      S3   2    5  0.6624314 0.27159239
# 123:      S3   3    5 -0.7515308 0.09077159
# 124:      S3   4    5 -0.4426505 0.81389532
# 125:      S3   5    5  1.3881995 0.01433167

# or
merged.stack(data = getanID(d, "section"), var.stubs = c("est", "var"), sep = "_")
#      section .id .time_1         est        var
#   1:      S1   1      v1  0.38930083 0.16208821
#   2:      S1   1      v2 -1.19153913 0.15439565
#   3:      S1   1      v3  0.62022284 0.54875189
#   4:      S1   1      v4  0.07671314 0.71301067
#   5:      S1   1      v5  0.53539985 0.86674969
# ---                                           
# 121:      S3   5      v1  0.87184287 0.63119596
# 122:      S3   5      v2  1.26976583 0.50432276
# 123:      S3   5      v3  0.02390527 0.55614582
# 124:      S3   5      v4  0.15269326 0.93073954
# 125:      S3   5      v5  1.38819949 0.01433167

This pipeline with dplyr skips the intermediate table.

library(dplyr)
library(tidyr)

d %>%
  gather(key, value, est_v1:var_v5) %>%
  separate(key, into = c("type", "var")) %>%
  group_by(section, var) %>%
  summarise(
    result = var(value[type == "est"]) / mean(value[type == "var"])
  )

Yet another try:

cbind(levels(d$section), 
      aggregate(. ~ section, d[c(1, grep("^est_", names(d)))], var)[-1] / 
      aggregate(. ~ section, d[c(1, grep("^var_", names(d)))], mean)[-1])
#  levels(d$section)    est_v1   est_v2   est_v3    est_v4   est_v5
#1                S1 0.5874458 3.504169 3.676488 1.1716684 1.742021
#2                S2 2.4449153 1.177326 1.106337 1.0700636 3.263149
#3                S3 2.8621725 2.216099 3.846062 0.7777452 2.122726

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM