簡體   English   中英

從大型data.frame重新采樣

[英]Resampling from a large data.frame

我有這種結構的大data.frame

min.reps <- 1
max.reps <- 3
set.seed(1)
df <- do.call(rbind,lapply(1:100, function(i) {
  reps <- seq(1,as.integer(runif(1,min.reps, max.reps)), 1)
  vals <- runif(length(reps), 0, 100)
  return(data.frame(id=rep(i,length(reps)),rep=reps,val=vals,stringsAsFactors=F))
}))

head(df)

  id rep       val
1  1   1 37.212390
2  2   1 90.820779
3  2   2 20.168193
4  3   1 94.467527
5  3   2 66.079779
6  4   1  6.178627

每個df$id觀測值介於min.repsmax.reps之間( df$val )。 實際上,我有100萬個ID,而不是100個ID。

對於每一個df$id我想補充一個更大的價值,從正態分布抽樣meansdmedianmad在其現有值,分別。

這樣做很簡單:

add.reps <- 1
all.ids <- unique(df$id)

require(dplyr)
new.df <- do.call(rbind, lapply(all.ids, function(i) {
  id.df  <- dplyr::filter(df, id == i)
  add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))

但是我想知道,考慮到我的實際data.frame的尺寸,是否有更快的方法來實現這一data.frame

這應該快得多:

add.reps <- 1
do.call(rbind, lapply(split(df, df$id), function(x) rbind(x, 
         data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps, 
                    val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))

好的,到目前為止:

require(microbenchmark)
microbenchmark(
new.df <- do.call(rbind, lapply(all.ids, function(i) {
  id.df  <- dplyr::filter(df, id == i)
  add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))
)

 new.df <- do.call(rbind, lapply(all.ids, function(i) {     id.df <- dplyr::filter(df, id == i)     add.df <- rbind(id.df, data.frame(id = rep(i, add.reps),          rep = max(id.df$rep) + add.reps, val = rnorm(add.reps,              median(id.df$val), mad(id.df$val)), stringsAsFactors = F)) }))
      min       lq     mean   median       uq      max neval
 212.9906 225.1345 371.9314 260.9686 332.5619 1621.586   100

microbenchmark(
new.df <- do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,
                                                                    data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps,
                                                                               val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
)

 new.df <- do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,      data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) +          add.reps, val = rnorm(add.reps, median(x$val), mad(x$val)),          stringsAsFactors = F))))
      min       lq     mean   median       uq     max neval
 133.8357 135.1846 202.9654 137.2722 160.5121 1401.03   100

我想知道是否還能進一步改善

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM