在R中觀察多行的Boostrap

Question

我正在嘗試對數據進行引導，但是速度非常慢。 我在Windows筆記本電腦上使用R。 我有一個數據集，其中包含多行以指示觀察值，當我引導數據時，這些行必須保持在一起。 我有一個變量，它指示哪個行屬於哪個主題，稱為VacancyId。 我使用此方法的一種變體，因為我的輸出遵循泊松分布。 我的代碼中真正的瓶頸在於lapply函數：

bootSimFun.can <- function(preddata,opreddata,data) {
# sample by VacancyId because of dependencies
samp <- sample(unique(data$VacancyId), replace=TRUE)
# create bootstrapped data
bdata <- bind_rows(lapply(samp, function(x) data[data$VacancyId == x,]))
# remove NA
bdata <- na.omit(bdata)
# create x with the appropriate variables
x <- bdata[,c('VacancyBankId', 'VacancyFunctionId', 
'VacancyEducationLevelId', 'VacancyProvinceId')]
# make sure the variables are seen as categorical
x$FunctionId <- as.factor(x$VacancyFunctionId)
x$EducationLevel <- as.factor(x$VacancyEducationLevelId)
x$ProvinceId <- as.factor(x$VacancyProvinceId)
x$VacancyBankId <- as.factor(x$VacancyBankId)
# allocate outcome
y <- bdata$CandidatesPerWeek
# create dummy matrix
x.onehot <- model.matrix(~ . + 0, data = x)
# create parameters for the xgboost
xgb_params <- list("objective" = "count:poisson",
                 "eval_metric" = "rmse")
# train model
newmodel <- xgboost(data = x.onehot,
                  label = y,
                  nrounds = 10,
                  params = xgb_params)
# make predictions
bpred <- predict(newmodel,type="response",newdata=preddata)
# make predictions for 60 days
bpred <- bpred*(60/7)
# bind the predictions with the original data
bpredictions <- cbind(opreddata, bpred)
# we are interested in the predictions at campaign level, so we sum up by Function, Education, and Province
# which vacancybanks are used is specified in the make.predictions dataframe
aggregate <- aggregate(bpred ~ VacancyFunctionId + VacancyEducationLevelId + 
VacancyProvinceId, data=bpredictions, sum, na.rm=TRUE)
# make sure it is in the right order
aggregate <- aggregate[order(aggregate$VacancyFunctionId, 
aggregate$VacancyEducationLevelId, aggregate$VacancyProvinceId),]
# Generated random numbers based on Poisson distribution with the mean, e.g. 
lambda, equal to the predicted values from refitted models
rpois(length(aggregate$bpred), lambda=aggregate$bpred)
}

尤其是lapply函數確實很慢。 還有其他選擇嗎？

我數據的一小部分：

structure(list(VacancyId = structure(c(4L, 3L, 6L, 7L, 3L, 6L, 
6L, 7L, 3L, 4L, 7L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 5L, 5L), .Label = c("57772", 
"57775", "57818", "57820", "57821", "57822", "57871"), class = "factor"), 
VacancyBankId = structure(c(2L, 1L, 2L, 2L, 3L, 1L, 3L, 3L, 
2L, 4L, 6L, 1L, 3L, 7L, 3L, 5L, 1L, 4L, 1L, 2L), .Label = c("2", 
"17", "147", "257", "991", "1565", "1609"), class = "factor"), 
VacancyFunctionId = structure(c(2L, 1L, 1L, 2L, 1L, 1L, 1L, 
2L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 2L, 2L), .Label = c("3", 
"4", "5", "11"), class = "factor"), VacancyEducationLevelId = structure(c(2L, 
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 2L, 2L, 2L), .Label = c("4", "6", "8"), class = "factor"), 
VacancyProvinceId = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("19", 
"21"), class = "factor"), CandidatesPerWeek = c(0, 2, 0, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)), 
.Names = c("VacancyId", "VacancyBankId", "VacancyFunctionId", "VacancyEducationLevelId", "VacancyProvinceId", "CandidatesPerWeek"), 
row.names = c(3L, 9L, 10L, 19L, 20L, 26L, 27L, 33L, 37L, 38L, 56L, 57L, 58L, 69L, 70L, 72L, 73L, 122L, 125L, 128L), class = "data.frame")

結果是CandidatesPerWeek。 VacancyBankId，VacancyFunctionId，VacancyEducationLevelId和VacancyProvinceId是輸入。

Answer 1

我將其轉換為data.table ，將其轉換為因子，對因子進行采樣並根據它們對表進行重新排序，我的采樣數據和由數據構成的100萬行表的運行速度要快5倍以上。

我使用以下代碼：

data2 <- copy(data)
data2 <- setDT(data2)
data2$VacancyId <- factor(data2$VacancyId,sample(levels(data2VacancyId),replace=TRUE))
setorder(data2,VacancyId)

這是基准

data2 <- copy(data)
data2 <- setDT(data2)
data2$VacancyId <- factor(data2$VacancyId)

f1 <- function(data){
  samp <- sample(unique(data$VacancyId), replace=TRUE)
  bdata <- bind_rows(lapply(samp, function(x) data[data$VacancyId == x,]))
  bdata
}

f2 <- function(data2){
  data2$VacancyId <- factor(data2$VacancyId,sample(levels(data2$VacancyId),replace=TRUE))
  setorder(data2,VacancyId)
  data2
}

library(microbenchmark)
microbenchmark(f1(data),f2(data2),times=1000)
# Unit: microseconds
#      expr      min        lq      mean    median       uq      max neval
#  f1(data) 2193.213 2406.9770 2616.2763 2492.2700 2591.189 21471.67  1000
# f2(data2)  308.261  372.3195  452.6593  409.4805  450.889 18877.83  1000

快大約5倍，讓我們檢查更大的數據（從您的示例中復制了100萬行）

set.seed(1)
big_data <- data[sample(1:nrow(data),1000000,replace=TRUE),]
big_data2 <- copy(big_data)
big_data2 <- setDT(big_data2)
big_data2$VacancyId <- factor(big_data2$VacancyId)
library(microbenchmark)
microbenchmark(f1(big_data),f2(big_data2),times=50)

# Unit: milliseconds
# expr       min       lq     mean   median       uq      max neval
# f1(big_data) 525.38332 556.3378 598.5809 570.0738 592.2092 899.6736    10
# f2(big_data2)  61.43292  66.3120 124.4283 107.0262 123.4962 374.3961    10

仍然快5倍

Answer 2

找到了！

 samp <- sample(unique(d.9weeks$VacancyId), replace = TRUE)
 datDT <- as.data.table(d.9weeks)
 setkey(datDT, "VacancyId")
 # create bootstrapped data
 bdata <- datDT[J(samp), allow.cartesian = TRUE]

在R中觀察多行的Boostrap

問題描述

2 個解決方案

解決方案1
0 2017-07-20 08:00:22

解決方案2
0 2017-07-20 12:39:15

在R中觀察多行的Boostrap

問題描述

2 個解決方案

解決方案1 0 2017-07-20 08:00:22

解決方案2 0 2017-07-20 12:39:15

解決方案1
0 2017-07-20 08:00:22

解決方案2
0 2017-07-20 12:39:15