I have data-set where the bootstraps were performed such that only values within the two main factors replicate/ level were replaced.
replicate level high.density low.density
1 low 14 36
1 low 54 31
1 mid 82 10
1 mid 24 NA
2 low 12 28
2 low 11 45
2 mid 12 17
2 mid NA 24
2 up 40 10
2 up NA 5
2 up 20 2
##DATA FRAME
df <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2), level = c("low", "low", "mid", "mid", "low", "low", "mid", "mid", "up", "up", "up"), high.density = c(14, 54, 82, 24, 12, 11, 12, NA, 40, NA, 20), low.density = c(36, 31, 10,
NA, 28, 45, 17, 24, 10, 5, 2)), class = c("spec_tbl_df","tbl_df","tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(cols = list(replicate = structure(list(), class = c("collector_double", "collector")), level = structure(list(), class = c("collector_character","collector")), high.density = structure(list(), class = c("collector_double","collector")), low.density = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
df$replicate <- as.factor(as.numeric(df$replicate))
df$level <- as.factor(as.character(df$level))
##Creating the data-set needed for boot. Only values with-in the unique replicate/ level were allowed to shuffle ( Credits: Dion )
df_shuffle <- function(DF) {
my_split <- split(DF, f = ~ DF$replicate + DF$level)
shuffle <- lapply(my_split, function(x) {
nrX <- nrow(x)
cbind(x[, c('replicate', 'level')],
high.density = x[sample(seq_len(nrX), replace = TRUE), 'high.density'],
low.density = x[sample(seq_len(nrX), replace = TRUE), 'low.density'])
})
DF_new <- do.call(rbind, shuffle)
rownames(DF_new) <- NULL
return(DF_new)
}
B <- 1000
df_list <- replicate(B, df_shuffle(df), simplify = FALSE)
df_list <- lapply(df_list, function(x) x[complete.cases(x), ]) #choose complete cases
Now I want to bootstrap over these observations to estimate the coefficient, p-value and confidence interval. I am trying to replicate the boot function like in this example and draw the correct confidence interval like in this example (I only need the overall bootstrapped line and confidence interval)
#A sample code for the plot
df_boot <- rbindlist(df_list, idcol = 'count')
ggplot(aes(x = low.density, y = high.density), data = df_boot) +
stat_smooth(aes(group = factor(count)), geom = "line", method = "lm", se = FALSE, color = "red", alpha=0.02) +
stat_smooth(geom = "line", method = "lm", se = FALSE, color = "black", linetype = "longdash") +
theme(panel.background = element_blank()) + theme(legend.position="none")
We may conduct a non-parametric or parametric bootstrap of the postulated linear model. The important difference between a non-parametric and a parametric bootstrap is that in the non-parametric case, we are going to repeatedly sample from our original dataframe df
, whereas in the parametric case, we simulate new data from our original model .
We postulate the following linear model:
high.density
i = b 0 + b 1 low.density
+ e i .
It is important to first listwise delete rows containing missing observations and then run the bootstrap procedure (even though you'd better multiply impute missing observations in independents...). In the post with the function df_shuffle()
to which you refer in your question was listwise deletion performed after resampling the data. Performing listwise deletion before the resampling ensures that each bootstrap sample has the same number of rows as df
. The is a prerequisite for the bootstrap to work and thus to be able to make valid inference based on the bootstrapping procedure.
The function boot_lm()
allows the user to either perform a non-parametric or parametric bootstrap. It consists of the following arguments:
original_model
: a character string specifying the postulated linear model. original_data
: a character string specifying the dataframe that was used to fit the postulated linear model. type
: a character string specifying whether a parametric ( param
) or non-parametric ( ordinary
) bootstrap should be performed. B
: the number of bootstrap samples to be taken. seed
: an integer to fix the random number generator. # listwise deletion
df <- df[complete.cases(df), ]
# linear model to be bootstrapped
fm0 <- lm(high.density ~ low.density, data = df)
boot_lm <- function(original_data, original_model,
type = c('ordinary', 'param'),
B = 1000L, seed = 1) {
set.seed(seed)
betas_original_model <- coef(original_model)
len_coef <- length(betas_original_model)
mat <- matrix(rep(0L, B * len_coef), ncol = len_coef)
if (type %in% 'ordinary') {
n_rows <- length(residuals(original_model))
for (i in seq_len(B)) {
boot_dat <- original_data[sample(seq_len(n_rows), replace = TRUE), ]
mat[i, ] <- coef(lm(terms(original_model), data = boot_dat))
}
}
if (type %in% 'param') {
X <- model.matrix(delete.response(terms(original_model)),
data = original_data)[, -1L]
for (i in seq_len(B)) {
mat[i, ] <- coef(lm(unlist(simulate(original_model)) ~ X,
data = original_data))
}
}
confints <- matrix(rep(0L, 2L * len_coef), ncol = 2L)
pvals <- numeric(len_coef)
for (i in seq_len(len_coef)) {
pvals[i] <- mean(abs(mat[, i] - mean(mat[, i])) > abs(betas_original_model[i]))
confints[i, ] <- quantile(mat[, i], c(.025, 0.975))
}
names(pvals) <- names(betas_original_model)
out <- data.frame(estimate = betas_original_model,
'lwr' = confints[, 1], 'upr' = confints[, 2],
p_value = pvals)
return(out)
}
Output: your data
# non-parametric bootstrap
ordinary <- boot_lm(original_data = df, original_model = fm0,
type = 'ordinary', B = 1e4)
> ordinary
estimate lwr upr p_value
(Intercept) 45.1522806 16.290080 88.6969733 0.0220
low.density -0.6492639 -2.055204 0.5368766 0.2792
# --------------------------------------------------------
# parametric bootstrap
param <- boot_lm(original_data = df, original_model = fm0,
type = 'param', B = 1e4)
> param
estimate lwr upr p_value
(Intercept) 45.1522806 10.472075 79.1197394 0.0103
low.density -0.6492639 -1.971258 0.6381189 0.3245
Output: mtcars
# linear model to be bootstrapped
fm1 <- lm(mpg ~ wt + cyl + qsec, data = mtcars)
ordinary <- boot_lm(original_data = mtcars, original_model = fm1,
type = 'ordinary', B = 1e4)
> ordinary
estimate lwr upr p_value
(Intercept) 29.4290521 13.8283579 41.2637258 0.0009
wt -3.8616401 -6.6867159 -2.0884969 0.0084
cyl -0.9277487 -1.9447741 0.4831599 0.1174
qsec 0.4944817 -0.1141793 1.3369213 0.1825
It sounds like one of the big asks is to be able to plot the bootstrapped results. Here is one option:
First, make the data
df <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2), level = c("low", "low", "mid", "mid", "low", "low", "mid", "mid", "up", "up", "up"), high.density = c(14, 54, 82, 24, 12, 11, 12, NA, 40, NA, 20), low.density = c(36, 31, 10,
NA, 28, 45, 17, 24, 10, 5, 2)), class = c("spec_tbl_df","tbl_df","tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(cols = list(replicate = structure(list(), class = c("collector_double", "collector")), level = structure(list(), class = c("collector_character","collector")), high.density = structure(list(), class = c("collector_double","collector")), low.density = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
df$replicate <- as.factor(as.numeric(df$replicate))
df$level <- as.factor(as.character(df$level))
Next, make some hypothetical data that moves the variable of interest low.density
across its range. Here we are doing it for all possible combinations of replicate
and level
, but you could also just pick one value for each.
hyp <- expand.grid(replicate = as.factor(c(1,2)),
level=factor(1:3, labels=c("low", "mid", "up")),
low.density = seq(min(df$low.density, na.rm=TRUE),
max(df$low.density, na.rm=TRUE),
length=25))
Then, we do the bootstrapping. In the function below, we draw the data, estimate the model and then generate predictions. If the model or predictions fail, then that particular model is thrown out and another is drawn until you get 1000 valid draws.
res <- NULL
i <- 1
while(i <= 1000){
tmp <- df_shuffle(df)
mod <- try(lm(high.density ~ low.density + replicate + level, data=tmp))
if(!inherits(mod, "try-error")){
pred <- try(predict(mod, newdata=hyp))
if(!inherits(pred, "try-error")){
res <- cbind(res, pred )
i <- i+1
}
}
}
Estimate the original model to get the predicted values
orig <- lm(high.density ~ low.density + level + replicate, data=df)
hyp$fit <- predict(orig, newdata=hyp)
Calculate the quantile confidence intervals for each bootstrapped prediction and add them to the dataset.
cis <- t(apply(res, 1, quantile, c(.025,.975)))
hyp$lwr <- cis[,1]
hyp$upr <- cis[,2]
Finally, make the plot.
ggplot(hyp, aes(x=low.density, y=fit,
ymin = lwr, ymax=upr)) +
geom_ribbon(colour="transparent", alpha=.25) +
geom_line() +
facet_grid(replicate ~ level) +
theme_bw() +
theme(panel.grid=element_blank())
To get smoother bounds, try more bootstrap replicates.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.