Earlier we looked at how we could apply bootstrapping to a linear model. We generated bootstrap standard errors for the OLS coefficient estimates. Here we will apply bootstrapping to the models to ChickWeight
data.
Let \(weight_{ijt}\) be the weight of chick \(i\) in Diet group \(j\) observed in time \(t\). We will apply bootstrapping to the following models;
Model 1: linear time trend \[weight_{ijt} = \alpha_{0} + \beta_{1j}\: time_t + \varepsilon_{ijt}\]
Model 2: linear time trend with Chick fixed effects \[ weight_{ijt} = \alpha_{0} + \beta_{1j}\: time_t + \alpha_i+ \varepsilon_{ijt}\]
Model 3: linear time trend with Chick random effects \[ weight_{ijt} = \alpha_{0} + \beta_{1j}\: time_t + v_{ijt}, \:\: v_{ijt} = \alpha_i+ \varepsilon_{ijt}\]
Load libraries and data.
library(dplyr)
library(ggplot2)
library(broom)
library(tidyr)
library(mosaic)
library(nlme)
library(lme4)
library(knitr)
load("movies2.RData")
getFormula <- function(ols, lmer=FALSE) gsub("()","", ifelse(!lmer, ols$call[2], ols@call[2]))
getDependentVar <- function(ols, lmer=FALSE) {
str <- getFormula(ols, lmer=lmer)
gsub(" ","", substr(str, 1, (regexpr("~",str)[1]-1))) 
}
run_ols_boot <- function(lm_rlt, num_do = 5000) {
# calculate the standard deviation of the residuals
N <- length(lm_rlt$residuals)
sd_res <- (sum(lm_rlt$residuals^2)/lm_rlt$df.residual) %>% sqrt()
dep_var <- getDependentVar(lm_rlt)
do(num_do) *
({
data_bt <- lm_rlt$model
# replace the dependent variable with its bootstrap counterpart
data_bt[[dep_var]] <- lm_rlt$fitted.values + # the predicted component
+ rnorm(N, mean = 0, sd = sd_res) # random draws from the error distribution
# run the OLS model with the same formula but with a new, bootstrap dataset
ols_bt <- lm(as.formula(getFormula(lm_rlt)), data = data_bt)
coef(ols_bt) # get coefficients
})
}
# function generating a matrix of ones
ones <- function(r,c) matrix(c(rep(1,r*c)),nrow=r,ncol=c)
# confidence interval by bootstrapping
# version 1
ci_ver1 <- function(est_bt, alpha = 0.05) {
# est_bt: bootstrap estimates with row = boot replications, col = coefficients
apply(est_bt, 2, function(x) quantile(x, c(alpha/2, 1 - alpha/2))) %>% t()
}
# version 2
ci_ver2 <- function(est_sample, bt_sd, alpha = 0.05) {
# est_semple: sample estimate vector
# bt_sd: bootstrap sd estimates of est_sample-vector
cbind('2.5%' = est_sample - 1.96 * bt_sd,
'97.5%' = est_sample + 1.96 * bt_sd)
}
# bootstrap p-value
bt_p_val <- function(est_sample, est_bt) {
# est_semple: sample estimate vector
# est_bt: bootstrap estimates with row = boot replications, col = coefficients
est_bt_long <- est_bt %>% data.frame() %>% gather()
est_bt_long <- est_bt_long %>%
mutate(
center_var = kronecker(ones(nrow(est_bt),1), matrix(est_sample, nrow=1)) %>% c(),
extremes = abs(value - center_var) >= abs(center_var)
)
p_val <- est_bt_long %>%
group_by(key) %>%
summarise(p_val = sum(extremes)/nrow(est_bt))
return(list(p_val=p_val, df_long=est_bt_long))
}
# histogram visualization
coeff_bt_histogram <- function(est_sample, est_bt, centering =FALSE) {
# est_semple: sample estimate vector
# est_bt: bootstrap estimates with row = boot replications, col = coefficients
est_bt_long <- bt_p_val(est_sample, est_bt)$df_long
if (centering) {
est_bt_long <- est_bt_long %>%
mutate(
key = paste(key, " - center"),
value = value - center_var,
fill_var = extremes
)
legend_lab <- "Extremes: | value - center | > | center |"
x_lab <- "value - center"
} else {
est_bt_long <- est_bt_long %>%
mutate(
sign = kronecker(ones(nrow(est_bt),1), matrix(ifelse(est_sample>0,1,-1), nrow=1)) %>% c(),
fill_var = value * sign <= 0
)
legend_lab <- "Crossing zero?"
x_lab <- "value"
}
est_bt_long %>%
ggplot(aes(x = value, fill = fill_var)) +
geom_histogram(color = "white", bins = 40) + theme(legend.position="bottom") +
facet_wrap(~key, scales = "free") + labs(fill = legend_lab, x = x_lab)
}
run_lmer_boot <- function(lmer_rlt, num_do = 5000) {
# Randome effects (RE) model bootstrapping (random intercepts, not random slopes)
rlt <- list()
rlt$model <- lmer_rlt@pp$X # model data for the part of fixed coefficients
N <- length(residuals(lmer_rlt))
rlt$fitted_no_RE <- rlt$model %*% matrix(fixef(lmer_rlt), ncol=1)
RE_vals <- lmer_rlt@flist[[1]] %>% unique() # RE variable values
N_RE <- length(RE_vals) # number of RE variable values
rlt$RE_idx <- rep(NA, N)
for (i in 1:N_RE) rlt$RE_idx[which(lmer_rlt@flist[[1]] == RE_vals[i])] <- i # index of RE variable values
sd_res <- sigma(lmer_rlt) # standard deviation of the residuals
sd_RE <- lmer_rlt@theta * sd_res # standard deviation of RE
dep_var <- getDependentVar(lmer_rlt, lmer=TRUE)
do(num_do) *
({
data_bt <- data.frame(lmer_rlt@frame)
# replace the dependent variable with its bootstrap counterpart
data_bt[[dep_var]] <- rlt$fitted_no_RE + # the predicted component by fixed coefficients
+ rnorm(N_RE, mean = 0, sd = sd_RE)[rlt$RE_idx] + # random draws of tge RE distribution
+ rnorm(N, mean = 0, sd = sd_res) # random draws of the residual distribution
# run the RE model with the same formula but with a new, bootstrap dataset
lmer_bt <- lmer(as.formula(getFormula(lmer_rlt, lmer=TRUE)), data = data_bt)
sd_res_bt <- sigma(lmer_bt)
sd_RE_bt <- lmer_bt@theta * sd_res_bt
c(fixef(lmer_bt), sigma_RE = sd_RE_bt, sigma_res = sd_res_bt) # get fixed coefficients
})
}
Inside do(num_do)({...})
, observe the additional random component for the random effects, which is a vector of random intercepts of length = number of random effect variable. We assign these random draws to individual units.
ChickWeight2 <- ChickWeight
# model 1
model_1 <- lm( weight ~ Diet*Time - Diet, data = ChickWeight2)
rlt_model_1_bt <- run_ols_boot(model_1, num_do = 2000)
obs_model_1 <- tidy(model_1) # summary of the original OLS estimates
bt_sd_1 <- apply(rlt_model_1_bt, 2, sd) # calculate bootstrap standard errors
bt_mode1_1 <- cbind(coeff = obs_model_1$estimate,
sd = bt_sd_1,
tstat = obs_model_1$estimate/bt_sd_1)
# OLS estimate with statistical inferences by statistic theory
obs_model_1
# OLS estimate with statistical inferences by bootstrapping
bt_mode1_1
coeff sd tstat
Intercept 27.858834 2.5860717 10.772645
Time 7.049161 0.2511614 28.066260
Diet2.Time 1.611209 0.3008103 5.356229
Diet3.Time 3.738317 0.3016122 12.394446
Diet4.Time 2.861438 0.3073260 9.310757
ci_ver1(rlt_model_1_bt)
2.5% 97.5%
Intercept 22.653963 32.893020
Time 6.548692 7.537667
Diet2.Time 1.054466 2.212098
Diet3.Time 3.145243 4.329096
Diet4.Time 2.262429 3.486386
ci_ver2(obs_model_1$estimate, bt_sd_1, model_1$df.residual)
2.5% 97.5%
Intercept 22.790133 32.927534
Time 6.556884 7.541437
Diet2.Time 1.021621 2.200797
Diet3.Time 3.147157 4.329477
Diet4.Time 2.259079 3.463796
bt_p_val(obs_model_1$estimate, rlt_model_1_bt)$p_val
# histogram
coeff_bt_histogram(obs_model_1$estimate, rlt_model_1_bt, centering=FALSE)
# model 2
model_2 <- lm( weight ~ Diet*Time - Diet + Chick, data = ChickWeight2)
rlt_model_2_bt <- run_ols_boot(model_2, num_do = 2000)
obs_model_2 <- tidy(model_2) # summary of the original OLS estimates
bt_sd_2 <- apply(rlt_model_2_bt, 2, sd) # calculate bootstrap standard errors
bt_mode1_2 <- cbind(coeff = obs_model_2$estimate,
sd = bt_sd_2,
tstat = obs_model_2$estimate/bt_sd_2)
# OLS estimate with statistical inferences by statistic theory
obs_model_2
# OLS estimate with statistical inferences by bootstrapping
bt_mode1_2
coeff sd tstat
Intercept 28.2445096 1.9855711 14.22487929
Time 6.6906239 0.2596489 25.76796272
Chick.L 25.6466519 14.0353961 1.82728380
Chick.Q -13.5654294 12.5576791 -1.08024973
Chick.C 51.9445570 11.7812219 4.40909758
Chick.4 9.2225392 11.4675797 0.80422718
Chick.5 -33.3384388 10.3455502 -3.22249065
Chick.6 41.0951586 10.9740495 3.74475789
Chick.7 -0.2107096 8.9305734 -0.02359418
Chick.8 5.8962394 10.2616228 0.57459132
Chick.9 13.4344671 8.9302403 1.50437912
Chick.10 -16.1181057 8.6544264 -1.86241178
Chick.11 -45.0381766 8.3007688 -5.42578378
Chick.12 9.6177311 8.3138454 1.15683305
Chick.13 55.0050219 8.0707850 6.81532492
Chick.14 8.1414846 7.7037995 1.05681419
Chick.15 -45.9058209 8.0669753 -5.69058652
Chick.16 -15.9530106 7.8268258 -2.03824782
Chick.17 28.2004666 7.6961323 3.66423880
Chick.18 17.2879476 7.7190033 2.23966062
Chick.19 -24.3940107 7.3623970 -3.31332454
Chick.20 13.0230252 7.5793338 1.71822822
Chick.21 12.9951812 7.5285354 1.72612340
Chick.22 -10.4667014 7.6737562 -1.36396063
Chick.23 3.4996009 7.4757583 0.46812654
Chick.24 3.9263398 7.4821592 0.52476026
Chick.25 -35.2223376 7.1613119 -4.91841969
Chick.26 20.8034554 7.4356204 2.79781032
Chick.27 41.5753691 7.2655607 5.72225200
Chick.28 8.9788801 7.3490760 1.22176993
Chick.29 -22.6530019 7.2127852 -3.14067329
Chick.30 -7.0641070 7.3393622 -0.96249603
Chick.31 11.6522480 7.3360895 1.58834593
Chick.32 16.4033102 7.1629405 2.29002463
Chick.33 -3.1073624 7.4438784 -0.41743863
Chick.34 -6.9049827 7.4434326 -0.92766106
Chick.35 0.5744950 7.0153017 0.08189171
Chick.36 -16.0760278 7.2991837 -2.20244189
Chick.37 -4.2041629 7.4068202 -0.56760699
Chick.38 -12.9040587 7.5312322 -1.71340603
Chick.39 -11.6779774 7.5096363 -1.55506565
Chick.40 42.2805547 7.4526749 5.67320528
Chick.41 -13.5916736 7.3790711 -1.84192202
Chick.42 -32.8738896 7.4252763 -4.42729513
Chick.43 -38.4670120 7.4298335 -5.17737203
Chick.44 12.3700410 7.5217965 1.64455939
Chick.45 -20.2123257 7.5361000 -2.68206707
Chick.46 23.0112367 7.3743422 3.12044600
Chick.47 -11.3521235 7.4495541 -1.52386617
Chick.48 -16.5301765 7.1550528 -2.31028017
Chick.49 -31.1804687 7.5594969 -4.12467508
Diet2.Time 1.9185124 0.4286596 4.47560806
Diet3.Time 4.7322471 0.4445013 10.64619507
Diet4.Time 2.9653114 0.4347580 6.82060215
ci_ver1(rlt_model_2_bt)
2.5% 97.5%
Intercept 24.313538 32.0576181
Time 6.205119 7.2186092
Chick.L -2.307375 52.8166013
Chick.Q -37.719358 10.6560247
Chick.C 28.783165 74.8672253
Chick.4 -12.685474 32.7306138
Chick.5 -54.191316 -13.4857366
Chick.6 19.629291 62.6320371
Chick.7 -17.226958 16.9220251
Chick.8 -13.780153 25.4488033
Chick.9 -3.856480 30.7518958
Chick.10 -33.818116 0.7094726
Chick.11 -61.100776 -27.9948297
Chick.12 -6.474448 25.9606673
Chick.13 38.885773 70.7144012
Chick.14 -6.800293 23.1543283
Chick.15 -61.961100 -29.7182984
Chick.16 -30.616152 -0.7889231
Chick.17 13.570366 42.9375852
Chick.18 1.860687 32.4160003
Chick.19 -38.516768 -10.6574920
Chick.20 -1.121593 27.7412117
Chick.21 -1.428137 28.0364710
Chick.22 -25.545140 4.8310177
Chick.23 -10.858153 18.6116745
Chick.24 -10.518633 18.2462698
Chick.25 -49.414351 -20.7086570
Chick.26 6.325055 35.1847648
Chick.27 27.220889 55.4067394
Chick.28 -6.109585 22.8199932
Chick.29 -36.090597 -8.4163810
Chick.30 -21.148078 7.7349804
Chick.31 -2.780116 26.4266853
Chick.32 2.584052 30.2253285
Chick.33 -17.725920 12.0296240
Chick.34 -21.276893 7.9541216
Chick.35 -13.247002 13.7500169
Chick.36 -31.356043 -1.9887921
Chick.37 -19.590968 9.3432318
Chick.38 -27.622970 1.8319825
Chick.39 -26.813294 2.8381737
Chick.40 27.842575 56.6360329
Chick.41 -28.122396 0.8054383
Chick.42 -48.042103 -18.7713323
Chick.43 -53.307274 -24.4147625
Chick.44 -1.664027 27.9581874
Chick.45 -35.705630 -5.6602085
Chick.46 9.300954 37.8833245
Chick.47 -26.694689 2.6615355
Chick.48 -30.656458 -2.8262318
Chick.49 -46.685957 -17.2721901
Diet2.Time 1.072546 2.7632157
Diet3.Time 3.878032 5.5918894
Diet4.Time 2.112013 3.7754640
ci_ver2(obs_model_2$estimate, bt_sd_2, model_2$df.residual)
2.5% 97.5%
Intercept 24.352790 32.1362290
Time 6.181712 7.1995358
Chick.L -1.862724 53.1560283
Chick.Q -38.178480 11.0476216
Chick.C 28.853362 75.0357518
Chick.4 -13.253917 31.6989954
Chick.5 -53.615717 -13.0611604
Chick.6 19.586021 62.6042957
Chick.7 -17.714633 17.2932142
Chick.8 -14.216541 26.0090201
Chick.9 -4.068804 30.9377381
Chick.10 -33.080782 0.8445701
Chick.11 -61.307683 -28.7686698
Chick.12 -6.677406 25.9128681
Chick.13 39.186283 70.8237604
Chick.14 -6.957962 23.2409316
Chick.15 -61.717092 -30.0945493
Chick.16 -31.293589 -0.6124320
Chick.17 13.116047 43.2848859
Chick.18 2.158701 32.4171940
Chick.19 -38.824309 -9.9637126
Chick.20 -1.832469 27.8785194
Chick.21 -1.760748 27.7511107
Chick.22 -25.507264 4.5738608
Chick.23 -11.152885 18.1520871
Chick.24 -10.738692 18.5913718
Chick.25 -49.258509 -21.1861662
Chick.26 6.229639 35.3772713
Chick.27 27.334870 55.8158680
Chick.28 -5.425309 23.3830690
Chick.29 -36.790061 -8.5159428
Chick.30 -21.449257 7.3210430
Chick.31 -2.726488 26.0309834
Chick.32 2.363947 30.4426735
Chick.33 -17.697364 11.4826392
Chick.34 -21.494111 7.6841453
Chick.35 -13.175496 14.3244863
Chick.36 -30.382428 -1.7696279
Chick.37 -18.721531 10.3132047
Chick.38 -27.665274 1.8571565
Chick.39 -26.396865 3.0409097
Chick.40 27.673312 56.8877975
Chick.41 -28.054653 0.8713058
Chick.42 -47.427431 -18.3203481
Chick.43 -53.029486 -23.9045384
Chick.44 -2.372680 27.1127621
Chick.45 -34.983082 -5.4415696
Chick.46 8.557526 37.4649475
Chick.47 -25.953250 3.2490025
Chick.48 -30.554080 -2.5062731
Chick.49 -45.997083 -16.3638547
Diet2.Time 1.078340 2.7586852
Diet3.Time 3.861025 5.6034695
Diet4.Time 2.113186 3.8174371
bt_p_val(obs_model_2$estimate, rlt_model_2_bt)$p_val
obs_model_2_time <- obs_model_2 %>% filter(grepl("Intercept", term) | grepl("Time", term))
obs_model_2_chick <- obs_model_2 %>% filter(grepl("Chick", term))
rlt_model_2_bt_time <- rlt_model_2_bt %>% dplyr::select(contains("Intercept"), contains("Time"))
rlt_model_2_bt_chick <- rlt_model_2_bt %>% dplyr::select(contains("Chick"))
coeff_bt_histogram(obs_model_2_time$estimate, rlt_model_2_bt_time, centering=FALSE)
coeff_bt_histogram(obs_model_2_chick$estimate[1:6], rlt_model_2_bt_chick[,1:6], centering=FALSE)
# model 3
model_3 <- lmer( weight ~ Diet*Time - Diet + (1 | Chick), data = ChickWeight2)
rlt_model_3_bt <- run_lmer_boot(model_3, num_do = 2000)
Warning in optwrap(optimizer, devfun, getStart(start, rho$lower, rho$pp), :
convergence code 3 from bobyqa: bobyqa -- a trust region step failed to
reduce q
obs_model_3 <- tidy(model_3) # summary of the original lmer estimates
bt_sd_3 <- apply(rlt_model_3_bt, 2, sd) # calculate bootstrap standard errors
bt_mode1_3 <- cbind(coeff = obs_model_3$estimate,
sd = bt_sd_3,
tstat = obs_model_3$estimate/bt_sd_3)
# random model estimate with statistical inferences by statistic theory (default)
obs_model_3
# random model estimate with statistical inferences by bootstrapping
bt_mode1_3
coeff sd tstat
Intercept 28.185242 3.7185615 7.579609
Time 6.772853 0.2433695 27.829510
Diet2.Time 1.844157 0.3960850 4.655962
Diet3.Time 4.475562 0.3862551 11.587062
Diet4.Time 2.941763 0.3883760 7.574523
sigma_RE 23.085561 2.6262252 8.790396
sigma_res 25.358103 0.7993851 31.722010
ci_ver1(rlt_model_3_bt)
2.5% 97.5%
Intercept 20.797343 35.443250
Time 6.292339 7.269661
Diet2.Time 1.054010 2.634484
Diet3.Time 3.720337 5.254353
Diet4.Time 2.175138 3.679915
sigma_RE 17.877542 28.100382
sigma_res 23.850278 26.982850
ci_ver2(obs_model_3$estimate, bt_sd_3, (nrow(ChickWeight2) - nrow(obs_model_3)))
2.5% 97.5%
Intercept 20.896862 35.473622
Time 6.295849 7.249857
Diet2.Time 1.067830 2.620483
Diet3.Time 3.718502 5.232622
Diet4.Time 2.180546 3.702980
sigma_RE 17.938159 28.232962
sigma_res 23.791308 26.924898
bt_p_val(obs_model_3$estimate, rlt_model_3_bt)$p_val
coeff_bt_histogram(obs_model_3$estimate, rlt_model_3_bt, centering=FALSE)