简体   繁体   中英

How to loop multiple exposures and outcomes as well as different models with glm in R?

The code below currently runs unadjusted glm for each exposure on each outcome (3 exposures per outcome) and exports the results into a list. For each exposure, I need 3 models: model 1 : unadjusted (which we currently have), model 2 : adjusted for cov1, model 3 : adjusted for cov1, cov2 and cov3

How would I implement the different models into this code?

amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
                       exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100),
                       cov1 = rnorm(100), cov2 = rnorm(100), cov3 = rnorm(100))

exp <- c("exp1", "exp2", "exp3")
y <- c("y", "y2","y3","y4")
cov <- c("cov1", "cov2", "cov3")

obs_results <- replicate(length(y), data.frame())  

for(j in seq_along(y)){
  for (i in seq_along(exp)){
    mod <- as.formula(paste(y[j], "~", exp[i]))
    glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
    
    obs_results[[j]][i,1] <- names(coef(glmmodel))[2]
    obs_results[[j]][i,2] <- exp(glmmodel$coefficients[2])
    obs_results[[j]][i,3] <- summary(glmmodel)$coefficients[2,2]
    obs_results[[j]][i,4] <- summary(glmmodel)$coefficients[2,4]
    obs_results[[j]][i,5] <- exp(confint.default(glmmodel)[2,1])
    obs_results[[j]][i,6] <- exp(confint.default(glmmodel)[2,2])
  }
  colnames(obs_results[[j]]) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH")
}
names(obs_results) <- y

obs_df <- do.call("rbind", lapply(obs_results, as.data.frame)) 

EDIT - I now have a solution:

Further question, could this code below be adapted to include different models for the different exposures? So for exp1, adjust for all 3 cons: cov1, cov2, cov3, but for exp2 adjust for cov1, cov2 only? and exp3 cov2 and cov1 only?

amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), 
                       y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
                       exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100),
                       cov1 = rnorm(100), cov2 = rnorm(100), cov3 = rnorm(100))

exp <- c("exp1", "exp2", "exp3")
y <- c("y", "y2","y3","y4")
model <- c("", "+ cov1", "+ cov1 + cov2 + cov3")

obs_df <- lapply(y, function(j){
    lapply(exp, function(i){
        lapply(model, function(h){
            mod = as.formula(paste(j, "~", i, h))
            glmmodel = glm(formula = mod, family = binomial, data = amino_df)
            
            obs_results = data.frame(
                outcome = j,
                exposure = names(coef(glmmodel))[2], 
                covariate = h,
                OR = exp(glmmodel$coefficients[2]), 
                SE = summary(glmmodel)$coefficients[2,2], 
                P_value = summary(glmmodel)$coefficients[2,4], 
                `95_CI_LOW` = exp(confint.default(glmmodel)[2,1]), 
                `95_CI_HIGH` = exp(confint.default(glmmodel)[2,2])
            )
            return(obs_results)
        }) %>% bind_rows
    }) %>% bind_rows
}) %>% bind_rows %>% `colnames<-`(gsub("X95","95",colnames(.))) %>% `rownames<-`(NULL)

head(obs_df)

Just like how you specified exp and y at the beginning, you can specify the different model types.

Here is an approach using lapply() instead of for-loops:

amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), 
                       y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
                       exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100),
                       cov1 = rnorm(100), cov2 = rnorm(100), cov3 = rnorm(100))

exp <- c("exp1", "exp2", "exp3")
y <- c("y", "y2","y3","y4")
model <- c("", "+ cov1", "+ cov1 + cov2 + cov3")

obs_df <- lapply(y, function(j){
    lapply(exp, function(i){
        lapply(model, function(h){
            mod = as.formula(paste(j, "~", i, h))
            glmmodel = glm(formula = mod, family = binomial, data = amino_df)
            
            obs_results = data.frame(
                outcome = j,
                exposure = names(coef(glmmodel))[2], 
                covariate = h,
                OR = exp(glmmodel$coefficients[2]), 
                SE = summary(glmmodel)$coefficients[2,2], 
                P_value = summary(glmmodel)$coefficients[2,4], 
                `95_CI_LOW` = exp(confint.default(glmmodel)[2,1]), 
                `95_CI_HIGH` = exp(confint.default(glmmodel)[2,2])
            )
            return(obs_results)
        }) %>% bind_rows
    }) %>% bind_rows
}) %>% bind_rows %>% `colnames<-`(gsub("X95","95",colnames(.))) %>% `rownames<-`(NULL)

head(obs_df)
#  outcome exposure            covariate        OR        SE   P_value 95_CI_LOW 95_CI_HIGH
#1       y     exp1                      0.9425290 0.2125285 0.7806305 0.6214270   1.429550
#2       y     exp1               + cov1 0.9356460 0.2138513 0.7557639 0.6152917   1.422794
#3       y     exp1 + cov1 + cov2 + cov3 0.9638427 0.2174432 0.8655098 0.6293876   1.476027
#4       y     exp2                      1.3297429 0.1865916 0.1266809 0.9224452   1.916879
#5       y     exp2               + cov1 1.3300740 0.1866225 0.1264124 0.9226190   1.917473
#6       y     exp2 + cov1 + cov2 + cov3 1.3558196 0.1903111 0.1097054 0.9337031   1.968770

I included gsub("X95","95",colnames(.)) at the end because when creating new data frames, column names that begin with a number (ie, "95_CI_LOW", "95_CI_HIGH") get an "X" inserted at the beginning by default; this code removes it.

Supplement

If different exposures are uniquely adjusted with different covariates in your models, the following can be done instead. The easiest solution is to run all possible exposure+covariate combinations through the code above, and then filter obs_df (with filter() ) to select only the analyses that you want. However, it means it will take unnecessarily longer to run if you are working with large datasets.

A more direct approach is to enter specifically which exposure+covariate combinations to include in the model and remove the lapply(exp) function (and edit the core function accordingly):

model <- c("exp1 + cov1 + cov2 + cov3", "exp2 + cov1 + cov2", "exp3 + cov1")

obs_df <- lapply(y, function(j){
    lapply(model, function(h){
        mod = as.formula(paste(j, "~", h))
        glmmodel = glm(formula = mod, family = binomial, data = amino_df)
            
        obs_results = data.frame(
            outcome = j,
            exposure = names(coef(glmmodel))[2], 
            covariate = gsub(names(coef(glmmodel))[2],"",h), # gsub to remove exposure from covariate(s)
            OR = exp(glmmodel$coefficients[2]), 
            SE = summary(glmmodel)$coefficients[2,2], 
            P_value = summary(glmmodel)$coefficients[2,4], 
            `95_CI_LOW` = exp(confint.default(glmmodel)[2,1]), 
            `95_CI_HIGH` = exp(confint.default(glmmodel)[2,2])
        )
        return(obs_results)
    }) %>% bind_rows
}) %>% bind_rows %>% `colnames<-`(gsub("X95","95",colnames(.))) %>% `rownames<-`(NULL)

I would recommend collecting the different components that you wish to vary between models into a data frame, and building that accordingly:

library(tidyverse)

y <- c("y", "y2","y3","y4")
exp <- c("exp1", "exp2", "exp3")
cov <- list(character(), "cov1", c("cov1", "cov2", "cov3"))

# each covariate for each exposure
models1 <- crossing(outcome = y, exposure = exp, covariates = cov)
models1
#> # A tibble: 36 x 3
#>    outcome exposure covariates
#>    <chr>   <chr>    <list>    
#>  1 y       exp1     <chr [0]> 
#>  2 y       exp1     <chr [1]> 
#>  3 y       exp1     <chr [3]> 
#>  4 y       exp2     <chr [0]> 
#>  5 y       exp2     <chr [1]> 
#>  6 y       exp2     <chr [3]> 
#>  7 y       exp3     <chr [0]> 
#>  8 y       exp3     <chr [1]> 
#>  9 y       exp3     <chr [3]> 
#> 10 y2      exp1     <chr [0]> 
#> # ... with 26 more rows

# covariates specific per exposure
models2 <- crossing(outcome = y, nesting(exposure = exp, covariates = cov))
models2
#> # A tibble: 12 x 3
#>    outcome exposure covariates
#>    <chr>   <chr>    <list>    
#>  1 y       exp1     <chr [0]> 
#>  2 y       exp2     <chr [1]> 
#>  3 y       exp3     <chr [3]> 
#>  4 y2      exp1     <chr [0]> 
#>  5 y2      exp2     <chr [1]> 
#>  6 y2      exp3     <chr [3]> 
#>  7 y3      exp1     <chr [0]> 
#>  8 y3      exp2     <chr [1]> 
#>  9 y3      exp3     <chr [3]> 
#> 10 y4      exp1     <chr [0]> 
#> 11 y4      exp2     <chr [1]> 
#> 12 y4      exp3     <chr [3]>

Then put your model fitting and summary into a function that operates on those components:

fit_model <- function(outcome, exposure, covariates) {
  formula = reformulate(c(exposure, covariates), outcome)
  glmmodel = glm(formula = formula, family = binomial, data = amino_df)
  
  # using data.frame would not handle the covariate list column properly
  obs_results = tibble(
    outcome = outcome,
    exposure = names(coef(glmmodel))[2], 
    covariate = list(covariates),
    OR = exp(glmmodel$coefficients[2]), 
    SE = summary(glmmodel)$coefficients[2,2], 
    P_value = summary(glmmodel)$coefficients[2,4], 
    `95_CI_LOW` = exp(confint.default(glmmodel)[2,1]), 
    `95_CI_HIGH` = exp(confint.default(glmmodel)[2,2])
  )
  
  return(obs_results)
}

With those in place, you can use pmap() to fit the model for each row in your specification data frame:

amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), 
                       y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
                       exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100),
                       cov1 = rnorm(100), cov2 = rnorm(100), cov3 = rnorm(100))

# each covariate for each exposure
pmap_df(models1, fit_model)
#> # A tibble: 36 x 8
#>    outcome exposure covariate    OR    SE P_value `95_CI_LOW` `95_CI_HIGH`
#>    <chr>   <chr>    <list>    <dbl> <dbl>   <dbl>       <dbl>        <dbl>
#>  1 y       exp1     <chr [0]> 1.01  0.191  0.944        0.697         1.47
#>  2 y       exp1     <chr [1]> 1.01  0.191  0.947        0.697         1.47
#>  3 y       exp1     <chr [3]> 0.990 0.194  0.960        0.677         1.45
#>  4 y       exp2     <chr [0]> 1.26  0.215  0.281        0.827         1.92
#>  5 y       exp2     <chr [1]> 1.29  0.220  0.244        0.840         1.99
#>  6 y       exp2     <chr [3]> 1.31  0.222  0.229        0.845         2.02
#>  7 y       exp3     <chr [0]> 1.43  0.216  0.0969       0.937         2.19
#>  8 y       exp3     <chr [1]> 1.43  0.217  0.101        0.933         2.18
#>  9 y       exp3     <chr [3]> 1.36  0.221  0.166        0.881         2.09
#> 10 y2      exp1     <chr [0]> 1.55  0.230  0.0580       0.985         2.43
#> # ... with 26 more rows

# covariates specific per exposure
pmap_df(models2, fit_model)
#> # A tibble: 12 x 8
#>    outcome exposure covariate    OR    SE P_value `95_CI_LOW` `95_CI_HIGH`
#>    <chr>   <chr>    <list>    <dbl> <dbl>   <dbl>       <dbl>        <dbl>
#>  1 y       exp1     <chr [0]> 1.01  0.191  0.944        0.697         1.47
#>  2 y       exp2     <chr [1]> 1.29  0.220  0.244        0.840         1.99
#>  3 y       exp3     <chr [3]> 1.36  0.221  0.166        0.881         2.09
#>  4 y2      exp1     <chr [0]> 1.55  0.230  0.0580       0.985         2.43
#>  5 y2      exp2     <chr [1]> 0.717 0.249  0.182        0.441         1.17
#>  6 y2      exp3     <chr [3]> 0.999 0.241  0.996        0.622         1.60
#>  7 y3      exp1     <chr [0]> 1.21  0.243  0.442        0.749         1.94
#>  8 y3      exp2     <chr [1]> 0.822 0.267  0.463        0.487         1.39
#>  9 y3      exp3     <chr [3]> 1.56  0.269  0.0980       0.921         2.64
#> 10 y4      exp1     <chr [0]> 1.12  0.224  0.601        0.725         1.74
#> 11 y4      exp2     <chr [1]> 0.721 0.255  0.200        0.437         1.19
#> 12 y4      exp3     <chr [3]> 0.767 0.252  0.291        0.468         1.26

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM