简体   繁体   中英

boot function with purrr::map in R

I'm studying about bootstrap two sample t test with boot package. In gene expression matrix, I want to compare genes between conditions and my aim is to find expressed genes. I have a matrix 5*12(5 control, 7 treatment and 5 genes) and firstly I converted this data matrix to tibble format as two long vector in order to understand the tibble structure and make it easier for me.:

library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(magrittr)
library(boot)

##Gene Expression matrix
Exp.mat<- read.table( header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

##Vectorized format from matrix
Vec_Ex.Mat <- as_tibble(t(Exp.mat))
Vec_Ex.Mat$Cond <- as.factor(c(rep("1", 5), rep("2", 7)))
Vec_Ex.Mat <- Vec_Ex.Mat %>% gather(Var, Val, -Cond)
Vec_Ex.Mat<- Vec_Ex.Mat[, c(2, 3, 1)]
colnames(Vec_Ex.Mat) <- c("Gene", "Exp", "Cond")
head(Vec_Ex.Mat)
#> # A tibble: 6 x 3
#>   Gene    Exp Cond 
#>   <chr> <dbl> <fct>
#> 1 Gene1 1.68  1    
#> 2 Gene1 0.322 1    
#> 3 Gene1 1.72  1    
#> 4 Gene1 1.91  1    
#> 5 Gene1 1.27  1    
#> 6 Gene1 1.68  2

##Created nested tibble
Nested_Ex.Mat <- Vec_Ex.Mat %>%
  dplyr::group_by(Gene) %>%
  tidyr::nest()

head(Nested_Ex.Mat)
#> # A tibble: 5 x 2
#> # Groups:   Gene [5]
#>   Gene  data                 
#>   <chr> <list>               
#> 1 Gene1 <tibble[,2] [12 x 2]>
#> 2 Gene2 <tibble[,2] [12 x 2]>
#> 3 Gene3 <tibble[,2] [12 x 2]>
#> 4 Gene4 <tibble[,2] [12 x 2]>
#> 5 Gene5 <tibble[,2] [12 x 2]>

## Function for bootstrap
bootFun <- function(df, f) {
  n <- nrow(df)
  idx <- which(df[, 2] == 2)
  idy <- which(df[, 2] == 1)
  nx <- length(idx)
  ny <- length(idy)
  new.df <- df
  new.df[idx, 1] <- df[idx, 1] - mean(df[idx, 1]) + mean(df[, 1])
  new.df[idy, 1] <- df[idy, 1] - mean(df[idy, 1]) + mean(df[, 1])
  df <- new.df
  MX <- sum(df[idx, 1] * f[idx])/sum(f[idx])
  SX <- sum(df[idx, 1]^2 * f[idx])/sum(f[idx]) - MX^2
  SX <- nx * SX/(nx - 1)
  MY <- sum(df[idy, 1] * f[idy])/sum(f[idy])
  SY <- sum(df[idy, 1]^2 * f[idy])/sum(f[idy]) - MY^2
  SY <- ny * SY/(ny - 1)
  SXY <- sqrt((SX/nx) + (SY/ny))
  (MX -MY)/SXY
}

##Bootstrap analysis with boot package using purrr::map
Nested_Ex.Mat %<>%
  dplyr::mutate(booted = purrr::map(.x=data, ~ boot::boot(data= .x, sim = "ordinary", statistic = bootFun,R = 5,stype = "f", strata=.x[, 2])))
#> Error: Problem with `mutate()` input `booted`.
#> x 'list' object cannot be coerced to type 'double'
#> i Input `booted` is `purrr::map(...)`.
#> i The error occurred in group 1: Gene = "Gene1".

Created on 2021-04-06 by the reprex package (v1.0.0)

What I don't understand is whether I use indexing correctly and I don't know how to introduce this data with boot package for tibble vectorized format. In example here , a single column is analyzed, it is working. But I want to use strata option by boot package for per gene with two columns in tibble data. Is it possible to get rid of this code load or can we make the function more efficient with shorter codes and correct indexing? Can you share your knowledge and suggestions? Thank you.

I'm not sure why you want to bootstrap t-tests. It seems easier to just run the t.test function. Here is my code for doing that:

Load packages

library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)

Get data

Exp.mat<- read.table(header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

Check data

head(Exp.mat)
#>             C1        C2         C3        C4        C5       T1        T2
#> Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389
#> Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023
#> Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405
#> Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747
#> Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895
#>              T3        T4        T5       T6        T7
#> Gene1 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
#> Gene2 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
#> Gene3 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
#> Gene4 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
#> Gene5 0.9986083 0.0000000 0.7091586 1.288114 2.0487426

Format data

Exp.mat.new <- Exp.mat %>% 
        # Convert row names to a column
        rownames_to_column('gene') %>% 
        # "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
        pivot_longer(cols = -gene,
                     names_to = 'cond',
                     values_to = 'exp') %>% 
        # Fix condition
        mutate(cond = case_when(
                str_detect(cond, pattern = 'C') ~ 'C',
                TRUE ~ 'T')) %>% 
        # Nest 
        group_by(gene) %>% 
        nest()

Perform analysis

Exp.mat.new <- Exp.mat.new %>% 
        # Perform t-test
        mutate(t_test = map(.x = data,
                            ~ t.test(.x$exp ~ .x$cond))) %>% 
        # Extract vital statistics
        mutate(C_minus_T_95CI = map(.x = t_test, # 95% CI of the mean difference between groups C and T
                                    ~ paste0(round(.x$conf.int[[1]], 3), ' to ', 
                                             round(.x$conf.int[[2]], 3))),
               p_value = map(.x = t_test, # p-value
                             ~ round(.x$p.value, 5))) %>% 
        # Unnest the data
        unnest(cols = c(C_minus_T_95CI, p_value)) %>% 
        # Arrange by p-value
        arrange(p_value)

Print dataframe

Exp.mat.new
#> # A tibble: 5 x 5
#> # Groups:   gene [5]
#>   gene  data              t_test  C_minus_T_95CI   p_value
#>   <chr> <list>            <list>  <chr>              <dbl>
#> 1 Gene2 <tibble [12 × 2]> <htest> -1.028 to -0.071  0.0288
#> 2 Gene3 <tibble [12 × 2]> <htest> -1.237 to 0.475   0.323 
#> 3 Gene4 <tibble [12 × 2]> <htest> -0.482 to 1.251   0.345 
#> 4 Gene5 <tibble [12 × 2]> <htest> -0.95 to 1.958    0.423 
#> 5 Gene1 <tibble [12 × 2]> <htest> -0.914 to 0.66    0.712

If you want to, print full results of each individual t-test

walk(.x = Exp.mat.new$t_test, 
     ~ print(.x))
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -2.6068, df = 8.8453, p-value = 0.02881
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.02805952 -0.07141179
#> sample estimates:
#> mean in group C mean in group T 
#>       0.3678434       0.9175791 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -1.0691, df = 6.4703, p-value = 0.3233
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.2367921  0.4754686
#> sample estimates:
#> mean in group C mean in group T 
#>        2.195439        2.576101 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = 0.99278, df = 9.7754, p-value = 0.3448
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.4816528  1.2514667
#> sample estimates:
#> mean in group C mean in group T 
#>        1.838916        1.454009 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = 0.86423, df = 5.5685, p-value = 0.4231
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.9503223  1.9584180
#> sample estimates:
#> mean in group C mean in group T 
#>        1.506200        1.002152 
#> 
#> 
#>  Welch Two Sample t-test
#> 
#> data:  .x$exp by .x$cond
#> t = -0.38483, df = 6.6963, p-value = 0.7123
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -0.9143974  0.6604509
#> sample estimates:
#> mean in group C mean in group T 
#>        1.381890        1.508863

Created on 2021-04-06 by the reprex package (v1.0.0)

First I was getting the above error. After that I changed strata in boot function. ( "strata =.x$cond" instead of strata=.x[, 2]). Then it was returning NA with mean function in the firs boot.fun function. I changed mean to colMeans in the boot.fun. It is working now. I will run it with matrxix, too for testing accuracy.

library(tidyverse)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
library(boot)

Exp.mat<- read.table(header = TRUE, text =  " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )

Exp.mat.new <- Exp.mat %>% 
  # Convert row names to a column
  rownames_to_column('gene') %>% 
  # "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
  pivot_longer(cols = -gene,
               names_to = 'cond',
               values_to = 'exp') %>% 
  # Fix condition
  mutate(cond = case_when(
    str_detect(cond, pattern = 'C') ~ '1',
    TRUE ~ '2')) %>% 
  # Nest 
  group_by(gene) %>% 
  nest()

bootFun.tbl <- function(df,f) {
  n <- nrow(df)
  idx <- which(df[, 1] == 2) #select cond = 2 (treatment)
  idy <- which(df[, 1] == 1) #select cond = 1 (control)
  nx <- length(idx)
  ny <- length(idy)
  new.df <- df
  new.df[idx, 2] <- df[idx, 2] - colMeans(df[idx, 2]) + colMeans(df[, 2]) 
  new.df[idy, 2] <- df[idy, 2] - colMeans(df[idy, 2]) + colMeans(df[, 2])
  df <- new.df
  MX <- sum(df[idx, 2] * f[idx])/sum(f[idx])
  SX <- sum(df[idx, 2]^2 * f[idx])/sum(f[idx]) - MX^2
  SX <- nx * SX/(nx - 1)
  MY <- sum(df[idy, 2] * f[idy])/sum(f[idy])
  SY <- sum(df[idy, 2]^2 * f[idy])/sum(f[idy]) - MY^2
  SY <- ny * SY/(ny - 1)
  SXY <- sqrt((SX/nx) + (SY/ny))
  (MX -MY)/SXY
}

Exp.mat.new <- Exp.mat.new %>% 
  # Perform  bootstrap test
  mutate(boot.fun = map(.x = data,
                        ~boot::boot(data=.x, sim = "ordinary", 
                                    statistic = bootFun.tbl, R = 5,
                                    stype = "f", strata = .x$cond)))
head(Exp.mat.new)
#> # A tibble: 5 x 3
#> # Groups:   gene [5]
#>   gene  data                  boot.fun
#>   <chr> <list>                <list>  
#> 1 Gene1 <tibble[,2] [12 × 2]> <boot>  
#> 2 Gene2 <tibble[,2] [12 × 2]> <boot>  
#> 3 Gene3 <tibble[,2] [12 × 2]> <boot>  
#> 4 Gene4 <tibble[,2] [12 × 2]> <boot>  
#> 5 Gene5 <tibble[,2] [12 × 2]> <boot>
head(Exp.mat.new$boot.fun)
#> [[1]]
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary", 
#>     stype = "f", strata = .x$cond)
#> 
#> 
#> Bootstrap Statistics :
#>          original    bias    std. error
#> t1* -6.729777e-16 -1.237088    2.189771
#> 
#> [[2]]
#> 
#> STRATIFIED BOOTSTRAP
#> 
#> 
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary", 
#>     stype = "f", strata = .x$cond)
#> 
#> 
#> Bootstrap Statistics :
#>         original    bias    std. error
#> t1* 5.264653e-16 0.4360011   0.4969442
#> 

Created on 2021-04-08 by the reprex package (v2.0.0)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM