I'm studying about bootstrap two sample t test with boot package. In gene expression matrix, I want to compare genes between conditions and my aim is to find expressed genes. I have a matrix 5*12(5 control, 7 treatment and 5 genes) and firstly I converted this data matrix to tibble format as two long vector in order to understand the tibble structure and make it easier for me.:
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(magrittr)
library(boot)
##Gene Expression matrix
Exp.mat<- read.table( header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
##Vectorized format from matrix
Vec_Ex.Mat <- as_tibble(t(Exp.mat))
Vec_Ex.Mat$Cond <- as.factor(c(rep("1", 5), rep("2", 7)))
Vec_Ex.Mat <- Vec_Ex.Mat %>% gather(Var, Val, -Cond)
Vec_Ex.Mat<- Vec_Ex.Mat[, c(2, 3, 1)]
colnames(Vec_Ex.Mat) <- c("Gene", "Exp", "Cond")
head(Vec_Ex.Mat)
#> # A tibble: 6 x 3
#> Gene Exp Cond
#> <chr> <dbl> <fct>
#> 1 Gene1 1.68 1
#> 2 Gene1 0.322 1
#> 3 Gene1 1.72 1
#> 4 Gene1 1.91 1
#> 5 Gene1 1.27 1
#> 6 Gene1 1.68 2
##Created nested tibble
Nested_Ex.Mat <- Vec_Ex.Mat %>%
dplyr::group_by(Gene) %>%
tidyr::nest()
head(Nested_Ex.Mat)
#> # A tibble: 5 x 2
#> # Groups: Gene [5]
#> Gene data
#> <chr> <list>
#> 1 Gene1 <tibble[,2] [12 x 2]>
#> 2 Gene2 <tibble[,2] [12 x 2]>
#> 3 Gene3 <tibble[,2] [12 x 2]>
#> 4 Gene4 <tibble[,2] [12 x 2]>
#> 5 Gene5 <tibble[,2] [12 x 2]>
## Function for bootstrap
bootFun <- function(df, f) {
n <- nrow(df)
idx <- which(df[, 2] == 2)
idy <- which(df[, 2] == 1)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 1] <- df[idx, 1] - mean(df[idx, 1]) + mean(df[, 1])
new.df[idy, 1] <- df[idy, 1] - mean(df[idy, 1]) + mean(df[, 1])
df <- new.df
MX <- sum(df[idx, 1] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 1]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 1] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 1]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
##Bootstrap analysis with boot package using purrr::map
Nested_Ex.Mat %<>%
dplyr::mutate(booted = purrr::map(.x=data, ~ boot::boot(data= .x, sim = "ordinary", statistic = bootFun,R = 5,stype = "f", strata=.x[, 2])))
#> Error: Problem with `mutate()` input `booted`.
#> x 'list' object cannot be coerced to type 'double'
#> i Input `booted` is `purrr::map(...)`.
#> i The error occurred in group 1: Gene = "Gene1".
Created on 2021-04-06 by the reprex package (v1.0.0)
What I don't understand is whether I use indexing correctly and I don't know how to introduce this data with boot package for tibble vectorized format. In example here , a single column is analyzed, it is working. But I want to use strata option by boot package for per gene with two columns in tibble data. Is it possible to get rid of this code load or can we make the function more efficient with shorter codes and correct indexing? Can you share your knowledge and suggestions? Thank you.
I'm not sure why you want to bootstrap t-tests. It seems easier to just run the t.test
function. Here is my code for doing that:
Load packages
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
Get data
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
Check data
head(Exp.mat)
#> C1 C2 C3 C4 C5 T1 T2
#> Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389
#> Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023
#> Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405
#> Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747
#> Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895
#> T3 T4 T5 T6 T7
#> Gene1 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
#> Gene2 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
#> Gene3 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
#> Gene4 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
#> Gene5 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
Format data
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ 'C',
TRUE ~ 'T')) %>%
# Nest
group_by(gene) %>%
nest()
Perform analysis
Exp.mat.new <- Exp.mat.new %>%
# Perform t-test
mutate(t_test = map(.x = data,
~ t.test(.x$exp ~ .x$cond))) %>%
# Extract vital statistics
mutate(C_minus_T_95CI = map(.x = t_test, # 95% CI of the mean difference between groups C and T
~ paste0(round(.x$conf.int[[1]], 3), ' to ',
round(.x$conf.int[[2]], 3))),
p_value = map(.x = t_test, # p-value
~ round(.x$p.value, 5))) %>%
# Unnest the data
unnest(cols = c(C_minus_T_95CI, p_value)) %>%
# Arrange by p-value
arrange(p_value)
Print dataframe
Exp.mat.new
#> # A tibble: 5 x 5
#> # Groups: gene [5]
#> gene data t_test C_minus_T_95CI p_value
#> <chr> <list> <list> <chr> <dbl>
#> 1 Gene2 <tibble [12 × 2]> <htest> -1.028 to -0.071 0.0288
#> 2 Gene3 <tibble [12 × 2]> <htest> -1.237 to 0.475 0.323
#> 3 Gene4 <tibble [12 × 2]> <htest> -0.482 to 1.251 0.345
#> 4 Gene5 <tibble [12 × 2]> <htest> -0.95 to 1.958 0.423
#> 5 Gene1 <tibble [12 × 2]> <htest> -0.914 to 0.66 0.712
If you want to, print full results of each individual t-test
walk(.x = Exp.mat.new$t_test,
~ print(.x))
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -2.6068, df = 8.8453, p-value = 0.02881
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.02805952 -0.07141179
#> sample estimates:
#> mean in group C mean in group T
#> 0.3678434 0.9175791
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -1.0691, df = 6.4703, p-value = 0.3233
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.2367921 0.4754686
#> sample estimates:
#> mean in group C mean in group T
#> 2.195439 2.576101
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.99278, df = 9.7754, p-value = 0.3448
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.4816528 1.2514667
#> sample estimates:
#> mean in group C mean in group T
#> 1.838916 1.454009
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.86423, df = 5.5685, p-value = 0.4231
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9503223 1.9584180
#> sample estimates:
#> mean in group C mean in group T
#> 1.506200 1.002152
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -0.38483, df = 6.6963, p-value = 0.7123
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9143974 0.6604509
#> sample estimates:
#> mean in group C mean in group T
#> 1.381890 1.508863
Created on 2021-04-06 by the reprex package (v1.0.0)
First I was getting the above error. After that I changed strata in boot function. ( "strata =.x$cond" instead of strata=.x[, 2]). Then it was returning NA with mean function in the firs boot.fun function. I changed mean to colMeans in the boot.fun. It is working now. I will run it with matrxix, too for testing accuracy.
library(tidyverse)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
library(boot)
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ '1',
TRUE ~ '2')) %>%
# Nest
group_by(gene) %>%
nest()
bootFun.tbl <- function(df,f) {
n <- nrow(df)
idx <- which(df[, 1] == 2) #select cond = 2 (treatment)
idy <- which(df[, 1] == 1) #select cond = 1 (control)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 2] <- df[idx, 2] - colMeans(df[idx, 2]) + colMeans(df[, 2])
new.df[idy, 2] <- df[idy, 2] - colMeans(df[idy, 2]) + colMeans(df[, 2])
df <- new.df
MX <- sum(df[idx, 2] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 2]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 2] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 2]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
Exp.mat.new <- Exp.mat.new %>%
# Perform bootstrap test
mutate(boot.fun = map(.x = data,
~boot::boot(data=.x, sim = "ordinary",
statistic = bootFun.tbl, R = 5,
stype = "f", strata = .x$cond)))
head(Exp.mat.new)
#> # A tibble: 5 x 3
#> # Groups: gene [5]
#> gene data boot.fun
#> <chr> <list> <list>
#> 1 Gene1 <tibble[,2] [12 × 2]> <boot>
#> 2 Gene2 <tibble[,2] [12 × 2]> <boot>
#> 3 Gene3 <tibble[,2] [12 × 2]> <boot>
#> 4 Gene4 <tibble[,2] [12 × 2]> <boot>
#> 5 Gene5 <tibble[,2] [12 × 2]> <boot>
head(Exp.mat.new$boot.fun)
#> [[1]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -6.729777e-16 -1.237088 2.189771
#>
#> [[2]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 5.264653e-16 0.4360011 0.4969442
#>
Created on 2021-04-08 by the reprex package (v2.0.0)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.