I'm new to R and tidyverse and need to compute quantiles of data which is nested. For example, consider the following table:
> tbl=
subgroup boot
<chr> <list>
1 aaa <tibble [30 × 23]>
2 bbb <tibble [30 × 23]>
3 ccc <tibble [30 × 23]>
where boot
contains another tibble with 30 bootrapted replicates and 23 columns (various variables). For example:
> tbl$boot
[[1]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 187 0.967 0.903 3 4 0.923 0.765 1 1 0.870 0.556 0.845
2 270 0.946 0.729 5 5 0.872 0.643 1 0.625 0.783 0.667 0.747
3 195 0.926 0.886 11 2 0.718 0.833 1 1 0.56 0.6 0.477
4 187 0.881 0.893 9 3 0.769 0.8 1 1 0.625 0.625 0.562
5 195 0.963 0.933 7 2 0.821 0.875 1 1 0.682 0.667 0.651
6 203 0.926 0.944 7 2 0.821 0.882 1 1 0.65 0.778 0.644
7 195 0.944 0.931 7 2 0.821 0.882 1 1 0.611 0.778 0.629
8 153 0.908 1 4 4 0.897 0.667 1 1 0.789 0.5 0.794
9 203 0.962 0.922 8 2 0.795 0.875 1 1 0.652 0.75 0.606
10 195 0.883 0.94 11 2 0.718 0.9 1 1 0.542 0.8 0.476
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
[[2]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 72 0.842 0.81 11 6 0.788 0.7 1 1 0.577 0.4 0.577
2 72 0.735 0.95 10 5 0.808 0.75 1 1 0.545 0.5 0.581
3 80 0.787 0.907 11 5 0.788 0.667 1 0.833 0.522 0.556 0.549
4 72 0.856 0.833 9 6 0.827 0.647 1 1 0.64 0.333 0.649
5 72 0.88 0.778 11 5 0.788 0.706 1 1 0.593 0.375 0.583
6 72 0.666 0.959 16 4 0.692 0.818 1 1 0.304 0.636 0.328
7 43 0.708 0.941 19 7 0.635 0.731 1 1 0.24 0.462 0.247
8 68 0.866 0.85 12 6 0.769 0.7 1 1 0.5 0.4 0.519
9 80 0.801 0.872 16 5 0.692 0.773 1 0.923 0.407 0.556 0.398
10 80 0.877 0.809 8 8 0.846 0.619 1 0.909 0.652 0.3 0.677
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
[[3]]
# A tibble: 30 x 23
optimal_cutpoint AUC_b AUC_oob misclassification_c… misclassification_… acc_b acc_oob sensitivity_b sensitivity_oob specificity_b specificity_oob kappa_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 187 0.892 0.95 8 2 0.778 0.846 1 1 0.529 0.75 0.543
2 144. 0.928 0.929 5 2 0.861 0.818 1 1 0.688 0.5 0.710
3 142. 0.926 0.889 6 3 0.833 0.75 1 1 0.667 0.5 0.667
4 187 0.931 0.929 5 1 0.861 0.889 1 1 0.688 0.857 0.710
5 187 0.916 0.852 3 4 0.917 0.733 1 1 0.812 0.333 0.828
6 142. 0.937 0.875 5 3 0.861 0.786 1 1 0.667 0.625 0.7
7 187 0.963 0.857 6 3 0.833 0.75 1 1 0.667 0.571 0.667
8 142. 0.950 0.917 6 2 0.833 0.8 1 1 0.647 0.667 0.659
9 187 0.950 0.971 3 3 0.917 0.75 1 1 0.842 0.4 0.834
10 150. 0.938 0.952 5 3 0.861 0.769 1 1 0.688 0.5 0.710
# ... with 20 more rows, and 11 more variables: kappa_oob <dbl>, TP_b <dbl>, FP_b <dbl>, TN_b <int>, FN_b <int>, TP_oob <dbl>, FP_oob <dbl>, TN_oob <int>,
# FN_oob <int>, roc_curve_b <list>, roc_curve_oob <list>
So, from the boot
tibble I need to extract only optimal_cutpoint
column and compute quantiles (2.5% and 97.5%) for each of 'aaa', 'bbb', 'ccc':
> qnt.aaa <- quantile(tbl$boot[[1]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.bbb <- quantile(tbl$boot[[2]]$optimal_cutpoint, c(0.025, 0.975))
> qnt.ccc <- quantile(tbl$boot[[3]]$optimal_cutpoint, c(0.025, 0.975))
So ideally I would like to have the following table:
> tbl.new=
subgroup ci.low ci.upp
<chr> <dbl> <dbl>
1 aaa qnt.aaa[1] qnt.aaa[2]
2 bbb qnt.bbb[1] qnt.bbb[2]
3 ccc qnt.ccc[1] qnt.ccc[2]
(of course the numerical values instead of qnt.
)
I guess I can implement this in a very awkward way, but I would like to learn how to use the tidyverse approach and make it neat.
You can use rowwise
to process each row individually, mutate
to add the new columns and finally ungroup
to combine the rows again for further calulations:
library(tidyverse)
tbl.new <-
tbl %>%
rowwise() %>%
mutate(ci.low = quantile(boot$optimal_cutpoint, 0.025),
ci.up = quantile(boot$optimal_cutpoint, 0.975)) %>%
ungroup()
Note that the code is untested, since your example data is not directly usable in R (Maybe try to dput
a small portion next time :))
We could use map
after by nest
ing the 'subgroup'
library(tidyverse)
tbl %>%
group_by(subgroup) %>%
nest %>%
mutate(cls = map(data, ~
quantile(.x$boot[[1]]$optimal_cutpoint, c(0.025, 0.975)) %>%
as.list %>%
as_tibble %>%
rename_all(~ c("ci.low", "ci.upp")))) %>%
select(-data) %>%
unnest
tbl <- structure(list(subgroup = c("aaa", "bbb", "ccc"), boot = list(
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")),
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")),
structure(list(optimal_cutpoint = c(187L, 270L, 195L, 187L,
195L, 203L, 195L, 153L, 203L, 195L), AUC_b = c(0.967, 0.946,
0.926, 0.881, 0.963, 0.926, 0.944, 0.908, 0.962, 0.883),
AUC_oob = c(0.903, 0.729, 0.886, 0.893, 0.933, 0.944,
0.931, 1, 0.922, 0.94), misclassification_c. = c(3L,
5L, 11L, 9L, 7L, 7L, 7L, 4L, 8L, 11L), misclassification_. = c(4L,
5L, 2L, 3L, 2L, 2L, 2L, 4L, 2L, 2L), acc_b = c(0.923,
0.872, 0.718, 0.769, 0.821, 0.821, 0.821, 0.897, 0.795,
0.718), acc_oob = c(0.765, 0.643, 0.833, 0.8, 0.875,
0.882, 0.882, 0.667, 0.875, 0.9), sensitivity_b = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), sensitivity_oob = c(1,
0.625, 1, 1, 1, 1, 1, 1, 1, 1), specificity_b = c(0.87,
0.783, 0.56, 0.625, 0.682, 0.65, 0.611, 0.789, 0.652,
0.542), specificity_oob = c(0.556, 0.667, 0.6, 0.625,
0.667, 0.778, 0.778, 0.5, 0.75, 0.8), kappa_b = c(0.845,
0.747, 0.477, 0.562, 0.651, 0.644, 0.629, 0.794, 0.606,
0.476)), .Names = c("optimal_cutpoint", "AUC_b", "AUC_oob",
"misclassification_c.", "misclassification_.", "acc_b", "acc_oob",
"sensitivity_b", "sensitivity_oob", "specificity_b", "specificity_oob",
"kappa_b"), row.names = c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class = c("tbl_df", "tbl", "data.frame")))), .Names = c("subgroup",
"boot"), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.