简体   繁体   中英

Adding significance bars within and between groups in dodged ggplot2 boxplots

I have some data that I would like to 1) plot as grouped boxplots, and 2) add significance bars A) between boxplots within each group and B) between specific boxplots of different groups. My data looks something like this:

library("ggplot2")

df <- data.frame(enzyme = c(rep("A", 9), rep("B", 9), rep("C", 9)),
                 substrate = c(rep("1", 3), rep("2", 3), rep("3", 3),
                               rep("1", 3), rep("4", 3), rep("5", 3),
                               rep("1", 3), rep("4", 3), rep("5", 3)),
                 AUC = c(6.64, 6.56, 6.21, 5.96, 6.12, 6.24, 6.02, 6.32, 6.12,
                        0, 0, 0, 5.99, 6.26, 5.94, 0, 0, 0,
                        0, 0, 0, 5.99, 6.11, 6.13, 0, 0, 0))

q <- ggplot(df, aes(x = enzyme, y = AUC, color = substrate)) +
  geom_boxplot(show.legend = F,
               position = position_dodge2(width = 0.75, preserve = "single")) +       
  geom_point(show.legend = F, size = 2, position = position_dodge2(width = 0.75, preserve = "single"))

plot(q)

I know that I can add significance bars between groups with the following:

q + geom_signif(comparisons = list(c("A", "B"), c("A", "C"), c("B", "C")),
                test = "t.test", map_signif_level = T)

However, these comparisons are not meaningful for my data.

Instead, I would like to A) add significance bars between boxplots of the same group. I thought I could follow the suggestion of Simon, who suggested that I manually add bars by defining p-values, labels, and y coordinates for the bars ( How to add significance bar between subgroups of box plot ), though for my dataset this will be more difficult because I have three subgroups per group rather than two.

Ultimately, I would also like to B) add significance bars comparing two specific subgroups from different groups.

My question is, is there any easy way to do this using existing functions/packages? If I have to do this manually, can anyone suggest a good strategy? I would appreciate it!

I thought about this for a bit and figured out a lengthy solution. If anyone has a more succinct way of doing this, please let me know!

## significance bars within and between subgroups

# rearrange df, one unique sample per column, rows are replicates
df.split <- do.call(cbind, sapply(split(df, df$enzyme), function(x) {
  sapply(split(x, x$substrate), function(x) {x$AUC}) }) )
# keep track of sample names
sample.names <- do.call(c, lapply(split(df, df$enzyme), function(x) {
  unique(paste0(x$enzyme, ".", x$substrate)) }) )
colnames(df.split) <- sample.names
# perform statistical tests between every pairwise combination of 
# samples/columns in df.split
df.tests <- apply(combn(seq_along(sample.names), 2), 2, 
                       function(x) {
                         t.test(df.split[ ,x[1]], df.split[ ,x[2]])$p.value })
# keep track of sample pairs
sample.pairs <- apply(combn(seq_along(sample.names), 2), 2, 
                      function(x) {
                        paste0(colnames(df.split)[x[1]], "X", 
                               colnames(df.split)[x[2]]) })
names(df.tests) <- sample.pairs

# think about how the significance bars will be laid out: because there are
# three subgroups per enzyme, the bars for the three pairwise comparisons on
# the same plot would overlap. This needs to be done in layers

# select tests of interest for each layer
within.tests.1 <- c("A.1XA.2", "A.2XA.3", 
                    "B.1XB.4", "B.4XB.5", 
                    "C.1XC.4", "C.4XC.5")
within.tests.2 <- c("A.1XA.3", "B.1XB.5","C.1XC.5")
between.tests.1 <- c("A.1XB.4", "B.4XC.4")
between.tests.2 <- c("A.1XC.4")
  
p.values.1 <- df.tests[which(names(df.tests) %in% within.tests.1)]
p.values.2 <- df.tests[which(names(df.tests) %in% within.tests.2)]
p.values.3 <- df.tests[which(names(df.tests) %in% between.tests.1)]
p.values.4 <- df.tests[which(names(df.tests) %in% between.tests.2)]

# convert p-values into easily read labels, with NaN values omitted
p.values.1 <- replace(p.values.1, is.na(p.values.1), 1)
p.values.2 <- replace(p.values.2, is.na(p.values.2), 1)
p.values.3 <- replace(p.values.3, is.na(p.values.3), 1)
p.values.4 <- replace(p.values.4, is.na(p.values.4), 1)
labels.1 <- symnum(p.values.1, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.2 <- symnum(p.values.2, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.3 <- symnum(p.values.3, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))
labels.4 <- symnum(p.values.4, corr = FALSE, cutpoints = c(0,  .001,.01,.05, 1),
                   symbols = c("***","**","*",""))

# determine coordinates for significance bars 

# y values for layer 1 should all be just above the highest data point of all
# samples being compared
y.values.1 <- do.call(max, lapply(unlist(strsplit(names(labels.1), "X")), 
                                  function(x) {
                                    df.split[, which(colnames(df.split) %in% x)] }) ) + 0.3 %>% 
  rep(times = length(labels.1))
# y values for layer 2 should be higher than those of layer 1
y.values.2 <- y.values.1[c(1, 3, 5)] + 0.4
# y values for layer 3 should all be above the highest data point of all
# samples being compared, and higher than layer 2
y.values.3 <- do.call(max, lapply(unlist(strsplit(names(labels.3), "X")), 
                                  function(x) {
  df.split[, which(colnames(df.split) %in% x)] }) ) + 1.2 %>% 
  rep(times = length(labels.3))
# y values for layer 4 should be higher than those of layer 3
y.values.4 <- y.values.3[1] + 0.5

# for x values, first boxplot is always at x = 1
# since there are three groups per x = 1 and preserve = "single", the width of
# each subgroup boxplot is 0.25
x.min.values.1 <- c(0.75, 1, 1.75, 2, 2.75, 3)
x.max.values.1 <- x.min.values.1 + 0.25
x.min.values.2 <- c(0.75, 1.75, 2.75)
x.max.values.2 <- x.min.values.2 + 0.50
x.min.values.3 <- c(0.75, 2)
x.max.values.3 <- c(2, 3)
x.min.values.4 <- c(0.75)
x.max.values.4 <- c(3)


# finally, plot the significance bars for each layer, one on top of the other
q + geom_signif(y_position = y.values.1, 
                xmin = x.min.values.1, 
                xmax = x.max.values.1, 
                annotations = labels.1,
                tip_length = rep(0.02, length(labels.1)),
                vjust = 0.5 ) +
  geom_signif(y_position = y.values.2, 
              xmin = x.min.values.2, 
              xmax = x.max.values.2, 
              annotations = labels.2,
              tip_length = rep(0.04, length(labels.2)),
              vjust = 0.5 ) +
  geom_signif(y_position = y.values.3, 
              xmin = x.min.values.3, 
              xmax = x.max.values.3, 
              annotations = labels.3,
              tip_length = rep(0.04, length(labels.3)),
              vjust = 0.5 ) +
  geom_signif(y_position = y.values.4, 
              xmin = x.min.values.4, 
              xmax = x.max.values.4, 
              annotations = labels.4,
              tip_length = rep(0.06, length(labels.4)),
              vjust = 0.5 )

The output looks like this:

Barplot_with_significance_bars_within_and_between_groups

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM