简体   繁体   中英

Overlaying different vlines in R with ggplot facet_wrap

I am trying to produce a set of density plots showing the difference in expression level distributions for two sets of genes in four cell types. In addition to the density plots, I would like to have the median expression level for both groups overlaid onto each plot. Based on answers to a few similar questions, I've been able to get correct plots OR correct medians but not both at the same time. I'm out of ideas and hoping someone can set me right. Thanks!

Sample data is available here: https://github.com/adadiehl/sample_data/blob/master/sample.data

First Attempt. Produces correct plots, but same medians are plotted on all four:

dat = read.table("sample.data")

g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "Other") ]) ), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "a_MCKG") ]) ), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")

Second Attempt: Correct plots but places all medians on all plots:

dat = read.table("sample.data")
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))

g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")

Third Attempt: Plots are all the same but have correct medians.

dat = read.table("sample.data")
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))

g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~z, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")

Passing pre-summarized data is the way to go:

library(plyr)

names(dat) <- c("FPKM", "class", "source")
dat2 <- subset(dat, FPKM > 0)

ggplot(dat2, aes(x = FPKM)) + 
  geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2) +
  geom_vline(data = ddply(dat2, .(source, class), summarize, mmed = median(FPKM)),
             aes(xintercept = mmed, color = class)) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  ggtitle("Distribution of FPKM, MCKG vs. Other") +
  xlab("FPKM > 0")

Alternatively, you can achieve the same with base R:

dat3 <- aggregate(FPKM ~ source + class, data = dat2, FUN = median)

ggplot(dat2, aes(x = FPKM)) + 
  geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2) +
  geom_vline(data = dat3,
             aes(xintercept = FPKM, color = class)) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  ggtitle("Distribution of FPKM, MCKG vs. Other") +
  xlab("FPKM > 0")

NB You may want to avoid column names such as source and class as these conflict with built-in functions.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM