簡體   English   中英

具有層次聚類的堆積條形圖(樹狀圖)

[英]Stacked bar plot with hierarchical clustering (dendrogram)

我試圖得到這樣的東西,但不幸的是,我找不到任何可以讓我用樹狀圖繪制堆積條形圖的包,如下所示:

樹狀圖

有誰知道怎么做?

第一次嘗試回答 - 但需要更多的工作才能讓它真正起作用。 具體來說,需要更仔細地考慮元素位置的對齊(以及它們的順序)。

# library
library(ggplot2)

# create a dataset
specie=c(rep("sorgho" , 3) , rep("poacee" , 3) , rep("banana" , 3) , rep("triticum" , 3) )
condition=rep(c("normal" , "stress" , "Nitrogen") , 4)
value=abs(rnorm(12 , 0 , 15))
data=data.frame(specie,condition,value)


dend <- as.dendrogram(hclust(dist(with(data, tapply(value, specie, mean)))))

data$specie <- factor(data$specie, levels = labels(dend))

# Stacked Percent
library(dendextend)
p1 <- ggplot(dend, horiz = T) 
p2 <- ggplot(data, aes(fill=condition, y=value, x=specie)) + 
    geom_bar( stat="identity", position="fill") + coord_flip()

library(cowplot)
plot_grid(p1, p2, align = "h")

在此處輸入圖片說明

差不多三年后,仍然沒有能夠將堆疊條形圖與 ggplot 中的層次聚類相結合的軟件包(至少我知道)。 這里我的解決方案基於加入樹狀圖和熱圖的帖子:

library(tidyverse)
library(phangorn)
library(vegan)
library(ggdendro)
library(dendextend)
library(ggsci)
library(cowplot)

## generate example data ####
set.seed(500)
combined_matrix <- data.frame(a=runif(14, 0, 33), b=runif(14, 0, 33), c=runif(14, 0, 33))
combined_matrix$d <- 100 - combined_matrix$a - combined_matrix$b - combined_matrix$c
row.names(combined_matrix) <- paste0("s", seq(1,14))

# vegan::vegdist() to calculate Bray-Curtis distance matrix
dm <- vegdist(combined_matrix, method = "bray")
# calculate UPGMA tree with phangorn::upgma() and convert to dendrogram
dendUPGMA <- as.dendrogram(upgma(dm))
plot_dendro_bars_v <- function(df, dend, taxonomy) {
  #convert dendrogram to segment data
  dend_data <- dendro_data(dend, type="rectangle")
  segment_data <- dend_data[["segments"]]
  #sample positions df
  sample_pos_table <- with(dend_data$labels, 
                           data.frame(x_center = x, sample = as.character(label), width = 0.9))
  #prepare input data
  ptdf <- rownames_to_column(df, var = "sample") %>%
    pivot_longer(-sample, names_to = taxonomy, values_to = "Frequency") %>%
    group_by(sample) %>%
    mutate(Frequency = Frequency/100,
           ymax = cumsum(Frequency/sum(Frequency)),
           ymin = ymax - Frequency/sum(Frequency),
           y_center = ymax-(Frequency/2)) %>%
    left_join(sample_pos_table) %>%
    mutate(xmin = x_center-width/2,
           xmax = x_center+width/2)
  #plot stacked bars
  axis_limits <- with(sample_pos_table, 
                      c(min(x_center - 0.5 * width), max(x_center + 0.5 * width))) + 
    0.1 * c(-1, 1) # extra spacing: 0.1
  plt_hbars <- ggplot(ptdf, 
                      aes_string(x = "x_center", y = "y_center", fill = taxonomy, xmin = "xmin", xmax = "xmax",
                                 height = "Frequency", width = "width")) + 
    geom_tile() +
    geom_rect(ymin = 0, ymax = 1, color = "black", fill = "transparent") +
    scale_fill_rickandmorty() +
    scale_y_continuous(expand = c(0, 0)) + 
    # For the y axis, alternatively set the labels as: gene_position_table$gene
    scale_x_continuous(breaks = sample_pos_table[, "x_center"], 
                       labels = sample_pos_table$sample,
                       limits = axis_limits, 
                       expand = c(0, 0)) + 
    labs(x = "", y = "Frequency") +
    theme_bw() +
    theme(# margin: top, right, bottom, and left
      plot.margin = unit(c(-0.9, 0.2, 1, 0.2), "cm"), 
      panel.grid.minor = element_blank())
  #plot dendrogram
  plt_dendr <- ggplot(segment_data) + 
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + 
    scale_y_continuous(expand = c(0, 0.05)) + 
    scale_x_continuous(breaks = sample_pos_table$x_center, 
                       labels = rep("", nrow(sample_pos_table)), 
                       limits = axis_limits, 
                       expand = c(0, 0)) + 
    labs(x = "", y = "Distance", colour = "", size = "") +
    theme_bw() + 
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank())
  #combine plots
  comb <- plot_grid(plt_dendr, plt_hbars, align = 'v', ncol = 1, axis = "lr", rel_heights = c(0.3, 1))
  comb
}
plot_dendro_bars_v(df = combined_matrix, dend = dendUPGMA, taxonomy = "example")

垂直的

或水平

  plot_dendro_bars_h <- function(df, dend, taxonomy) {
  #convert dendrogram to segemnt data
  dend_data <- dendro_data(dend, type="rectangle")
  segment_data <- with(segment(dend_data), 
                       data.frame(x = y, y = x, xend = yend, yend = xend))
  #sample positions df
  sample_pos_table <- with(dend_data$labels, 
                           data.frame(y_center = x, sample = as.character(label), height = 0.9))
  #prepare input data
  ptdf <- rownames_to_column(df, var = "sample") %>%
    pivot_longer(-sample, names_to = taxonomy, values_to = "Frequency") %>%
    group_by(sample) %>%
    mutate(Frequency = Frequency/100,
           xmax = cumsum(Frequency/sum(Frequency)),
           xmin = xmax - Frequency/sum(Frequency),
           x_center = xmax-(Frequency/2)) %>%
    left_join(sample_pos_table) %>%
    mutate(ymin = y_center-height/2,
           ymax = y_center+height/2)
  #plot stacked bars
  axis_limits <- with(sample_pos_table, 
                      c(min(y_center - 0.5 * height), max(y_center + 0.5 * height))) + 
    0.1 * c(-1, 1) # extra spacing: 0.1
  plt_hbars <- ggplot(ptdf, 
                      aes_string(x = "x_center", y = "y_center", fill = taxonomy, ymin = "ymin", ymax = "ymax",
                                 height = "height", width = "Frequency")) + 
    geom_tile() +
    geom_rect(xmin = 0, xmax = 1, color = "black", fill = "transparent") +
    scale_fill_rickandmorty() +
    scale_x_continuous(expand = c(0, 0)) + 
    # For the y axis, alternatively set the labels as: gene_position_table$gene
    scale_y_continuous(breaks = sample_pos_table[, "y_center"], 
                       labels = rep("", nrow(sample_pos_table)),
                       limits = axis_limits, 
                       expand = c(0, 0)) + 
    labs(x = "Frequency", y = "") +
    theme_bw() +
    theme(# margin: top, right, bottom, and left
      plot.margin = unit(c(1, 0.2, 0.2, -0.9), "cm"), 
      panel.grid.minor = element_blank())
  #plot dendrogram
  plt_dendr <- ggplot(segment_data) + 
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + 
    scale_x_reverse(expand = c(0, 0.05)) + 
    scale_y_continuous(breaks = sample_pos_table$y_center, 
                       labels = sample_pos_table$sample, 
                       limits = axis_limits, 
                       expand = c(0, 0)) + 
    labs(x = "Distance", y = "", colour = "", size = "") +
    theme_bw() + 
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank())
  #combine plots
  comb <- plot_grid(plt_dendr, plt_hbars, align = 'h', rel_widths = c(0.3, 1))
  return(comb)
}
plot_dendro_bars_h(df = combined_matrix, dend = dendUPGMA, taxonomy = "example")

水平的

數據可以與任何可以強制轉換為樹狀圖的樹(例如還有 UniFrac 樹)組合。 玩得開心,羅曼。

這是我的 Roman_G 腳本版本。 它在條形圖中顯示百分比,並使用vegan::reorder.hclust對樹狀圖的分​​支重新排序,以便第一列具有最高值的行傾向於放置在頂部,而具有最高值的行最后一列傾向於放在底部。 我還刪除了額外的邊距、刻度和軸。

library(tidyverse)
library(ggdendro)
library(vegan)
library(colorspace)
library(cowplot)

t=read.table(text="Spain_EN 0.028152 0.971828 0.000010 0.000010
Norway_Mesolithic 0.784705 0.083387 0.000010 0.131898
Russia_Sunghir4 0.000010 0.000010 0.999970 0.000010
Iran_Wezmeh_N 0.000010 0.492331 0.383227 0.124433
Russia_DevilsCave_N 0.000010 0.000010 0.000010 0.999970
Italy_North_Villabruna_HG 0.999970 0.000010 0.000010 0.000010
Russia_HG_Karelia 0.527887 0.133179 0.072342 0.266593
Russia_Yana_UP 0.000010 0.000014 0.999966 0.000010
Georgia_Kotias 0.000010 0.537322 0.381313 0.081355
China_SEastAsia_Island_EN 0.000010 0.000010 0.148652 0.851328
Turkey_N 0.000010 0.999970 0.000010 0.000010
USA_Ancient_Beringian 0.008591 0.000010 0.095008 0.896391
Russia_Sidelkino_HG 0.624076 0.045350 0.105615 0.224958
Russia_Kolyma_M 0.020197 0.000010 0.000010 0.979783
China_Tianyuan 0.000010 0.000010 0.423731 0.576249",row.names=1)

hc=hclust(dist(t),method="ward.D2")
hc=reorder(hc,wts=-as.matrix(t)%*%seq(ncol(t))^2) # vegan::reorder.hclust
tree=ggdendro::dendro_data(as.dendrogram(hc),type="rectangle")

p1=ggplot(ggdendro::segment(tree))+
geom_segment(aes(x=y,y=x,xend=yend,yend=xend),lineend="round",size=.4)+
scale_x_continuous(expand=expansion(add=c(0,.01)))+ # don't crop half of line between top-level nodes
scale_y_continuous(limits=.5+c(0,nrow(t)),expand=c(0,0))+
theme(
  axis.text=element_blank(),
  axis.ticks=element_blank(),
  axis.ticks.length=unit(0,"pt"), # remove extra space occupied by ticks
  axis.title=element_blank(),
  panel.background=element_rect(fill="white"),
  panel.grid=element_blank(),
  plot.margin=margin(5,5,5,0)
)

t=t[hc$labels[hc$order],]
t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t)))
lab=round(100*t2$V3)
lab[lab==0]=""

p2=ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+
geom_bar(stat="identity",width=1,position=position_fill(reverse=T))+
geom_text(aes(label=lab),position=position_stack(vjust=.5,reverse=T),size=3.5)+
coord_flip()+
scale_x_discrete(expand=c(0,0))+
scale_y_discrete(expand=c(0,0))+
scale_fill_manual(values=colorspace::hex(HSV(head(seq(0,360,length.out=ncol(t)+1),-1),.5,1)))+
theme(
  axis.text=element_text(color="black",size=11),
  axis.text.x=element_blank(),
  axis.ticks=element_blank(),
  axis.title=element_blank(),
  legend.position="none",
  plot.margin=margin(5,0,5,5)
)

cowplot::plot_grid(p2,p1,rel_widths=c(1,.4))
ggsave("a.png",height=.25*nrow(t),width=7)

還有來自ggh4x scale_x_dendrogramscale_y_dendrogram ,它們使用ggdendro::dendro_datahttps://teunbrand.github.io/ggh4x/articles/PositionGuides.html#dendrograms 但是我無法讓它們使用coord_flip處理水平堆疊條。

library(ggh4x)

t=head(USArrests,20)
t2=data.frame(V1=rownames(t)[row(t)],V2=colnames(t)[col(t)],V3=unname(do.call(c,t)))
hc=hclust(dist(t))

ggplot(t2,aes(x=factor(V1,level=rownames(t)),y=V3,fill=V2))+
geom_bar(stat="identity",width=1,position=position_stack(reverse=F))+
geom_text(aes(label=round(V3)),position=position_stack(vjust=.5,reverse=F),size=3)+
scale_x_dendrogram(hclust=hc)+
scale_y_discrete(expand=c(0,0))+
# scale_fill_manual(values=colorspace::hex(HSV(head(seq(0,360,length.out=ncol(t)+1),-1),.5,1)))+
theme(
  axis.text=element_text(color="black",size=11),
  axis.text.x=element_text(angle=90,hjust=1,vjust=.5),
  axis.text.y=element_blank(),
  axis.ticks=element_blank(),
  axis.ticks.length=unit(14,"pt"), # height of dendrogram
  axis.title=element_blank(),
  legend.justification=c(0,1),
  legend.key=element_rect(fill=NA), # remove gray border around color squares
  legend.margin=margin(-6,0,0,0),
  legend.position=c(0,1),
  legend.title=element_blank(),
  panel.background=element_rect(fill="white"),
  plot.margin=margin(5,0,5,5)
)

ggsave("a.png",height=6,width=6)

編輯:第三個選項是使用circlizehttps : circlize

library(circlize)
library(vegan) # for reorder.hclust (may be masked by `seriation`)
library(dendextend) # for color_branches

t=read.table(text="Kalmyk 0.119357 0.725057 0.000010 0.037803 0.117774
Kyrgyz_China 0.039367 0.512079 0.230150 0.095038 0.123366
Altaian_Chelkan 0.034095 0.000010 0.919478 0.000010 0.046407
Azeri 0.051638 0.004671 0.010727 0.902646 0.030318
Uzbek 0.102725 0.273261 0.001854 0.452126 0.170033
Salar 0.000010 0.539636 0.460334 0.000010 0.000010
Tatar_Kazan 0.113456 0.057026 0.000010 0.099336 0.730171
Tatar_Siberian 0.251376 0.221389 0.000010 0.077505 0.449721
Finnish 0.007214 0.000010 0.000010 0.015174 0.977592
Yakut 0.505434 0.473202 0.000010 0.002914 0.018440
Mansi 0.572791 0.000010 0.000010 0.000010 0.427179
Altaian 0.222424 0.335614 0.358801 0.032694 0.050468
Shor_Mountain 0.233984 0.000010 0.724596 0.000010 0.041400
Chuvash 0.180171 0.011056 0.000010 0.006462 0.802301
Enets 0.920409 0.000010 0.000010 0.000010 0.079561
Yukagir_Tundra 0.710359 0.289611 0.000010 0.000011 0.000010
Kyrgyz_Tajikistan 0.104000 0.563708 0.000010 0.125799 0.206483
Khakass_Kachin 0.254253 0.416760 0.174200 0.005262 0.149525
Tuvinian 0.448940 0.448899 0.000010 0.031803 0.070348
Besermyan 0.209841 0.001487 0.000010 0.000460 0.788202
Nogai_Astrakhan 0.062497 0.463590 0.000010 0.183203 0.290701
Todzin 0.725173 0.257670 0.000010 0.005836 0.011312
Kazakh 0.067027 0.518213 0.087979 0.114550 0.212231
Tofalar 0.815599 0.110299 0.000010 0.009693 0.064398
Karakalpak 0.009983 0.316964 0.389103 0.158275 0.125676
Estonian 0.000010 0.000010 0.000010 0.004409 0.995561
Dolgan 0.694025 0.255361 0.000010 0.049624 0.000979
Tatar_Siberian_Zabolotniye 0.521637 0.020132 0.000010 0.000010 0.458212
Uyghur 0.043578 0.486742 0.000010 0.318983 0.150687
Udmurt 0.256391 0.000010 0.001010 0.000010 0.742579
Evenk_FarEast 0.241328 0.606202 0.000010 0.000010 0.152451
Selkup 0.804662 0.000010 0.000010 0.000010 0.195308
Kumyk 0.060751 0.000112 0.000010 0.823905 0.115222
Hungarian 0.000010 0.000010 0.000010 0.244311 0.755659
Tubalar 0.159517 0.009457 0.802778 0.000010 0.028238
Turkmen 0.123631 0.226543 0.000010 0.529793 0.120023
Karelian 0.012854 0.000010 0.000010 0.000010 0.987116
Kazakh_China 0.074285 0.573009 0.152931 0.069362 0.130412
Mongol 0.033174 0.847004 0.025135 0.005178 0.089509
Daur 0.000010 0.995215 0.000010 0.000010 0.004755
Evenk_Transbaikal 0.611414 0.388556 0.000010 0.000010 0.000010
Nogai_Karachay_Cherkessia 0.119988 0.120774 0.000010 0.617261 0.141967
Veps 0.026887 0.000010 0.000010 0.000010 0.973083
Even 0.441349 0.278457 0.000010 0.015239 0.264946
Nganasan 0.999960 0.000010 0.000010 0.000010 0.000010
Bashkir 0.114088 0.056493 0.251488 0.030390 0.547542
Xibo 0.000010 0.985541 0.000010 0.000362 0.014077
Khakass 0.202707 0.171413 0.530905 0.007675 0.087300
Shor_Khakassia 0.258218 0.000010 0.694889 0.000010 0.046873
Nanai 0.105903 0.894067 0.000010 0.000010 0.000010
Buryat 0.064420 0.848458 0.017066 0.001696 0.068360
Yukagir_Forest 0.379416 0.096266 0.000010 0.003580 0.520728
Karachai 0.067138 0.004534 0.000010 0.798982 0.129336
Mordovian 0.022303 0.001193 0.000010 0.025251 0.951243
Turkish_Balikesir 0.092314 0.038550 0.000010 0.804964 0.064163
Turkish 0.040918 0.012255 0.000010 0.873179 0.073639
Kyrgyz_Kyrgyzstan 0.090129 0.607265 0.000010 0.122885 0.179711
Balkar 0.075115 0.000010 0.000010 0.829730 0.095136
Gagauz 0.000010 0.027887 0.015891 0.601619 0.354593
Nogai_Stavropol 0.070584 0.403817 0.000010 0.244701 0.280888
Negidal 0.248518 0.751452 0.000010 0.000010 0.000010
Tatar_Mishar 0.066112 0.037441 0.010377 0.138008 0.748062",row.names=1)

hc=hclust(dist(t))
hc=reorder(hc,-(t[,1]+t[,2]-t[,4]-2*t[,5]))

labelcolor=hcl(c(260,90,120,60,0,210,180,310)+15,60,70)
barcolor=hcl(c(310,260,120,60,210)+15,60,70)

labels=hc$labels[hc$order]
cut=cutree(hc,8)
dend=color_branches(as.dendrogram(hc),k=length(unique(cut)),col=labelcolor[unique(cut[labels])])
t=t[hc$labels[hc$order],]

circos.clear()
png("a.png",w=2500,h=2500,res=300)
circos.par(cell.padding=c(0,0,0,0),gap.degree=5,points.overflow.warning=F)
circos.initialize("a",xlim=c(0,nrow(t)))

circos.track(ylim=c(0,1),bg.border=NA,track.height=.28,track.margin=c(.01,0),
  panel.fun=function(x,y)for(i in 1:nrow(t))circos.text(i-.5,0,labels[i],adj=c(0,.5),facing="clockwise",niceFacing=T,cex=.65,col=labelcolor[cut[labels[i]]]))

circos.track(ylim=c(0,1),track.margin=c(0,0),track.height=.35,bg.lty=0,panel.fun=function(x,y){
  mat=as.matrix(t)
  pos=1:nrow(mat)-.5
  barwidth=1
  for(i in 1:ncol(mat)){
    seq1=rowSums(mat[,seq(i-1),drop=F])
    seq2=rowSums(mat[,seq(i),drop=F])
    circos.rect(pos-barwidth/2,if(i==1){0}else{seq1},pos+barwidth/2,seq2,col=barcolor[i],border="gray20",lwd=.1)
  }
  for(i in 1:ncol(mat)){
    seq1=rowSums(mat[,seq(i-1),drop=F])
    seq2=rowSums(mat[,seq(i),drop=F])
    lab=round(100*t[,i])
    lab[lab<=1]=""
    circos.text(pos,if(i==1){seq1/2}else{seq1+(seq2-seq1)/2},labels=lab,col="gray10",cex=.4,facing="downward")
  }
})

circos.track(ylim=c(0,attr(dend,"height")),bg.border=NA,track.margin=c(0,.0015),track.height=.35,panel.fun=function(x,y)circos.dendrogram(dend))

circos.clear()
dev.off()

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM