簡體   English   中英

基於分組 dataframe 使用 R 中的 dplyr 創建具有多個匯總列的 dataframe 的有效方法

[英]Efficient way to create a dataframe with multiple summary columns based on a grouped dataframe using dplyr in R

我有一個類似於這個假人的 dataframe:

# Dummy input data (dput output): a 25-row data.frame with a character `id`,
# a factor `gene` (levels "ABC_1", "C_1", "XYZ_123"), a factor `group`
# (levels "KO", "WT") and two numeric columns, `class_A` and `class_B`.
dframe <- structure(list(id = c("294361-7349174-75411122", "294365-7645230-95464222", 
"291915-7345264-75464222", "291365-7345074-75164202", "594165-7345274-78444212", 
"234385-7335274-75464229", "734515-1345274-95464892", "201365-8345274-78464232", 
"294365-7315971-75464120", "591365-7345374-75464222", "394365-7345204-75411022", 
"494305-7345273-75464222", "291161-7345271-75461210", "294035-7345201-75464292", 
"298365-7345279-78864223", "294365-7345274-15964293", "294395-7345274-69464299", 
"899965-1345294-95464222", "194365-7145274-75464222", "194361-7349231-75464222", 
"294365-7345274-75464122", "191315-1345274-13464322", "794365-7349274-75464292", 
"214365-8318274-75464222", "394363-8341274-39494929"), gene = structure(c(3L, 
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ABC_1", "C_1", "XYZ_123"
), class = "factor"), group = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), .Label = c("KO", "WT"), class = "factor"), class_A = c(0, 
1, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1, 
1, 0, 3), class_B = c(0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1)), row.names = c(NA, -25L), class = "data.frame")

基於此,我想生成匯總表(數據按“group”和“gene”分組)。 為了便於說明,我將非零值出現的次數稱為“計數”(counts),將非零值的總和稱為“命中”(hits)。 我的英語不是很流利,所以不再贅述——下面是所需的 output:

# Expected summary (dput output): one row per group/gene combination, with
# integer `counts_*` columns and double `hits_*` columns.
desired_dframe <- structure(list(group = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("KO", 
"WT"), class = "factor"), gene = structure(c(1L, 3L, 1L, 2L, 
3L), .Label = c("ABC_1", "C_1", "XYZ_123"), class = "factor"), 
    counts_total = c(8L, 7L, 5L, 1L, 4L), counts_AB = c(6L, 3L, 
    3L, 1L, 3L), hits_AB = c(9, 6, 3, 1, 5), counts_A = c(4L, 
    3L, 2L, 1L, 2L), hits_A = c(6, 5, 2, 1, 3), counts_B = c(3L, 
    1L, 1L, 0L, 2L), hits_B = c(3, 1, 1, 0, 2)), row.names = c(NA, 
-5L), class = "data.frame")

我可以使用以下 function 來制作它:

#' Summarise counts and hits per group and gene
#'
#' "Counts" are the number of rows with a non-zero value; "hits" are the sum
#' of the values. Everything is computed in one grouped `summarise()` call
#' instead of seven separate pipelines that are merged afterwards.
#'
#' @param dframe A data frame with grouping columns `group` and `gene` and
#'   numeric columns `class_A` and `class_B`.
#' @return A plain `data.frame` with one row per observed `group`/`gene`
#'   combination, ordered by `group` then `gene`, and the columns
#'   `counts_total`, `counts_AB`, `hits_AB`, `counts_A`, `hits_A`,
#'   `counts_B`, `hits_B`.
summarize_mygene <- function(dframe) {
  # Every dplyr call is namespace-qualified and no pipe is used, so the
  # function does not depend on dplyr/magrittr being attached by the caller.
  grouped <- dplyr::group_by(dframe, group, gene)

  summary_tbl <- dplyr::summarise(
    grouped,
    # number of ids (rows) in the group
    counts_total = dplyr::n(),
    # rows where the combined class_A + class_B value is non-zero
    counts_AB = sum((class_A + class_B) != 0),
    # total of both class columns
    hits_AB = sum(class_A + class_B),
    # non-zero rows / totals for class_A
    counts_A = sum(class_A != 0),
    hits_A = sum(class_A),
    # non-zero rows / totals for class_B
    counts_B = sum(class_B != 0),
    hits_B = sum(class_B),
    # drop the grouping explicitly; this also silences the regrouping message
    .groups = "drop"
  )

  # The previous merge()-based implementation returned a base data.frame;
  # keep that return type for existing callers.
  as.data.frame(summary_tbl)
}

但是,如您所見,這並不是使用 dplyr 的最優雅示例。 我以逐步的方式編寫了此 function,因為這是我可以避免錯誤的唯一方法。 最初,我嘗試在單個 dplyr::mutate() 調用或單個 dplyr::summarize() 調用中一次處理所有操作。 但是,它引發了多個錯誤,或者產生了每行包含一個 tibble 的列。 經過幾次嘗試后,我放棄了編寫簡潔的代碼,最終寫出了 summarize_mygene() 這個糟糕的東西。 因此,我希望有人能幫助我改進這段噩夢般的代碼。

我們可以在單個 group_by 操作中完成此操作

library(dplyr)
library(stringr)
# One grouped pass: add the per-row total of the class_ columns, then compute
# "counts" (non-zero rows) and "hits" (sums) for that total and each class
# column, and finally flip the generated names from <col>_<stat> to
# <stat>_<suffix> (e.g. class_A_counts -> counts_A).
dframe %>%
  mutate(Sum_AB = rowSums(across(starts_with("class_")))) %>%
  group_by(group, gene) %>%
  summarise(
    counts_total = n(),
    across(
      c(Sum_AB, class_A, class_B),
      list(
        counts = function(v) sum(v != 0),
        hits = function(v) sum(v)
      )
    ),
    .groups = "drop"
  ) %>%
  rename_with(
    function(nm) str_replace(nm, "^\\w+_(\\w+)_(\\w+)", "\\2_\\1"),
    3:last_col()
  )

-輸出

# A tibble: 5 × 9
  group gene    counts_total counts_AB hits_AB counts_A hits_A counts_B hits_B
  <fct> <fct>          <int>     <int>   <dbl>    <int>  <dbl>    <int>  <dbl>
1 KO    ABC_1              8         6       9        4      6        3      3
2 KO    XYZ_123            7         3       6        3      5        1      1
3 WT    ABC_1              5         3       3        2      2        1      1
4 WT    C_1                1         1       1        1      1        0      0
5 WT    XYZ_123            4         3       5        2      3        2      2

-OP 需要

> desired_dframe
  group    gene counts_total counts_AB hits_AB counts_A hits_A counts_B hits_B
1    KO   ABC_1            8         6       9        4      6        3      3
2    KO XYZ_123            7         3       6        3      5        1      1
3    WT   ABC_1            5         3       3        2      2        1      1
4    WT     C_1            1         1       1        1      1        0      0
5    WT XYZ_123            4         3       5        2      3        2      2

也許你正在尋找這個

# Summarise per group/gene in a single call.
# Counts are sums of logical tests, so they come out as integers (the
# expected output uses integer counts); sum(ifelse(cond, 0, 1)) would
# produce doubles instead.
desired_dframe <- dframe %>%
  group_by(group, gene) %>%
  summarise(
    counts_total = n(),
    # rows where at least one of the two class columns is non-zero
    counts_AB = sum(class_A != 0 | class_B != 0),
    hits_AB = sum(class_A, class_B),
    counts_A = sum(class_A != 0),
    hits_A = sum(class_A),
    counts_B = sum(class_B != 0),
    hits_B = sum(class_B),
    # return an ungrouped result and avoid the regrouping message
    .groups = "drop"
  )

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM