在 R 中根据数据框（dataframe）列值的平均值和标准差创建汇总表

Question

我是 R 的新手，所以不知道代码。 我有两个数据框。 一个 dataframe 看起来像这样。

df

ID	Disease
GSM239170	Control
GSM239323	Control
GSM239324	Control
GSM239326	Control
GSM239328	AML
GSM239329	AML
GSM239331	AML
GSM239332	Control
GSM239333	Control

而另一个 dataframe 看起来像这样：

df1

GSM239170	GSM239323	GSM239324	GSM239326	GSM239328	GSM239329	GSM239331	GSM239332	GSM239333
3.016704177	3.285669072	2.929482692	2.922820483	3.15950317	3.163327169	2.985901308	3.122708843	3.070948463
7.977735461	6.532514237	6.388007183	6.466679556	6.432795021	6.407321524	6.426470803	6.376394357	6.469070308
4.207280707	4.994965767	4.40159671	4.747114589	4.830045513	4.213762092	4.884418365	4.4318876	4.849665444
7.25609471	7.420807337	6.999340125	7.094488581	7.024332721	7.17928981	7.159898654	7.009977785	6.830979234
2.204955099	2.331625217	2.133305231	2.18332885	2.12778313	2.269697813	2.264705552	2.253940441	2.287924323
7.28437278	6.983593721	6.86337111	6.865970678	7.219840938	7.181113053	7.392230178	7.484052914	7.52498281
4.265792764	4.970684112	4.595545125	4.575545289	4.547957809	4.68215122	4.674495889	4.675841709	4.643311767
2.6943516	2.916324936	2.578130269	2.659717988	2.567436676	2.8095128	2.790110381	2.795882913	2.884588792
3.646303109	8.817891552	11.4248793	10.74738082	9.296043108	9.53150669	8.285160496	9.769919327	9.774610531
3.040292001	3.38486713	2.958851115	3.047880699	2.878562717	3.209319974	3.20260379	3.195993624	3.3004227
2.357625231	2.444753172	2.340767158	2.32143889	2.282608342	2.401218719	2.385568421	2.375334953	2.432634747
5.378494673	6.065038394	5.134842087	5.367342376	5.682051149	5.712072512	5.57179966	5.72082395	5.656674512
2.833814735	3.038434511	2.837711812	2.859800224	2.866040813	2.969167906	2.929449968	2.963530689	2.931065261
6.192932281	6.478439634	6.180169144	6.151689376	6.238949956	6.708196123	6.441437631	6.448280595	6.413562269
4.543042482	4.786227217	4.445131477	4.51471011	4.491645167	4.460114204	4.602482637	4.587221948	4.623125028
6.069437462	6.232738284	6.74644117	7.04995802	6.938928532	6.348253102	6.080950712	6.324619355	6.472893789

我想制作一个表格，其中每个基因（即 df1 的每一行）包含 mean_AML、sd_AML（标准差）、min_AML、max_AML、mean_Control、sd_Control、min_Control、max_Control 以及 Fold_change（即 mean_AML – mean_Control）。 可以使用内置函数。

无法弄清楚我该怎么做。 请帮忙。

谢谢。

Answer 1

我们可以将pivot_longer与right_join结合起来，然后在组上使用summarise ：

library(dplyr)
library(tidyr)
# Stack all sample columns into (ID, value) pairs, attach each sample's
# disease label, and summarise the pooled values once per disease group.
df1 %>%
  pivot_longer(cols = everything(), names_to = "ID", values_to = "value") %>%
  right_join(df, by = "ID") %>%
  group_by(Disease) %>%
  summarise(
    Min = min(value),
    Mean = mean(value),
    Max = max(value),
    Sd = sd(value)
  ) %>%
  ungroup()

  Disease   Min  Mean   Max    Sd
  <chr>   <dbl> <dbl> <dbl> <dbl>
1 AML      2.13  4.91  9.53  2.04
2 Control  2.13  4.92 11.4   2.12

Answer 2

使用pivot_longer()将第二个 dataframe 放在长格式中会有所帮助：

library(tidyr)
# Reshape df1 to long format: one row per (gene, sample) measurement.
# pivot_longer() on its own discards which row each value came from, so the
# row index is kept as an explicit `gene` column. The later group_by(gene)
# step needs this column to exist.
newdf <- pivot_longer(
  cbind(gene = seq_len(nrow(df1)), df1),
  cols = -gene,
  names_to = "ID",
  values_to = "value"
)

然后将两个数据框merge()为一个：

# Full outer join on sample ID; TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
df.all <- merge(df, newdf, by = "ID", all = TRUE)

然后你可以group_by(gene)和summarise() ，添加你需要的任何值：

library(dplyr)
# One summary row per value of the `gene` column in df.all.
df.all %>%
  group_by(gene) %>%
  summarise(
    sd = sd(value),
    min = min(value),
    max = max(value),
    mean = mean(value)
  )

Answer 3

另一种做法是使用较旧的函数 tidyr::gather，先把 df1 整理成长格式的数据框：

library(tidyverse)

# Stack every sample column of df1 into (ID, val) pairs, then attach each
# sample's disease label from df.
# NOTE: tidyr::gather() is superseded; pivot_longer() is the current API.
df2_spread <- left_join(
  tidyr::gather(df1, key = "ID", value = "val"),
  df,
  by = "ID"
)

# One summary row per (Disease, sample). The column named `gene` actually
# holds the sample ID, since df1 carries no gene identifier.
summarise(
  group_by(df2_spread, Disease, gene = ID),
  n = n(),
  mean = mean(val),
  sd = sd(val),
  min = min(val),
  max = max(val),
  .groups = "drop"
)

# A tibble: 9 × 7
  Disease gene          n  mean    sd   min   max
  <chr>   <chr>     <int> <dbl> <dbl> <dbl> <dbl>
1 AML     GSM239328    16  4.91  2.15  2.13  9.30
2 AML     GSM239329    16  4.95  2.13  2.27  9.53
3 AML     GSM239331    16  4.88  1.96  2.26  8.29
4 Control GSM239170    16  4.56  1.91  2.20  7.98
5 Control GSM239323    16  5.04  1.98  2.33  8.82
6 Control GSM239324    16  4.93  2.45  2.13 11.4 
7 Control GSM239326    16  4.97  2.34  2.18 10.7 
8 Control GSM239332    16  4.97  2.16  2.25  9.77
9 Control GSM239333    16  5.01  2.14  2.29  9.77

不过，我找不到办法按基因计算 Fold_change，因为这里每个“基因”（实际上是样本 ID）似乎只对应一种疾病。

这是数据


# Sample annotation: one row per sample ID with its disease group.
# Samples 1-4 and 8-9 are Control; samples 5-7 are AML.
df <- tibble::tibble(
  ID = c(
    "GSM239170", "GSM239323", "GSM239324", "GSM239326",
    "GSM239328", "GSM239329", "GSM239331",
    "GSM239332", "GSM239333"
  ),
  Disease = rep(c("Control", "AML", "Control"), times = c(4L, 3L, 2L))
)



# Expression values in wide form: 9 columns, one per sample ID (the same IDs
# listed in df$ID), and 16 rows. Per the question, each row is presumably one
# gene; the table itself carries no gene identifier.
df1 <- tibble::tribble(
   ~GSM239170,  ~GSM239323,  ~GSM239324,  ~GSM239326,  ~GSM239328,  ~GSM239329,  ~GSM239331,  ~GSM239332,  ~GSM239333,
  3.016704177, 3.285669072, 2.929482692, 2.922820483,  3.15950317, 3.163327169, 2.985901308, 3.122708843, 3.070948463,
  7.977735461, 6.532514237, 6.388007183, 6.466679556, 6.432795021, 6.407321524, 6.426470803, 6.376394357, 6.469070308,
  4.207280707, 4.994965767,  4.40159671, 4.747114589, 4.830045513, 4.213762092, 4.884418365,   4.4318876, 4.849665444,
   7.25609471, 7.420807337, 6.999340125, 7.094488581, 7.024332721,  7.17928981, 7.159898654, 7.009977785, 6.830979234,
  2.204955099, 2.331625217, 2.133305231,  2.18332885,  2.12778313, 2.269697813, 2.264705552, 2.253940441, 2.287924323,
   7.28437278, 6.983593721,  6.86337111, 6.865970678, 7.219840938, 7.181113053, 7.392230178, 7.484052914,  7.52498281,
  4.265792764, 4.970684112, 4.595545125, 4.575545289, 4.547957809,  4.68215122, 4.674495889, 4.675841709, 4.643311767,
    2.6943516, 2.916324936, 2.578130269, 2.659717988, 2.567436676,   2.8095128, 2.790110381, 2.795882913, 2.884588792,
  3.646303109, 8.817891552,  11.4248793, 10.74738082, 9.296043108,  9.53150669, 8.285160496, 9.769919327, 9.774610531,
  3.040292001,  3.38486713, 2.958851115, 3.047880699, 2.878562717, 3.209319974,  3.20260379, 3.195993624,   3.3004227,
  2.357625231, 2.444753172, 2.340767158,  2.32143889, 2.282608342, 2.401218719, 2.385568421, 2.375334953, 2.432634747,
  5.378494673, 6.065038394, 5.134842087, 5.367342376, 5.682051149, 5.712072512,  5.57179966,  5.72082395, 5.656674512,
  2.833814735, 3.038434511, 2.837711812, 2.859800224, 2.866040813, 2.969167906, 2.929449968, 2.963530689, 2.931065261,
  6.192932281, 6.478439634, 6.180169144, 6.151689376, 6.238949956, 6.708196123, 6.441437631, 6.448280595, 6.413562269,
  4.543042482, 4.786227217, 4.445131477,  4.51471011, 4.491645167, 4.460114204, 4.602482637, 4.587221948, 4.623125028,
  6.069437462, 6.232738284,  6.74644117,  7.04995802, 6.938928532, 6.348253102, 6.080950712, 6.324619355, 6.472893789
  )

Answer 4

一种只使用基础 R（base R）的方法。

# Transpose df1 so that each sample becomes a row (the value columns get the
# automatic names X1, X2, ...), then attach the disease label by sample ID.
df_new <- merge(df, data.frame(t(df1), ID = colnames(df1)), by = "ID")

# Per-sample descriptive statistics over the X* value columns, one column per
# statistic, rounded to two decimals and labelled by sample ID.
out <- local({
  values <- as.matrix(df_new[, grep("^X", names(df_new))])
  per_sample <- function(f, ...) apply(values, 1, f, ...)
  data.frame(
    min = per_sample(min),
    IQR_low = per_sample(quantile, probs = .25),
    mean = per_sample(mean),
    median = per_sample(median),
    IQR_high = per_sample(quantile, probs = .75),
    max = per_sample(max),
    sd = per_sample(sd)
  )
})
out <- round(out, 2L)
rownames(out) <- df_new$ID

Output

> out
           min IQR_low mean median IQR_high   max   sd
GSM239170 2.20    2.97 4.56   4.24     6.10  7.98 1.91
GSM239323 2.33    3.22 5.04   4.98     6.49  8.82 1.98
GSM239324 2.13    2.91 4.93   4.52     6.48 11.42 2.45
GSM239326 2.18    2.91 4.97   4.66     6.57 10.75 2.34
GSM239328 2.13    2.88 4.91   4.69     6.56  9.30 2.15
GSM239329 2.27    3.11 4.95   4.57     6.48  9.53 2.13
GSM239331 2.26    2.97 4.88   4.78     6.43  8.29 1.96
GSM239332 2.25    3.08 4.97   4.63     6.39  9.77 2.16
GSM239333 2.29    3.04 5.01   4.75     6.47  9.77 2.14

数据

# Sample annotation as a plain data.frame: one row per sample ID with its
# disease group (samples 1-4 and 8-9 Control, samples 5-7 AML).
df <- data.frame(
  ID = c(
    "GSM239170", "GSM239323", "GSM239324", "GSM239326",
    "GSM239328", "GSM239329", "GSM239331",
    "GSM239332", "GSM239333"
  ),
  Disease = rep(c("Control", "AML", "Control"), times = c(4L, 3L, 2L))
)

# Expression values as a plain data.frame: 9 numeric columns, one per sample
# ID (the same IDs listed in df$ID), with 16 rows each. Per the question, each
# row is presumably one gene; no gene identifier is stored.
df1 <- structure(list(GSM239170 = c(3.016704177, 7.977735461, 4.207280707, 
7.25609471, 2.204955099, 7.28437278, 4.265792764, 2.6943516, 
3.646303109, 3.040292001, 2.357625231, 5.378494673, 2.833814735, 
6.192932281, 4.543042482, 6.069437462), GSM239323 = c(3.285669072, 
6.532514237, 4.994965767, 7.420807337, 2.331625217, 6.983593721, 
4.970684112, 2.916324936, 8.817891552, 3.38486713, 2.444753172, 
6.065038394, 3.038434511, 6.478439634, 4.786227217, 6.232738284
), GSM239324 = c(2.929482692, 6.388007183, 4.40159671, 6.999340125, 
2.133305231, 6.86337111, 4.595545125, 2.578130269, 11.4248793, 
2.958851115, 2.340767158, 5.134842087, 2.837711812, 6.180169144, 
4.445131477, 6.74644117), GSM239326 = c(2.922820483, 6.466679556, 
4.747114589, 7.094488581, 2.18332885, 6.865970678, 4.575545289, 
2.659717988, 10.74738082, 3.047880699, 2.32143889, 5.367342376, 
2.859800224, 6.151689376, 4.51471011, 7.04995802), GSM239328 = c(3.15950317, 
6.432795021, 4.830045513, 7.024332721, 2.12778313, 7.219840938, 
4.547957809, 2.567436676, 9.296043108, 2.878562717, 2.282608342, 
5.682051149, 2.866040813, 6.238949956, 4.491645167, 6.938928532
), GSM239329 = c(3.163327169, 6.407321524, 4.213762092, 7.17928981, 
2.269697813, 7.181113053, 4.68215122, 2.8095128, 9.53150669, 
3.209319974, 2.401218719, 5.712072512, 2.969167906, 6.708196123, 
4.460114204, 6.348253102), GSM239331 = c(2.985901308, 6.426470803, 
4.884418365, 7.159898654, 2.264705552, 7.392230178, 4.674495889, 
2.790110381, 8.285160496, 3.20260379, 2.385568421, 5.57179966, 
2.929449968, 6.441437631, 4.602482637, 6.080950712), GSM239332 = c(3.122708843, 
6.376394357, 4.4318876, 7.009977785, 2.253940441, 7.484052914, 
4.675841709, 2.795882913, 9.769919327, 3.195993624, 2.375334953, 
5.72082395, 2.963530689, 6.448280595, 4.587221948, 6.324619355
), GSM239333 = c(3.070948463, 6.469070308, 4.849665444, 6.830979234, 
2.287924323, 7.52498281, 4.643311767, 2.884588792, 9.774610531, 
3.3004227, 2.432634747, 5.656674512, 2.931065261, 6.413562269, 
4.623125028, 6.472893789)), row.names = c(NA, -16L), class = "data.frame")

从 dataframe 列值的平均值和标准偏差创建表 R

问题描述

df

df1

4 个解决方案

解决方案1
0 2022-01-20 20:24:50

解决方案2
0 2022-01-20 20:29:55

解决方案3
0 2022-01-20 20:38:21

解决方案4
0 2022-01-20 21:09:23

从 dataframe 列值的平均值和标准偏差创建表 R

问题描述

df

df1

4 个解决方案

解决方案1 0 2022-01-20 20:24:50

解决方案2 0 2022-01-20 20:29:55

解决方案3 0 2022-01-20 20:38:21

解决方案4 0 2022-01-20 21:09:23

解决方案1
0 2022-01-20 20:24:50

解决方案2
0 2022-01-20 20:29:55

解决方案3
0 2022-01-20 20:38:21

解决方案4
0 2022-01-20 21:09:23