[英]Save unique values of variable for each combination of two variables in a dataset
[英]Sum a variable in a grouped dataframe only once for each unique combination of two other variables with dplyr
我有一个长格式的数据表,其中 area 和 cluster 的组合重复出现。
# Example input in long format: each (area, cluster) pair repeats across
# several `age` values, and `norm.to.area` is the same on every row of a pair.
counts <- tibble::tribble(
~age, ~area, ~cluster, ~norm.to.area,
"gw_25", "cingulate", "cluster_1", 0.03,
"gw_20", "cingulate", "cluster_1", 0.03,
"gw_18", "hippocampus", "cluster_1", 0.02,
"gw_25", "insula", "cluster_1", 0.01,
"gw_20", "motor", "cluster_1", 0.01,
"gw_22", "motor", "cluster_1", 0.01,
"gw_25", "motor", "cluster_1", 0.01,
"gw_14", "motor", "cluster_1", 0.01,
"gw_18", "motor", "cluster_1", 0.01,
"gw_19", "motor", "cluster_1", 0.01,
"gw_17", "motor", "cluster_1", 0.01,
"gw_20", "occipital", "cluster_1", 0.01,
"gw_17", "occipital", "cluster_1", 0.01,
"gw_18", "occipital", "cluster_1", 0.01,
"gw_19", "occipital", "cluster_1", 0.01,
"gw_22", "occipital", "cluster_1", 0.01,
"gw_14", "occipital", "cluster_1", 0.01,
"gw_22", "parietal", "cluster_1", 0,
"gw_25", "parietal", "cluster_1", 0,
"gw_17", "parietal", "cluster_1", 0,
"gw_19", "parietal", "cluster_1", 0,
"gw_20", "parietal", "cluster_1", 0,
"gw_20", "PFC", "cluster_1", 0.01,
"gw_22", "PFC", "cluster_1", 0.01,
"gw_25", "PFC", "cluster_1", 0.01
)
我想创建一个新变量 sum.norm.to.area,它是每个 cluster 内 norm.to.area 的总和,但每个 area / subcluster.merge(即示例数据中的 cluster)组合的 norm.to.area 值只计入一次。
我尝试按 cluster 进行 group_by,但这样会按每个组合出现的次数把这些值重复相加。
# Naive attempt: sums every row within each cluster, so an (area, cluster)
# combination that appears k times contributes its norm.to.area k times.
counts %>%
  group_by(cluster) %>%
  mutate(sum.norm.to.area = sum(norm.to.area))
谢谢你的建议。
更新1:
尝试使用 summarize,如下所示,但结果仍然一样(当然,区别只是结果没有作为新列添加):
# Group by the columns that exist in the example data (`cluster`, not
# `subcluster.merge`). This still adds up every repeated row of a combination.
counts %>%
  group_by(cluster, area) %>%
  summarize(sum(norm.to.area))
# Result of the summarize() attempt, pasted as a tribble: one row per
# (cluster, area) pair holding the sum over all of that pair's rows.
tibble::tribble(
~cluster, ~area, ~sum.norm.to.area.,
"cluster_1", "PFC", 0.06,
"cluster_1", "somatosensory", 0.05,
"cluster_1", "motor", 0.07,
"cluster_1", "parietal", 0,
"cluster_1", "temporal", 0.03,
"cluster_1", "occipital", 0.06,
"cluster_1", "hippocampus", 0.02,
"cluster_1", "insula", 0.01,
"cluster_1", "cingulate", 0.06,
"cluster_10-34", "PFC", 0.42,
"cluster_10-34", "somatosensory", 0.35,
"cluster_10-34", "motor", 0.48,
"cluster_10-34", "parietal", 0.36,
"cluster_10-34", "temporal", 0.28,
"cluster_10-34", "occipital", 0.4,
"cluster_10-34", "hippocampus", 0.12,
"cluster_10-34", "insula", 0,
"cluster_10-34", "cingulate", 0,
"cluster_11", "PFC", 0.18,
"cluster_11", "somatosensory", 0.15,
"cluster_11", "motor", 0.14,
"cluster_11", "parietal", 0.12,
"cluster_11", "temporal", 0.04,
"cluster_11", "occipital", 0.18,
"cluster_11", "hippocampus", 0.02
)
更新2
下面的代码可以得到我想要的输出,但我的实现方式太复杂了。我想找到一种只使用 mutate、不需要 join 的更简便的方法。
# One row per (area, cluster) pair, then the per-cluster total of norm.to.area.
# Keep only the join keys and the new total so the join below does not
# duplicate `age` / `norm.to.area` as `.x` / `.y` columns.
tmp <- counts %>%
  distinct(area, cluster, .keep_all = TRUE) %>%
  add_count(cluster, wt = norm.to.area, name = "sum.norm.to.area") %>%
  select(area, cluster, sum.norm.to.area)

# Attach the per-cluster total back onto every original row.
counts %>%
  left_join(tmp, by = c("cluster", "area"))
期望的输出:sum.norm.to.area 是把 area 和 cluster 的所有唯一组合对应的 norm.to.area 相加(每个组合只计一次)得到的结果:
# Desired result. It contains extra PFC, somatosensory and temporal rows that
# are not in the sample `counts`, so the per-cluster total here is
# 0.11 (0.03 + 0.02 + 6 * 0.01 + 0), whereas the sample rows alone give 0.09.
tibble::tribble(
~age, ~area, ~cluster, ~norm.to.area, ~sum.norm.to.area,
"gw_25", "cingulate", "cluster_1", 0.03, 0.11,
"gw_20", "cingulate", "cluster_1", 0.03, 0.11,
"gw_18", "hippocampus", "cluster_1", 0.02, 0.11,
"gw_25", "insula", "cluster_1", 0.01, 0.11,
"gw_20", "motor", "cluster_1", 0.01, 0.11,
"gw_22", "motor", "cluster_1", 0.01, 0.11,
"gw_25", "motor", "cluster_1", 0.01, 0.11,
"gw_14", "motor", "cluster_1", 0.01, 0.11,
"gw_18", "motor", "cluster_1", 0.01, 0.11,
"gw_19", "motor", "cluster_1", 0.01, 0.11,
"gw_17", "motor", "cluster_1", 0.01, 0.11,
"gw_20", "occipital", "cluster_1", 0.01, 0.11,
"gw_17", "occipital", "cluster_1", 0.01, 0.11,
"gw_18", "occipital", "cluster_1", 0.01, 0.11,
"gw_19", "occipital", "cluster_1", 0.01, 0.11,
"gw_22", "occipital", "cluster_1", 0.01, 0.11,
"gw_14", "occipital", "cluster_1", 0.01, 0.11,
"gw_22", "parietal", "cluster_1", 0, 0.11,
"gw_25", "parietal", "cluster_1", 0, 0.11,
"gw_17", "parietal", "cluster_1", 0, 0.11,
"gw_19", "parietal", "cluster_1", 0, 0.11,
"gw_20", "parietal", "cluster_1", 0, 0.11,
"gw_20", "PFC", "cluster_1", 0.01, 0.11,
"gw_22", "PFC", "cluster_1", 0.01, 0.11,
"gw_25", "PFC", "cluster_1", 0.01, 0.11,
"gw_18", "PFC", "cluster_1", 0.01, 0.11,
"gw_19", "PFC", "cluster_1", 0.01, 0.11,
"gw_17", "PFC", "cluster_1", 0.01, 0.11,
"gw_22", "somatosensory", "cluster_1", 0.01, 0.11,
"gw_20", "somatosensory", "cluster_1", 0.01, 0.11,
"gw_25", "somatosensory", "cluster_1", 0.01, 0.11,
"gw_18", "somatosensory", "cluster_1", 0.01, 0.11,
"gw_19", "somatosensory", "cluster_1", 0.01, 0.11,
"gw_25", "temporal", "cluster_1", 0.01, 0.11,
"gw_19", "temporal", "cluster_1", 0.01, 0.11,
"gw_20", "temporal", "cluster_1", 0.01, 0.11
)
使用 dplyr,我们可以按 cluster 进行 group_by,然后只对每个 area 的唯一值求和(sum)。
library(dplyr)

counts %>%
  group_by(cluster) %>%
  # `!duplicated(area)` keeps the first row of each area within the cluster,
  # so every (area, cluster) combination contributes its value exactly once.
  mutate(sum.norm = sum(norm.to.area[!duplicated(area)])) %>%
  # Drop the grouping so later steps do not silently operate per cluster.
  ungroup()
# age area cluster norm.to.area sum.norm
# <chr> <chr> <chr> <dbl> <dbl>
# 1 gw_25 cingulate cluster_1 0.03 0.09
# 2 gw_20 cingulate cluster_1 0.03 0.09
# 3 gw_18 hippocampus cluster_1 0.02 0.09
# 4 gw_25 insula cluster_1 0.01 0.09
# 5 gw_20 motor cluster_1 0.01 0.09
# 6 gw_22 motor cluster_1 0.01 0.09
# 7 gw_25 motor cluster_1 0.01 0.09
# 8 gw_14 motor cluster_1 0.01 0.09
# 9 gw_18 motor cluster_1 0.01 0.09
#10 gw_19 motor cluster_1 0.01 0.09
# … with 15 more rows
我认为您需要的不是 mutate(),而是 summarize():
# Collapse to one row per (cluster, area) pair instead of adding a column.
summarize(
  group_by(counts, cluster, area),
  sum.norm.to.area = sum(norm.to.area)
)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.