[英]Save unique values of variable for each combination of two variables in a dataset
[英]Sum a variable in a grouped dataframe only once for each unique combination of two other variables with dplyr
我有一個長格式的資料表,其中 area 和 cluster 的組合會重複出現。
# Sample data in long format: each (area, cluster) pair appears once per age,
# and norm.to.area has the same value on every row of a given pair.
counts <- tibble::tribble(
  ~age,    ~area,         ~cluster,    ~norm.to.area,
  "gw_25", "cingulate",   "cluster_1", 0.03,
  "gw_20", "cingulate",   "cluster_1", 0.03,
  "gw_18", "hippocampus", "cluster_1", 0.02,
  "gw_25", "insula",      "cluster_1", 0.01,
  "gw_20", "motor",       "cluster_1", 0.01,
  "gw_22", "motor",       "cluster_1", 0.01,
  "gw_25", "motor",       "cluster_1", 0.01,
  "gw_14", "motor",       "cluster_1", 0.01,
  "gw_18", "motor",       "cluster_1", 0.01,
  "gw_19", "motor",       "cluster_1", 0.01,
  "gw_17", "motor",       "cluster_1", 0.01,
  "gw_20", "occipital",   "cluster_1", 0.01,
  "gw_17", "occipital",   "cluster_1", 0.01,
  "gw_18", "occipital",   "cluster_1", 0.01,
  "gw_19", "occipital",   "cluster_1", 0.01,
  "gw_22", "occipital",   "cluster_1", 0.01,
  "gw_14", "occipital",   "cluster_1", 0.01,
  "gw_22", "parietal",    "cluster_1", 0,
  "gw_25", "parietal",    "cluster_1", 0,
  "gw_17", "parietal",    "cluster_1", 0,
  "gw_19", "parietal",    "cluster_1", 0,
  "gw_20", "parietal",    "cluster_1", 0,
  "gw_20", "PFC",         "cluster_1", 0.01,
  "gw_22", "PFC",         "cluster_1", 0.01,
  "gw_25", "PFC",         "cluster_1", 0.01
)
我想創建一個新變量 sum.norm.to.area,它是每個 cluster 內 norm.to.area 的總和,但每個 area / cluster(即 subcluster.merge)組合的 norm.to.area 值只計入一次。
我嘗試以 cluster 進行 group_by,但這樣同一個組合出現幾次,其值就會被重複相加幾次。
# Naive attempt: sums every row of the cluster, so the value of an
# area/cluster pair is added once per repeated row rather than once per pair.
counts %>%
  group_by(cluster) %>%
  mutate(sum.norm.to.area = sum(norm.to.area))
謝謝你的建議。
更新1:
嘗試使用匯總,如下所示,但是發生了相同的事情(當然,除了沒有添加為新列):
# Summarising per cluster/area still adds the repeated rows of each pair
# together, and it returns one row per group instead of adding a new column.
# The grouping column is `cluster`, the name used in the sample `counts`.
counts %>%
  group_by(cluster, area) %>%
  summarize(sum(norm.to.area))
# Output of the summarize() attempt. It contains areas and clusters that are
# not among the sample rows, so it was presumably produced from the full
# dataset rather than from the sample shown in the question.
tibble::tribble(
  ~cluster,        ~area,           ~sum.norm.to.area.,
  "cluster_1",     "PFC",           0.06,
  "cluster_1",     "somatosensory", 0.05,
  "cluster_1",     "motor",         0.07,
  "cluster_1",     "parietal",      0,
  "cluster_1",     "temporal",      0.03,
  "cluster_1",     "occipital",     0.06,
  "cluster_1",     "hippocampus",   0.02,
  "cluster_1",     "insula",        0.01,
  "cluster_1",     "cingulate",     0.06,
  "cluster_10-34", "PFC",           0.42,
  "cluster_10-34", "somatosensory", 0.35,
  "cluster_10-34", "motor",         0.48,
  "cluster_10-34", "parietal",      0.36,
  "cluster_10-34", "temporal",      0.28,
  "cluster_10-34", "occipital",     0.4,
  "cluster_10-34", "hippocampus",   0.12,
  "cluster_10-34", "insula",        0,
  "cluster_10-34", "cingulate",     0,
  "cluster_11",    "PFC",           0.18,
  "cluster_11",    "somatosensory", 0.15,
  "cluster_11",    "motor",         0.14,
  "cluster_11",    "parietal",      0.12,
  "cluster_11",    "temporal",      0.04,
  "cluster_11",    "occipital",     0.18,
  "cluster_11",    "hippocampus",   0.02
)
更新2
這是我想要的輸出,但是我到達它的方式太復雜了。 我想找到一種使用mutate而不需要使用join
的簡便方法。
# Collapse to one row per area/cluster pair, total norm.to.area per cluster,
# then keep only the join keys plus the new total so that the join does not
# duplicate age / norm.to.area as .x / .y columns.
tmp <- counts %>%
  distinct(area, cluster, .keep_all = TRUE) %>%
  add_count(cluster, wt = norm.to.area, name = "sum.norm.to.area") %>%
  select(cluster, area, sum.norm.to.area)

# Attach the per-cluster total back onto every original row.
counts %>%
  left_join(tmp, by = c("cluster", "area"))
所需的輸出:sum.norm.to.area 是把 area 和 cluster 的每個唯一組合的 norm.to.area 各計入一次(僅一次)後相加的結果:
# Desired result: every original row keeps its columns and gains
# sum.norm.to.area, the per-cluster total with each area counted once.
tibble::tribble(
  ~age,    ~area,           ~cluster,    ~norm.to.area, ~sum.norm.to.area,
  "gw_25", "cingulate",     "cluster_1", 0.03,          0.11,
  "gw_20", "cingulate",     "cluster_1", 0.03,          0.11,
  "gw_18", "hippocampus",   "cluster_1", 0.02,          0.11,
  "gw_25", "insula",        "cluster_1", 0.01,          0.11,
  "gw_20", "motor",         "cluster_1", 0.01,          0.11,
  "gw_22", "motor",         "cluster_1", 0.01,          0.11,
  "gw_25", "motor",         "cluster_1", 0.01,          0.11,
  "gw_14", "motor",         "cluster_1", 0.01,          0.11,
  "gw_18", "motor",         "cluster_1", 0.01,          0.11,
  "gw_19", "motor",         "cluster_1", 0.01,          0.11,
  "gw_17", "motor",         "cluster_1", 0.01,          0.11,
  "gw_20", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_17", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_18", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_19", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_22", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_14", "occipital",     "cluster_1", 0.01,          0.11,
  "gw_22", "parietal",      "cluster_1", 0,             0.11,
  "gw_25", "parietal",      "cluster_1", 0,             0.11,
  "gw_17", "parietal",      "cluster_1", 0,             0.11,
  "gw_19", "parietal",      "cluster_1", 0,             0.11,
  "gw_20", "parietal",      "cluster_1", 0,             0.11,
  "gw_20", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_22", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_25", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_18", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_19", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_17", "PFC",           "cluster_1", 0.01,          0.11,
  "gw_22", "somatosensory", "cluster_1", 0.01,          0.11,
  "gw_20", "somatosensory", "cluster_1", 0.01,          0.11,
  "gw_25", "somatosensory", "cluster_1", 0.01,          0.11,
  "gw_18", "somatosensory", "cluster_1", 0.01,          0.11,
  "gw_19", "somatosensory", "cluster_1", 0.01,          0.11,
  "gw_25", "temporal",      "cluster_1", 0.01,          0.11,
  "gw_19", "temporal",      "cluster_1", 0.01,          0.11,
  "gw_20", "temporal",      "cluster_1", 0.01,          0.11
)
使用 dplyr,我們可以先按 cluster 進行 group_by,再只取每個 area 第一次出現的值來求 sum。
library(dplyr)

# Within each cluster, keep only the first norm.to.area seen for every area
# (!duplicated(area)) and sum those, so each area/cluster pair counts once.
# This relies on norm.to.area being constant within an area/cluster pair.
# ungroup() returns a plain tibble so later verbs are not silently grouped.
counts %>%
  group_by(cluster) %>%
  mutate(sum.norm = sum(norm.to.area[!duplicated(area)])) %>%
  ungroup()
# age area cluster norm.to.area sum.norm
# <chr> <chr> <chr> <dbl> <dbl>
# 1 gw_25 cingulate cluster_1 0.03 0.09
# 2 gw_20 cingulate cluster_1 0.03 0.09
# 3 gw_18 hippocampus cluster_1 0.02 0.09
# 4 gw_25 insula cluster_1 0.01 0.09
# 5 gw_20 motor cluster_1 0.01 0.09
# 6 gw_22 motor cluster_1 0.01 0.09
# 7 gw_25 motor cluster_1 0.01 0.09
# 8 gw_14 motor cluster_1 0.01 0.09
# 9 gw_18 motor cluster_1 0.01 0.09
#10 gw_19 motor cluster_1 0.01 0.09
# … with 15 more rows
我認為您不是在尋找mutate()
# One row per cluster/area pair holding the total norm.to.area of its rows.
counts %>%
  group_by(cluster, area) %>%
  summarize(sum.norm.to.area = sum(norm.to.area))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.