[英]Summarize multiple columns with strings of values in a table
我有一個像這樣的 dataframe ,其中大多數列都包含值字符串; A_aoi
、 B_aoi
和C_aoi
列中的值表示注視方向( A
、 B
和C
到揚聲器, *
無處/其他地方); A_aoi_dur
、 B_aoi_dur
和C_aoi_dur
列中的值表示這些注視的持續時間:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314
對於每個speaker
(可通過后綴A
、 B
和C
列中的speaker
識別),我想計算他們注視方向的總持續時間和比例。 我想獲得的表是這樣的:
預期結果:
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676
我的感覺是,最好將 dataframe 轉換為長格式。 因此,使用separate_rows
的行並為每個speaker
的注視方向和注視持續時間構建許多中間數據幀,我最終得到了這個復雜的代碼——它完成了它應該做的事情。 但我很確定有一種更經濟、更優雅的方式!
那會是什么? 非常感謝您的幫助!
library(dplyr)
library(tidyr)
### A:
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
cbind(A_stat, B_stat, C_stat)
可重現的數據:
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
一種方法(雖然避免重復的列名):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
tibble(
speaker = aoi,
duration = as.integer(dur)
)
}
df %>%
select(-1) %>% #This column seems irrelevant
mutate(
A_aoi = str_split(A_aoi, ''),
B_aoi = str_split(B_aoi, ''),
C_aoi = str_split(C_aoi, ''),
A_aoi_dur = str_split(A_aoi_dur, ','),
B_aoi_dur = str_split(B_aoi_dur, ','),
C_aoi_dur = str_split(C_aoi_dur, ','),
A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>%
select(1, 3, 5) %>%
gather() %>%
unnest(cols = value) %>%
group_by(key, speaker) %>%
summarise(
total = sum(duration)
) %>%
mutate(
prop = total/sum(total)*100
) %>%
ungroup() %>%
nest(data = -key) %>%
spread(key, data) %>%
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))
Output:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34
這是一個仍然需要在最后對列進行排序的鏡頭,但我認為它與您的代碼相比是一個整潔的版本,盡管 output 有點不同,因為它在一個列中包含所有aoi
而不是 3 列不同你的。
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
pivot_longer(cols = contains("aoi"), names_to = "aoi",
values_to = "aoi_values") %>%
mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
separate_rows(aoi_values, sep = "") %>%
filter(aoi_values != "") %>%
select(speaker, aoi, aoi_values),
split_df[[2]] %>%
separate_rows(aoi_values, sep = ",") %>%
mutate(aoi = gsub("_dur", "", aoi)) %>%
select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
group_by(aoi, aoi_values) %>%
summarize(total_duration = sum(as.numeric(duration)),
.groups = "drop") %>%
group_by(aoi) %>%
mutate(prop = total_duration / sum(total_duration) * 100) %>%
pivot_wider(id_cols = aoi_values, names_from = aoi,
names_glue = "{aoi}_{.value}",
values_fill = 0,
values_from = c(total_duration, prop)) %>%
select(aoi_values, sort(names(.)))
Output
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.