簡體   English   中英

用表中的值字符串匯總多個列

[英]Summarize multiple columns with strings of values in a table

我有一個像這樣的 dataframe,其中大多數列都包含值字符串;A_aoi、B_aoi、C_aoi 列中的值表示注視方向(A、B、C 表示注視對應的說話者,* 表示無處/其他地方);A_aoi_dur、B_aoi_dur、C_aoi_dur 列中的值表示這些注視的持續時間:

df
# A tibble: 5 x 7
  speaker A_aoi     A_aoi_dur                            B_aoi B_aoi_dur    C_aoi C_aoi_dur           
  <chr>   <chr>     <chr>                                <chr> <chr>        <chr> <chr>               
1 ID01.B  B*B*B     494,251,416,217,35                   A*    153,1260     A     1413                
2 ID01.A  *B*C*C    445,412,116,533,600,153              A     2259         A*A*A 379,123,1300,144,313
3 ID01.A  B*B*B     1098,249,168,184,526                 A*A   1090,313,822 A*A   817,626,782         
4 ID01.C  C*C*B*    1794,1561,158,208,125,63             C*    2735,1174    *A    152,3757            
5 ID01.B  B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 *     5810         *B*B* 835,173,3827,661,314

對於每個說話者(可通過 speaker 列中的後綴 A、B、C 識別),我想計算他們各個注視方向的總持續時間和比例。我想獲得的表是這樣的:

預期結果

  A_aoi Total     Prop B_aoi Total     Prop C_aoi Total      Prop
1     *  5431 34.77843     *  8557 54.79636     *  6021 38.556609
2     B  5533 35.43161     A  4324 27.68955     A  8761 56.102715
3     C  4652 29.78996     C  2735 17.51409     B   834  5.340676

我的感覺是,最好將 dataframe 轉換為長格式。因此,我使用 separate_rows 拆分行,並為每個說話者的注視方向和注視持續時間構建了許多中間數據幀,最終得到了這段複雜的代碼——它確實完成了它應該做的事情。但我很確定有一種更經濟、更優雅的方式。

那會是什么? 非常感謝您的幫助!

library(dplyr)
library(tidyr)

### A:
# Gaze directions in A_aoi, one character per row; empty pieces that the
# split on "" may produce are discarded.
a_aoi <- df %>%
  select(A_aoi) %>%
  separate_rows(A_aoi, sep = "") %>%
  filter(A_aoi != "")
# Gaze durations in A_aoi_dur, one comma-separated value per row (still
# character at this point).
a_dur <- df %>%
  select(A_aoi_dur) %>%
  separate_rows(A_aoi_dur, sep = ",")
# Directions and durations are matched purely by row position.
A <- cbind(a_dur, a_aoi)

# Total duration per gaze direction and its share of the summed time (in %).
A_stat <- A %>%
  mutate(A_aoi_dur = as.numeric(A_aoi_dur)) %>%
  group_by(A_aoi) %>%
  summarise(Total = sum(A_aoi_dur)) %>%
  mutate(Prop = Total / sum(Total) * 100)

### B:
# Gaze directions in B_aoi, one character per row; empty pieces that the
# split on "" may produce are discarded.
b_aoi <- df %>%
  select(B_aoi) %>%
  separate_rows(B_aoi, sep = "") %>%
  filter(B_aoi != "")
# Gaze durations in B_aoi_dur, one comma-separated value per row (still
# character at this point).
b_dur <- df %>%
  select(B_aoi_dur) %>%
  separate_rows(B_aoi_dur, sep = ",")
# Directions and durations are matched purely by row position.
B <- cbind(b_dur, b_aoi)

# Total duration per gaze direction and its share of the summed time (in %).
B_stat <- B %>%
  mutate(B_aoi_dur = as.numeric(B_aoi_dur)) %>%
  group_by(B_aoi) %>%
  summarise(Total = sum(B_aoi_dur)) %>%
  mutate(Prop = Total / sum(Total) * 100)


### C:
# Gaze directions in C_aoi, one character per row; empty pieces that the
# split on "" may produce are discarded.
c_aoi <- df %>%
  select(C_aoi) %>%
  separate_rows(C_aoi, sep = "") %>%
  filter(C_aoi != "")
# Gaze durations in C_aoi_dur, one comma-separated value per row (still
# character at this point).
c_dur <- df %>%
  select(C_aoi_dur) %>%
  separate_rows(C_aoi_dur, sep = ",")
# Directions and durations are matched purely by row position.
C <- cbind(c_dur, c_aoi)

# Total duration per gaze direction and its share of the summed time (in %).
C_stat <- C %>%
  mutate(C_aoi_dur = as.numeric(C_aoi_dur)) %>%
  group_by(C_aoi) %>%
  summarise(Total = sum(C_aoi_dur)) %>%
  mutate(Prop = Total / sum(Total) * 100)

# Final table: place the three summaries side by side. Each summary
# contributes its own direction column plus a Total and a Prop column, so
# those two names appear three times in the result.
cbind(A_stat, B_stat, C_stat)

可重現的數據

# Reproducible example data: five rows, one `speaker` id plus, for each of
# A, B and C, a string of gaze-direction codes (`*_aoi`) and a
# comma-separated string of durations (`*_aoi_dur`).
df <- structure(
  list(
    speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C", "ID01.B"),
    A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"),
    A_aoi_dur = c(
      "494,251,416,217,35",
      "445,412,116,533,600,153",
      "1098,249,168,184,526",
      "1794,1561,158,208,125,63",
      "1585,1068,249,51,998,352,1016,66,425"
    ),
    B_aoi = c("A*", "A", "A*A", "C*", "*"),
    B_aoi_dur = c("153,1260", "2259", "1090,313,822", "2735,1174", "5810"),
    C_aoi = c("A", "A*A*A", "A*A", "*A", "*B*B*"),
    C_aoi_dur = c(
      "1413",
      "379,123,1300,144,313",
      "817,626,782",
      "152,3757",
      "835,173,3827,661,314"
    )
  ),
  row.names = c(NA, -5L),
  class = c("tbl_df", "tbl", "data.frame")
)

一種方法(不過避免了重複的列名):

library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Pair one cell's gaze-direction codes with the matching durations.
#
# aoi: vector of gaze-direction codes.
# dur: vector of durations of the same length; coerced with as.integer().
# Returns a tibble with the columns `speaker` (the gaze-direction codes, not
# the speaker id of the input row) and `duration`.
map_columns <- function(aoi, dur) {
  dur_int <- as.integer(dur)
  tibble(speaker = aoi, duration = dur_int)
}

# Total gaze duration and percentage share per gaze direction, computed
# separately for A, B and C and returned side by side in one wide tibble.
df %>%
  # The speaker id column is not used for these totals.
  select(-speaker) %>%
  mutate(
    # Turn each cell into a vector: one element per gaze direction / duration.
    A_aoi = str_split(A_aoi, ""),
    B_aoi = str_split(B_aoi, ""),
    C_aoi = str_split(C_aoi, ""),
    A_aoi_dur = str_split(A_aoi_dur, ","),
    B_aoi_dur = str_split(B_aoi_dur, ","),
    C_aoi_dur = str_split(C_aoi_dur, ","),
    # Replace each direction vector by a small tibble of (direction, duration).
    A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
    B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
    C_aoi = map2(C_aoi, C_aoi_dur, map_columns)
  ) %>%
  # Keep only the list-columns that now hold the paired tibbles.
  select(A_aoi, B_aoi, C_aoi) %>%
  # Long format: `key` is the source column, `value` the nested tibble.
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  unnest(cols = value) %>%
  group_by(key, speaker) %>%
  summarise(
    total = sum(duration)
  ) %>%
  # summarise() drops the last grouping level, so the data is still grouped
  # by `key` here and the proportions are computed within A, B and C.
  mutate(
    prop = total / sum(total) * 100
  ) %>%
  ungroup() %>%
  # One nested summary per key, spread into one list-column per key ...
  nest(data = -key) %>%
  pivot_wider(names_from = key, values_from = data) %>%
  # ... and unnested side by side. The three summaries share their column
  # names, so each name gets the suffix _A, _B or _C.
  unnest(
    cols = c(A_aoi, B_aoi, C_aoi),
    names_repair = ~ paste0(., "_", rep(LETTERS[1:3], each = 3))
  )

Output:

# A tibble: 3 x 9
  speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
  <chr>       <int>  <dbl> <chr>       <int>  <dbl> <chr>       <int>  <dbl>
1 *            5431   34.8 *            8557   54.8 *            6021  38.6 
2 B            5533   35.4 A            4324   27.7 A            8761  56.1 
3 C            4652   29.8 C            2735   17.5 B             834   5.34

這是一個嘗試,最後仍然需要對列進行排序,但我認為與您的代碼相比,它是一個更整潔的版本;不過 output 有點不同,因為它把所有 aoi 放在同一列中,而不是像您那樣分成 3 個不同的列。

library(dplyr)
library(tidyr)
library(purrr)
# Reshape all aoi / aoi_dur columns to long format, label each row as a
# gaze-direction ("aoi") or a duration row, and split on that label.
split_df <- df %>%
  pivot_longer(
    cols = contains("aoi"),
    names_to = "aoi",
    values_to = "aoi_values"
  ) %>%
  mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
  group_split(aoi_names)
# Expand both pieces to one row per gaze and bind them column-wise. This
# relies on split_df[[1]] being the "aoi" piece and split_df[[2]] the
# "duration" piece, and on both expanding to the same number of rows in the
# same order, because bind_cols() matches rows by position only.
tidy_df <- bind_cols(
  split_df[[1]] %>%
    separate_rows(aoi_values, sep = "") %>%
    # Discard empty pieces that the split on "" may produce.
    filter(aoi_values != "") %>%
    select(speaker, aoi, aoi_values),
  split_df[[2]] %>%
    separate_rows(aoi_values, sep = ",") %>%
    select(duration = aoi_values)
)
# Total duration and percentage share per gaze direction within each aoi
# column, then one pair of output columns per aoi column.
tidy_df %>%
  group_by(aoi, aoi_values) %>%
  summarize(
    total_duration = sum(as.numeric(duration)),
    .groups = "drop"
  ) %>%
  group_by(aoi) %>%
  mutate(prop = total_duration / sum(total_duration) * 100) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = aoi_values,
    names_from = aoi,
    names_glue = "{aoi}_{.value}",
    # Directions that never occur for a given aoi column are reported as 0.
    values_fill = 0,
    values_from = c(total_duration, prop)
  ) %>%
  # Keep aoi_values first and order the remaining columns alphabetically.
  select(aoi_values, sort(names(.)))

Output

# A tibble: 4 x 7
  aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
  <chr>           <dbl>                <dbl>      <dbl>                <dbl>      <dbl>                <dbl>
1 *                34.8                 5431       54.8                 8557      38.6                  6021
2 B                35.4                 5533        0                      0       5.34                  834
3 C                29.8                 4652       17.5                 2735       0                       0
4 A                 0                      0       27.7                 4324      56.1                  8761

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM