簡體   English   中英

在 R 中,在展平列表的嵌套列表時生成正確的列名

[英]In R, generate proper column names when flattening nested list of lists

在 R 中將列表的列表展平為數據幀時丟失重復的列名- 這是我們正在構建的去年的相關帖子。 我們正在調用 API 並接收以下原始數據,我們需要將其轉換為“干凈的” dataframe:

API調用返回的List列表

dput(raw_list)

list(list(id = 248013L, title = "Knockout Stage  - Final", start = "2020-10-31T10:25:00Z", 
    end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL, 
    lifecycle = "over", tier = 1L, best_of = 5L, chain = list(), 
    streamed = TRUE, bracket_position = list(part = "UB", col = 0L, 
        offset = 1L), participants = list(list(seed = 1L, score = 1L, 
        forfeit = FALSE, roster = list(id = 75421L), winner = FALSE, 
        stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE, 
        roster = list(id = 67585L), winner = TRUE, stats = NULL)), 
    tournament = list(id = 5088L), substage = list(id = 26285L), 
    game = list(id = 2L), matches = list(list(id = 443031L), 
        list(id = 443032L), list(id = 443033L), list(id = 443034L), 
        list(id = 443035L)), casters = list(list(primary = TRUE, 
        caster = list(id = 47L)), list(primary = FALSE, caster = list(
        id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
        list(primary = FALSE, caster = list(id = 449L)), list(
            primary = FALSE, caster = list(id = 524L)), list(
            primary = FALSE, caster = list(id = 1009L)), list(
            primary = FALSE, caster = list(id = 1567L)), list(
            primary = FALSE, caster = list(id = 1589L)), list(
            primary = FALSE, caster = list(id = 1591L)), list(
            primary = FALSE, caster = list(id = 3589L)), list(
            primary = FALSE, caster = list(id = 3635L)), list(
            primary = FALSE, caster = list(id = 4238L)), list(
            primary = FALSE, caster = list(id = 5163L)), list(
            primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE), 
    list(id = 248014L, title = "Knockout Stage  - Semifinal", 
        start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 5L, chain = list(), streamed = TRUE, 
        bracket_position = list(part = "UB", col = 1L, offset = 2L), 
        participants = list(list(seed = 1L, score = 1L, forfeit = FALSE, 
            roster = list(id = 70687L), winner = FALSE, stats = NULL), 
            list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
                id = 75421L), winner = TRUE, stats = NULL)), 
        tournament = list(id = 5088L), substage = list(id = 26285L), 
        game = list(id = 2L), matches = list(list(id = 443021L), 
            list(id = 443022L), list(id = 443023L), list(id = 443024L), 
            list(id = 443025L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE), list(id = 251494L, title = "Group B", 
        start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 1L, chain = list(), streamed = TRUE, 
        bracket_position = NULL, participants = list(list(seed = 1L, 
            score = 1L, forfeit = FALSE, roster = list(id = 61033L), 
            winner = TRUE, stats = NULL), list(seed = 2L, score = 0L, 
            forfeit = FALSE, roster = list(id = 60051L), winner = FALSE, 
            stats = NULL)), tournament = list(id = 5088L), substage = list(
            id = 26282L), game = list(id = 2L), matches = list(
            list(id = 449511L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE))

這是我們目前擁有的。 我們這里的問題是從.name_repair = "unique"生成的名稱:

output_df <- zed %>%
      rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
      purrr::map(unlist) %>% 
      purrr::map(t) %>% 
      {suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
      dplyr::bind_rows() %>%
      {suppressMessages(readr::type_convert(.))} %>%
      as.data.frame()

> colnames(output_df)
 [1] "id"                          "title"                       "start"                       "end"                         "postponed_from"             
 [6] "deleted_at"                  "lifecycle"                   "tier"                        "best_of"                     "streamed"                   
[11] "bracket_position.part"       "bracket_position.col"        "bracket_position.offset"     "participants.seed...14"      "participants.score...15"    
[16] "participants.forfeit...16"   "participants.roster.id...17" "participants.winner...18"    "participants.stats...19"     "participants.seed...20"     
[21] "participants.score...21"     "participants.forfeit...22"   "participants.roster.id...23" "participants.winner...24"    "participants.stats...25"    
[26] "tournament.id"               "substage.id"                 "game.id"                     "matches.id...29"             "matches.id...30"            
[31] "matches.id...31"             "matches.id...32"             "matches.id...33"             "casters.primary...34"        "casters.caster.id...35"     
[36] "casters.primary...36"        "casters.caster.id...37"      "casters.primary...38"        "casters.caster.id...39"      "casters.primary...40"       
[41] "casters.caster.id...41"      "casters.primary...42"        "casters.caster.id...43"      "casters.primary...44"        "casters.caster.id...45"     
[46] "casters.primary...46"        "casters.caster.id...47"      "casters.primary...48"        "casters.caster.id...49"      "casters.primary...50"       
[51] "casters.caster.id...51"      "casters.primary...52"        "casters.caster.id...53"      "casters.primary...54"        "casters.caster.id...55"     
[56] "casters.primary...56"        "casters.caster.id...57"      "casters.primary...58"        "casters.caster.id...59"      "casters.primary...60"       
[61] "casters.caster.id...61"      "has_incident_report"         "bracket_position"            "participants.seed...12"      "participants.score...13"    
[66] "participants.forfeit...14"   "participants.roster.id...15" "participants.winner...16"    "participants.stats...17"     "participants.seed...18"     
[71] "participants.score...19"     "participants.forfeit...20"   "participants.roster.id...21" "participants.winner...22"    "participants.stats...23"    
[76] "matches.id"                  "casters.primary...28"        "casters.caster.id...29"      "casters.primary...30"        "casters.caster.id...31"     
[81] "casters.primary...32"        "casters.caster.id...33"     

.name_repair = "unique"非常有用,因為如果沒有它,我們將有許多列名稱V20V21V30V32等。但是,自動生成的名稱仍然存在問題。

> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id" 

如果將這些列命名為matches.id, matches.id.1, matches.id.2, ...會更可取。 我們可以嘗試在事后更改列名,但是我們正在尋找一種解決方案,可以在我們當前的 rrapply() %>% map() %>% map() %>% map( ) %>% 綁定行()。 這可能嗎?

編輯:到目前為止,我們正在做這樣的事情:

# fix messy / dupe columns with weird ...23 suffixes
    match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
    forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
    score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
    winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
    seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
    
    output_df <- output_df %>% 
      tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)  

它將這些重復的列折疊在一起,所以現在我們只需要將它們分開即可。 不理想,但似乎這會奏效。

我仍然不確定您要做什么,但您可以稍微清理一下 pipe:

zed %>%
  rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>% 
  purrr::map_df(
    ~ unlist(.x) %>% 
      t() %>% 
      as_tibble(.name_repair = "unique")
  ) %>%
  readr::type_convert() %>% 
  tidyr::unite('match_ids', starts_with('matches.id'), na.rm = TRUE) %>%
  tidyr::unite('is_forfeits', starts_with('participants.forfeit'), na.rm = TRUE) %>%
  tidyr::unite('scores', starts_with('participants.score'), na.rm = TRUE) %>%
  tidyr::unite('is_winners', starts_with('participants.winner'), na.rm = TRUE) %>%
  tidyr::unite('seeds', starts_with('participants.seed'), na.rm = TRUE) %>% 
  suppressMessages()

減去我拒絕使用的as.data.frame()部分。 這應該創建相同的 output 而不離開 pipe 並且不需要像match_cols這樣的輔助向量。

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM