繁体   English   中英

在 R 中,在展平列表的嵌套列表时生成正确的列名

[英]In R, generate proper column names when flattening nested list of lists

《在 R 中将列表的列表展平为数据帧时丢失重复的列名》——这是去年的一篇相关帖子,本问题是在它的基础上提出的。 我们正在调用一个 API 并收到如下原始数据,需要将其转换为“干净的”数据帧(data frame):

API调用返回的List列表

dput(raw_list)

list(list(id = 248013L, title = "Knockout Stage  - Final", start = "2020-10-31T10:25:00Z", 
    end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL, 
    lifecycle = "over", tier = 1L, best_of = 5L, chain = list(), 
    streamed = TRUE, bracket_position = list(part = "UB", col = 0L, 
        offset = 1L), participants = list(list(seed = 1L, score = 1L, 
        forfeit = FALSE, roster = list(id = 75421L), winner = FALSE, 
        stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE, 
        roster = list(id = 67585L), winner = TRUE, stats = NULL)), 
    tournament = list(id = 5088L), substage = list(id = 26285L), 
    game = list(id = 2L), matches = list(list(id = 443031L), 
        list(id = 443032L), list(id = 443033L), list(id = 443034L), 
        list(id = 443035L)), casters = list(list(primary = TRUE, 
        caster = list(id = 47L)), list(primary = FALSE, caster = list(
        id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
        list(primary = FALSE, caster = list(id = 449L)), list(
            primary = FALSE, caster = list(id = 524L)), list(
            primary = FALSE, caster = list(id = 1009L)), list(
            primary = FALSE, caster = list(id = 1567L)), list(
            primary = FALSE, caster = list(id = 1589L)), list(
            primary = FALSE, caster = list(id = 1591L)), list(
            primary = FALSE, caster = list(id = 3589L)), list(
            primary = FALSE, caster = list(id = 3635L)), list(
            primary = FALSE, caster = list(id = 4238L)), list(
            primary = FALSE, caster = list(id = 5163L)), list(
            primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE), 
    list(id = 248014L, title = "Knockout Stage  - Semifinal", 
        start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 5L, chain = list(), streamed = TRUE, 
        bracket_position = list(part = "UB", col = 1L, offset = 2L), 
        participants = list(list(seed = 1L, score = 1L, forfeit = FALSE, 
            roster = list(id = 70687L), winner = FALSE, stats = NULL), 
            list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
                id = 75421L), winner = TRUE, stats = NULL)), 
        tournament = list(id = 5088L), substage = list(id = 26285L), 
        game = list(id = 2L), matches = list(list(id = 443021L), 
            list(id = 443022L), list(id = 443023L), list(id = 443024L), 
            list(id = 443025L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE), list(id = 251494L, title = "Group B", 
        start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 1L, chain = list(), streamed = TRUE, 
        bracket_position = NULL, participants = list(list(seed = 1L, 
            score = 1L, forfeit = FALSE, roster = list(id = 61033L), 
            winner = TRUE, stats = NULL), list(seed = 2L, score = 0L, 
            forfeit = FALSE, roster = list(id = 60051L), winner = FALSE, 
            stats = NULL)), tournament = list(id = 5088L), substage = list(
            id = 26282L), game = list(id = 2L), matches = list(
            list(id = 449511L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE))

这是我们目前的代码。 问题出在由 .name_repair = "unique" 生成的列名上:

# Flatten each record of the nested list `zed` into one row and stack the rows
# into a single data frame (`output_df`).
output_df <- zed %>%
      # f returns NA for a NULL element and any other element unchanged;
      # presumably this keeps NULL fields (e.g. postponed_from) from being
      # dropped by the unlist() step below.
      rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
      # Collapse every record into one flat named vector; nested names are
      # joined with "." (e.g. "participants.seed").
      purrr::map(unlist) %>% 
      # Transpose each vector into a 1-row matrix so it converts to a 1-row tibble.
      purrr::map(t) %>% 
      # "unique" name repair disambiguates repeated names by appending
      # "...<column position>"; suppressMessages() hides the messages emitted
      # while doing so.
      {suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
      # Stack the 1-row tibbles; columns are matched by name, so records whose
      # repaired names differ contribute separate columns.
      dplyr::bind_rows() %>%
      # Re-guess column types from the flattened values, silencing the messages
      # readr prints while doing so.
      {suppressMessages(readr::type_convert(.))} %>%
      as.data.frame()

> colnames(output_df)
 [1] "id"                          "title"                       "start"                       "end"                         "postponed_from"             
 [6] "deleted_at"                  "lifecycle"                   "tier"                        "best_of"                     "streamed"                   
[11] "bracket_position.part"       "bracket_position.col"        "bracket_position.offset"     "participants.seed...14"      "participants.score...15"    
[16] "participants.forfeit...16"   "participants.roster.id...17" "participants.winner...18"    "participants.stats...19"     "participants.seed...20"     
[21] "participants.score...21"     "participants.forfeit...22"   "participants.roster.id...23" "participants.winner...24"    "participants.stats...25"    
[26] "tournament.id"               "substage.id"                 "game.id"                     "matches.id...29"             "matches.id...30"            
[31] "matches.id...31"             "matches.id...32"             "matches.id...33"             "casters.primary...34"        "casters.caster.id...35"     
[36] "casters.primary...36"        "casters.caster.id...37"      "casters.primary...38"        "casters.caster.id...39"      "casters.primary...40"       
[41] "casters.caster.id...41"      "casters.primary...42"        "casters.caster.id...43"      "casters.primary...44"        "casters.caster.id...45"     
[46] "casters.primary...46"        "casters.caster.id...47"      "casters.primary...48"        "casters.caster.id...49"      "casters.primary...50"       
[51] "casters.caster.id...51"      "casters.primary...52"        "casters.caster.id...53"      "casters.primary...54"        "casters.caster.id...55"     
[56] "casters.primary...56"        "casters.caster.id...57"      "casters.primary...58"        "casters.caster.id...59"      "casters.primary...60"       
[61] "casters.caster.id...61"      "has_incident_report"         "bracket_position"            "participants.seed...12"      "participants.score...13"    
[66] "participants.forfeit...14"   "participants.roster.id...15" "participants.winner...16"    "participants.stats...17"     "participants.seed...18"     
[71] "participants.score...19"     "participants.forfeit...20"   "participants.roster.id...21" "participants.winner...22"    "participants.stats...23"    
[76] "matches.id"                  "casters.primary...28"        "casters.caster.id...29"      "casters.primary...30"        "casters.caster.id...31"     
[81] "casters.primary...32"        "casters.caster.id...33"     

.name_repair = "unique" 非常有用,因为如果没有它,我们会得到许多名为 V20、V21、V30、V32 等的列。 但是,自动生成的名称仍然存在问题。

> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id" 

如果将这些列命名为 matches.id, matches.id.1, matches.id.2, ... 会更可取。 我们可以尝试在事后更改列名,但我们希望找到一种能够直接融入当前 rrapply() %>% map() %>% map() %>% map() %>% bind_rows() 管道的解决方案。 这可能吗?

编辑:到目前为止,我们正在做这样的事情:

# fix messy / dupe columns with weird ...23 suffixes
    # Collect, per repeated field, the names of all columns that belong to it.
    # fixed = TRUE treats the "." in each pattern as a literal dot instead of a
    # regex wildcard, so a name such as "matches_id" can never match by accident.
    all_cols <- colnames(output_df)
    match_cols <- grep("matches.id", all_cols, fixed = TRUE, value = TRUE)
    forfeit_cols <- grep("participants.forfeit", all_cols, fixed = TRUE, value = TRUE)
    score_cols <- grep("participants.score", all_cols, fixed = TRUE, value = TRUE)
    winner_cols <- grep("participants.winner", all_cols, fixed = TRUE, value = TRUE)
    seed_cols <- grep("participants.seed", all_cols, fixed = TRUE, value = TRUE)

    # Collapse each family of duplicated columns into one column. na.rm = TRUE
    # skips missing values when pasting; remove = TRUE drops the source columns.
    output_df <- output_df %>%
      tidyr::unite(
        "match_ids", all_of(match_cols),
        remove = TRUE, na.rm = TRUE
      ) %>%
      tidyr::unite(
        "is_forfeits", all_of(forfeit_cols),
        remove = TRUE, na.rm = TRUE
      ) %>%
      tidyr::unite(
        "scores", all_of(score_cols),
        remove = TRUE, na.rm = TRUE
      ) %>%
      tidyr::unite(
        "is_winners", all_of(winner_cols),
        remove = TRUE, na.rm = TRUE
      ) %>%
      tidyr::unite(
        "seeds", all_of(seed_cols),
        remove = TRUE, na.rm = TRUE
      )

它将这些重复的列折叠在一起,所以现在我们只需要将它们分开即可。 不理想,但似乎这会奏效。

我仍然不确定您要做什么,但您可以稍微清理一下管道(pipe):

# Flatten every record of `zed` into one tibble row, then merge each family of
# repeated columns into a single column, all without leaving the pipe.
zed %>%
  # f returns NA for a NULL element and leaves every other element untouched.
  rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>%
  # One row per record: flatten, transpose to a 1-row matrix, convert to a
  # tibble with de-duplicated ("unique") column names, and row-bind the results.
  purrr::map_df(function(record) {
    as_tibble(t(unlist(record)), .name_repair = "unique")
  }) %>%
  # Re-guess the column types from the flattened values.
  readr::type_convert() %>%
  # Merge each group of repeated columns, skipping NA values.
  tidyr::unite("match_ids", starts_with("matches.id"), na.rm = TRUE) %>%
  tidyr::unite("is_forfeits", starts_with("participants.forfeit"), na.rm = TRUE) %>%
  tidyr::unite("scores", starts_with("participants.score"), na.rm = TRUE) %>%
  tidyr::unite("is_winners", starts_with("participants.winner"), na.rm = TRUE) %>%
  tidyr::unite("seeds", starts_with("participants.seed"), na.rm = TRUE) %>%
  # NOTE(review): silencing messages from the earlier steps this way relies on
  # the pipe evaluating its left-hand side lazily (magrittr >= 2.0) - confirm.
  suppressMessages()

除了我刻意没有使用的 as.data.frame() 这一步之外,这段代码应当产生相同的输出,而且无需跳出管道,也不需要 match_cols 这样的辅助向量。

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM