简体   繁体   English

在 R 中,在展平列表的嵌套列表时生成正确的列名

[英]In R, generate proper column names when flattening nested list of lists

Losing duplicate column names when flattening list-of-lists into dataframes in R - here is a related post from last year that we are building from. 在 R 中将列表的列表展平为数据帧时丢失重复的列名- 这是我们正在构建的去年的相关帖子。 We are making an API call and receiving the following raw data, that we need to convert into a "clean" dataframe:我们正在调用 API 并接收以下原始数据,我们需要将其转换为“干净的” dataframe:

List of Lists returned by API Call API调用返回的List列表

dput(raw_list)

list(list(id = 248013L, title = "Knockout Stage  - Final", start = "2020-10-31T10:25:00Z", 
    end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL, 
    lifecycle = "over", tier = 1L, best_of = 5L, chain = list(), 
    streamed = TRUE, bracket_position = list(part = "UB", col = 0L, 
        offset = 1L), participants = list(list(seed = 1L, score = 1L, 
        forfeit = FALSE, roster = list(id = 75421L), winner = FALSE, 
        stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE, 
        roster = list(id = 67585L), winner = TRUE, stats = NULL)), 
    tournament = list(id = 5088L), substage = list(id = 26285L), 
    game = list(id = 2L), matches = list(list(id = 443031L), 
        list(id = 443032L), list(id = 443033L), list(id = 443034L), 
        list(id = 443035L)), casters = list(list(primary = TRUE, 
        caster = list(id = 47L)), list(primary = FALSE, caster = list(
        id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
        list(primary = FALSE, caster = list(id = 449L)), list(
            primary = FALSE, caster = list(id = 524L)), list(
            primary = FALSE, caster = list(id = 1009L)), list(
            primary = FALSE, caster = list(id = 1567L)), list(
            primary = FALSE, caster = list(id = 1589L)), list(
            primary = FALSE, caster = list(id = 1591L)), list(
            primary = FALSE, caster = list(id = 3589L)), list(
            primary = FALSE, caster = list(id = 3635L)), list(
            primary = FALSE, caster = list(id = 4238L)), list(
            primary = FALSE, caster = list(id = 5163L)), list(
            primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE), 
    list(id = 248014L, title = "Knockout Stage  - Semifinal", 
        start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 5L, chain = list(), streamed = TRUE, 
        bracket_position = list(part = "UB", col = 1L, offset = 2L), 
        participants = list(list(seed = 1L, score = 1L, forfeit = FALSE, 
            roster = list(id = 70687L), winner = FALSE, stats = NULL), 
            list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
                id = 75421L), winner = TRUE, stats = NULL)), 
        tournament = list(id = 5088L), substage = list(id = 26285L), 
        game = list(id = 2L), matches = list(list(id = 443021L), 
            list(id = 443022L), list(id = 443023L), list(id = 443024L), 
            list(id = 443025L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE), list(id = 251494L, title = "Group B", 
        start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 1L, chain = list(), streamed = TRUE, 
        bracket_position = NULL, participants = list(list(seed = 1L, 
            score = 1L, forfeit = FALSE, roster = list(id = 61033L), 
            winner = TRUE, stats = NULL), list(seed = 2L, score = 0L, 
            forfeit = FALSE, roster = list(id = 60051L), winner = FALSE, 
            stats = NULL)), tournament = list(id = 5088L), substage = list(
            id = 26282L), game = list(id = 2L), matches = list(
            list(id = 449511L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE))

Here is what we have currently.这是我们目前拥有的。 Our issue here is with the names generated from .name_repair = "unique" :我们这里的问题是从.name_repair = "unique"生成的名称:

output_df <- zed %>%
      rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
      purrr::map(unlist) %>% 
      purrr::map(t) %>% 
      {suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
      dplyr::bind_rows() %>%
      {suppressMessages(readr::type_convert(.))} %>%
      as.data.frame()

> colnames(output_df)
 [1] "id"                          "title"                       "start"                       "end"                         "postponed_from"             
 [6] "deleted_at"                  "lifecycle"                   "tier"                        "best_of"                     "streamed"                   
[11] "bracket_position.part"       "bracket_position.col"        "bracket_position.offset"     "participants.seed...14"      "participants.score...15"    
[16] "participants.forfeit...16"   "participants.roster.id...17" "participants.winner...18"    "participants.stats...19"     "participants.seed...20"     
[21] "participants.score...21"     "participants.forfeit...22"   "participants.roster.id...23" "participants.winner...24"    "participants.stats...25"    
[26] "tournament.id"               "substage.id"                 "game.id"                     "matches.id...29"             "matches.id...30"            
[31] "matches.id...31"             "matches.id...32"             "matches.id...33"             "casters.primary...34"        "casters.caster.id...35"     
[36] "casters.primary...36"        "casters.caster.id...37"      "casters.primary...38"        "casters.caster.id...39"      "casters.primary...40"       
[41] "casters.caster.id...41"      "casters.primary...42"        "casters.caster.id...43"      "casters.primary...44"        "casters.caster.id...45"     
[46] "casters.primary...46"        "casters.caster.id...47"      "casters.primary...48"        "casters.caster.id...49"      "casters.primary...50"       
[51] "casters.caster.id...51"      "casters.primary...52"        "casters.caster.id...53"      "casters.primary...54"        "casters.caster.id...55"     
[56] "casters.primary...56"        "casters.caster.id...57"      "casters.primary...58"        "casters.caster.id...59"      "casters.primary...60"       
[61] "casters.caster.id...61"      "has_incident_report"         "bracket_position"            "participants.seed...12"      "participants.score...13"    
[66] "participants.forfeit...14"   "participants.roster.id...15" "participants.winner...16"    "participants.stats...17"     "participants.seed...18"     
[71] "participants.score...19"     "participants.forfeit...20"   "participants.roster.id...21" "participants.winner...22"    "participants.stats...23"    
[76] "matches.id"                  "casters.primary...28"        "casters.caster.id...29"      "casters.primary...30"        "casters.caster.id...31"     
[81] "casters.primary...32"        "casters.caster.id...33"     

.name_repair = "unique" is very helpful because without it, we would have many column names V20 , V21 , V30 , V32 , etc. However, the auto-generated names are still problematic. .name_repair = "unique"非常有用,因为如果没有它,我们将有许多列名称V20V21V30V32等。但是,自动生成的名称仍然存在问题。

> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id" 

If would be preferable if these columns were named matches.id, matches.id.1, matches.id.2, ... .如果将这些列命名为matches.id, matches.id.1, matches.id.2, ...会更可取。 We can attempt to change the column names after the matter, however we are looking for a solution that creates the correct column names in our current chain of rrapply() %>% map() %>% map() %>% map() %>% bind_rows().我们可以尝试在事后更改列名,但是我们正在寻找一种解决方案,可以在我们当前的 rrapply() %>% map() %>% map() %>% map( ) %>% 绑定行()。 Is this possible?这可能吗?

EDIT: So far we're doing something like this:编辑:到目前为止,我们正在做这样的事情:

# fix messy / dupe columns with weird ...23 suffixes
    match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
    forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
    score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
    winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
    seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
    
    output_df <- output_df %>% 
      tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)  

which collapses these duplicate columns together, so now we simply need to split them back up.它将这些重复的列折叠在一起,所以现在我们只需要将它们分开即可。 Not ideal but seems like this will work.不理想,但似乎这会奏效。

I'm still unsure what you are trying to do, but you could clean up your pipe a little bit:我仍然不确定您要做什么,但您可以稍微清理一下 pipe:

zed %>%
  rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>% 
  purrr::map_df(
    ~ unlist(.x) %>% 
      t() %>% 
      as_tibble(.name_repair = "unique")
  ) %>%
  readr::type_convert() %>% 
  tidyr::unite('match_ids', starts_with('matches.id'), na.rm = TRUE) %>%
  tidyr::unite('is_forfeits', starts_with('participants.forfeit'), na.rm = TRUE) %>%
  tidyr::unite('scores', starts_with('participants.score'), na.rm = TRUE) %>%
  tidyr::unite('is_winners', starts_with('participants.winner'), na.rm = TRUE) %>%
  tidyr::unite('seeds', starts_with('participants.seed'), na.rm = TRUE) %>% 
  suppressMessages()

minus the as.data.frame() -part which I refuse to use.减去我拒绝使用的as.data.frame()部分。 This should create the same output without leaving the pipe and without needing auxiliary vectors like match_cols .这应该创建相同的 output 而不离开 pipe 并且不需要像match_cols这样的辅助向量。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM