简体   繁体   中英

In R, generate proper column names when flattening nested list of lists

Losing duplicate column names when flattening list-of-lists into dataframes in R - here is a related post from last year that we are building from. We are making an API call and receiving the following raw data, that we need to convert into a "clean" dataframe:

List of Lists returned by API Call

dput(raw_list)

list(list(id = 248013L, title = "Knockout Stage  - Final", start = "2020-10-31T10:25:00Z", 
    end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL, 
    lifecycle = "over", tier = 1L, best_of = 5L, chain = list(), 
    streamed = TRUE, bracket_position = list(part = "UB", col = 0L, 
        offset = 1L), participants = list(list(seed = 1L, score = 1L, 
        forfeit = FALSE, roster = list(id = 75421L), winner = FALSE, 
        stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE, 
        roster = list(id = 67585L), winner = TRUE, stats = NULL)), 
    tournament = list(id = 5088L), substage = list(id = 26285L), 
    game = list(id = 2L), matches = list(list(id = 443031L), 
        list(id = 443032L), list(id = 443033L), list(id = 443034L), 
        list(id = 443035L)), casters = list(list(primary = TRUE, 
        caster = list(id = 47L)), list(primary = FALSE, caster = list(
        id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
        list(primary = FALSE, caster = list(id = 449L)), list(
            primary = FALSE, caster = list(id = 524L)), list(
            primary = FALSE, caster = list(id = 1009L)), list(
            primary = FALSE, caster = list(id = 1567L)), list(
            primary = FALSE, caster = list(id = 1589L)), list(
            primary = FALSE, caster = list(id = 1591L)), list(
            primary = FALSE, caster = list(id = 3589L)), list(
            primary = FALSE, caster = list(id = 3635L)), list(
            primary = FALSE, caster = list(id = 4238L)), list(
            primary = FALSE, caster = list(id = 5163L)), list(
            primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE), 
    list(id = 248014L, title = "Knockout Stage  - Semifinal", 
        start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 5L, chain = list(), streamed = TRUE, 
        bracket_position = list(part = "UB", col = 1L, offset = 2L), 
        participants = list(list(seed = 1L, score = 1L, forfeit = FALSE, 
            roster = list(id = 70687L), winner = FALSE, stats = NULL), 
            list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
                id = 75421L), winner = TRUE, stats = NULL)), 
        tournament = list(id = 5088L), substage = list(id = 26285L), 
        game = list(id = 2L), matches = list(list(id = 443021L), 
            list(id = 443022L), list(id = 443023L), list(id = 443024L), 
            list(id = 443025L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE), list(id = 251494L, title = "Group B", 
        start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z", 
        postponed_from = NULL, deleted_at = NULL, lifecycle = "over", 
        tier = 1L, best_of = 1L, chain = list(), streamed = TRUE, 
        bracket_position = NULL, participants = list(list(seed = 1L, 
            score = 1L, forfeit = FALSE, roster = list(id = 61033L), 
            winner = TRUE, stats = NULL), list(seed = 2L, score = 0L, 
            forfeit = FALSE, roster = list(id = 60051L), winner = FALSE, 
            stats = NULL)), tournament = list(id = 5088L), substage = list(
            id = 26282L), game = list(id = 2L), matches = list(
            list(id = 449511L)), casters = list(list(primary = TRUE, 
            caster = list(id = 47L)), list(primary = FALSE, caster = list(
            id = 48L)), list(primary = FALSE, caster = list(id = 425L)), 
            list(primary = FALSE, caster = list(id = 449L)), 
            list(primary = FALSE, caster = list(id = 524L)), 
            list(primary = FALSE, caster = list(id = 1009L)), 
            list(primary = FALSE, caster = list(id = 1567L)), 
            list(primary = FALSE, caster = list(id = 1589L)), 
            list(primary = FALSE, caster = list(id = 1591L)), 
            list(primary = FALSE, caster = list(id = 3589L)), 
            list(primary = FALSE, caster = list(id = 3635L)), 
            list(primary = FALSE, caster = list(id = 4238L)), 
            list(primary = FALSE, caster = list(id = 5163L)), 
            list(primary = FALSE, caster = list(id = 5164L))), 
        has_incident_report = FALSE))

Here is what we have currently. Our issue here is with the names generated from .name_repair = "unique" :

output_df <- zed %>%
      rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
      purrr::map(unlist) %>% 
      purrr::map(t) %>% 
      {suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
      dplyr::bind_rows() %>%
      {suppressMessages(readr::type_convert(.))} %>%
      as.data.frame()

> colnames(output_df)
 [1] "id"                          "title"                       "start"                       "end"                         "postponed_from"             
 [6] "deleted_at"                  "lifecycle"                   "tier"                        "best_of"                     "streamed"                   
[11] "bracket_position.part"       "bracket_position.col"        "bracket_position.offset"     "participants.seed...14"      "participants.score...15"    
[16] "participants.forfeit...16"   "participants.roster.id...17" "participants.winner...18"    "participants.stats...19"     "participants.seed...20"     
[21] "participants.score...21"     "participants.forfeit...22"   "participants.roster.id...23" "participants.winner...24"    "participants.stats...25"    
[26] "tournament.id"               "substage.id"                 "game.id"                     "matches.id...29"             "matches.id...30"            
[31] "matches.id...31"             "matches.id...32"             "matches.id...33"             "casters.primary...34"        "casters.caster.id...35"     
[36] "casters.primary...36"        "casters.caster.id...37"      "casters.primary...38"        "casters.caster.id...39"      "casters.primary...40"       
[41] "casters.caster.id...41"      "casters.primary...42"        "casters.caster.id...43"      "casters.primary...44"        "casters.caster.id...45"     
[46] "casters.primary...46"        "casters.caster.id...47"      "casters.primary...48"        "casters.caster.id...49"      "casters.primary...50"       
[51] "casters.caster.id...51"      "casters.primary...52"        "casters.caster.id...53"      "casters.primary...54"        "casters.caster.id...55"     
[56] "casters.primary...56"        "casters.caster.id...57"      "casters.primary...58"        "casters.caster.id...59"      "casters.primary...60"       
[61] "casters.caster.id...61"      "has_incident_report"         "bracket_position"            "participants.seed...12"      "participants.score...13"    
[66] "participants.forfeit...14"   "participants.roster.id...15" "participants.winner...16"    "participants.stats...17"     "participants.seed...18"     
[71] "participants.score...19"     "participants.forfeit...20"   "participants.roster.id...21" "participants.winner...22"    "participants.stats...23"    
[76] "matches.id"                  "casters.primary...28"        "casters.caster.id...29"      "casters.primary...30"        "casters.caster.id...31"     
[81] "casters.primary...32"        "casters.caster.id...33"     

.name_repair = "unique" is very helpful because without it, we would have many column names V20 , V21 , V30 , V32 , etc. However, the auto-generated names are still problematic.

> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id" 

If would be preferable if these columns were named matches.id, matches.id.1, matches.id.2, ... . We can attempt to change the column names after the matter, however we are looking for a solution that creates the correct column names in our current chain of rrapply() %>% map() %>% map() %>% map() %>% bind_rows(). Is this possible?

EDIT: So far we're doing something like this:

# fix messy / dupe columns with weird ...23 suffixes
    match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
    forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
    score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
    winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
    seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
    
    output_df <- output_df %>% 
      tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
      tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)  

which collapses these duplicate columns together, so now we simply need to split them back up. Not ideal but seems like this will work.

I'm still unsure what you are trying to do, but you could clean up your pipe a little bit:

zed %>%
  rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>% 
  purrr::map_df(
    ~ unlist(.x) %>% 
      t() %>% 
      as_tibble(.name_repair = "unique")
  ) %>%
  readr::type_convert() %>% 
  tidyr::unite('match_ids', starts_with('matches.id'), na.rm = TRUE) %>%
  tidyr::unite('is_forfeits', starts_with('participants.forfeit'), na.rm = TRUE) %>%
  tidyr::unite('scores', starts_with('participants.score'), na.rm = TRUE) %>%
  tidyr::unite('is_winners', starts_with('participants.winner'), na.rm = TRUE) %>%
  tidyr::unite('seeds', starts_with('participants.seed'), na.rm = TRUE) %>% 
  suppressMessages()

minus the as.data.frame() -part which I refuse to use. This should create the same output without leaving the pipe and without needing auxiliary vectors like match_cols .

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM