[英]In R, generate proper column names when flattening nested list of lists
《在 R 中将列表的列表展平为数据框时丢失重复的列名》—— 这是去年的一篇相关帖子,本问题是在它的基础上继续展开的。我们正在调用一个 API 并收到如下原始数据,需要将其转换为“干净的” dataframe:
API调用返回的List列表
dput(raw_list)
list(list(id = 248013L, title = "Knockout Stage - Final", start = "2020-10-31T10:25:00Z",
end = "2020-10-31T14:17:00Z", postponed_from = NULL, deleted_at = NULL,
lifecycle = "over", tier = 1L, best_of = 5L, chain = list(),
streamed = TRUE, bracket_position = list(part = "UB", col = 0L,
offset = 1L), participants = list(list(seed = 1L, score = 1L,
forfeit = FALSE, roster = list(id = 75421L), winner = FALSE,
stats = NULL), list(seed = 2L, score = 3L, forfeit = FALSE,
roster = list(id = 67585L), winner = TRUE, stats = NULL)),
tournament = list(id = 5088L), substage = list(id = 26285L),
game = list(id = 2L), matches = list(list(id = 443031L),
list(id = 443032L), list(id = 443033L), list(id = 443034L),
list(id = 443035L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)), list(
primary = FALSE, caster = list(id = 524L)), list(
primary = FALSE, caster = list(id = 1009L)), list(
primary = FALSE, caster = list(id = 1567L)), list(
primary = FALSE, caster = list(id = 1589L)), list(
primary = FALSE, caster = list(id = 1591L)), list(
primary = FALSE, caster = list(id = 3589L)), list(
primary = FALSE, caster = list(id = 3635L)), list(
primary = FALSE, caster = list(id = 4238L)), list(
primary = FALSE, caster = list(id = 5163L)), list(
primary = FALSE, caster = list(id = 5164L))), has_incident_report = FALSE),
list(id = 248014L, title = "Knockout Stage - Semifinal",
start = "2020-10-25T10:07:00Z", end = "2020-10-25T13:57:16Z",
postponed_from = NULL, deleted_at = NULL, lifecycle = "over",
tier = 1L, best_of = 5L, chain = list(), streamed = TRUE,
bracket_position = list(part = "UB", col = 1L, offset = 2L),
participants = list(list(seed = 1L, score = 1L, forfeit = FALSE,
roster = list(id = 70687L), winner = FALSE, stats = NULL),
list(seed = 2L, score = 3L, forfeit = FALSE, roster = list(
id = 75421L), winner = TRUE, stats = NULL)),
tournament = list(id = 5088L), substage = list(id = 26285L),
game = list(id = 2L), matches = list(list(id = 443021L),
list(id = 443022L), list(id = 443023L), list(id = 443024L),
list(id = 443025L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)),
list(primary = FALSE, caster = list(id = 524L)),
list(primary = FALSE, caster = list(id = 1009L)),
list(primary = FALSE, caster = list(id = 1567L)),
list(primary = FALSE, caster = list(id = 1589L)),
list(primary = FALSE, caster = list(id = 1591L)),
list(primary = FALSE, caster = list(id = 3589L)),
list(primary = FALSE, caster = list(id = 3635L)),
list(primary = FALSE, caster = list(id = 4238L)),
list(primary = FALSE, caster = list(id = 5163L)),
list(primary = FALSE, caster = list(id = 5164L))),
has_incident_report = FALSE), list(id = 251494L, title = "Group B",
start = "2020-10-05T11:04:00Z", end = "2020-10-05T11:44:27Z",
postponed_from = NULL, deleted_at = NULL, lifecycle = "over",
tier = 1L, best_of = 1L, chain = list(), streamed = TRUE,
bracket_position = NULL, participants = list(list(seed = 1L,
score = 1L, forfeit = FALSE, roster = list(id = 61033L),
winner = TRUE, stats = NULL), list(seed = 2L, score = 0L,
forfeit = FALSE, roster = list(id = 60051L), winner = FALSE,
stats = NULL)), tournament = list(id = 5088L), substage = list(
id = 26282L), game = list(id = 2L), matches = list(
list(id = 449511L)), casters = list(list(primary = TRUE,
caster = list(id = 47L)), list(primary = FALSE, caster = list(
id = 48L)), list(primary = FALSE, caster = list(id = 425L)),
list(primary = FALSE, caster = list(id = 449L)),
list(primary = FALSE, caster = list(id = 524L)),
list(primary = FALSE, caster = list(id = 1009L)),
list(primary = FALSE, caster = list(id = 1567L)),
list(primary = FALSE, caster = list(id = 1589L)),
list(primary = FALSE, caster = list(id = 1591L)),
list(primary = FALSE, caster = list(id = 3589L)),
list(primary = FALSE, caster = list(id = 3635L)),
list(primary = FALSE, caster = list(id = 4238L)),
list(primary = FALSE, caster = list(id = 5163L)),
list(primary = FALSE, caster = list(id = 5164L))),
has_incident_report = FALSE))
这是我们目前的代码(其中的 zed 应即上文 dput 所示的 raw_list)。我们这里的问题出在由 .name_repair = "unique" 生成的列名上:
# Flatten every element of `zed` into a one-row tibble, stack the rows, and
# store the result as a plain data.frame.
output_df <- zed %>%
# `f` turns a NULL value into NA and leaves any other value unchanged.
rrapply(., f = function(x) replace(x, is.null(x), NA)) %>%
# Collapse each nested element into a flat named vector ...
purrr::map(unlist) %>%
# ... and transpose it so the values lie along a single row.
purrr::map(t) %>%
# Repeated names get positional `...N` suffixes from the "unique" repair; the
# messages emitted during repair are silenced.
{suppressMessages(purrr::map(., as_tibble, .name_repair = "unique"))} %>%
# Columns are matched by name, so rows whose suffixes differ produce separate columns.
dplyr::bind_rows() %>%
# Re-parse the columns with readr, again silencing its messages.
{suppressMessages(readr::type_convert(.))} %>%
as.data.frame()
> colnames(output_df)
[1] "id" "title" "start" "end" "postponed_from"
[6] "deleted_at" "lifecycle" "tier" "best_of" "streamed"
[11] "bracket_position.part" "bracket_position.col" "bracket_position.offset" "participants.seed...14" "participants.score...15"
[16] "participants.forfeit...16" "participants.roster.id...17" "participants.winner...18" "participants.stats...19" "participants.seed...20"
[21] "participants.score...21" "participants.forfeit...22" "participants.roster.id...23" "participants.winner...24" "participants.stats...25"
[26] "tournament.id" "substage.id" "game.id" "matches.id...29" "matches.id...30"
[31] "matches.id...31" "matches.id...32" "matches.id...33" "casters.primary...34" "casters.caster.id...35"
[36] "casters.primary...36" "casters.caster.id...37" "casters.primary...38" "casters.caster.id...39" "casters.primary...40"
[41] "casters.caster.id...41" "casters.primary...42" "casters.caster.id...43" "casters.primary...44" "casters.caster.id...45"
[46] "casters.primary...46" "casters.caster.id...47" "casters.primary...48" "casters.caster.id...49" "casters.primary...50"
[51] "casters.caster.id...51" "casters.primary...52" "casters.caster.id...53" "casters.primary...54" "casters.caster.id...55"
[56] "casters.primary...56" "casters.caster.id...57" "casters.primary...58" "casters.caster.id...59" "casters.primary...60"
[61] "casters.caster.id...61" "has_incident_report" "bracket_position" "participants.seed...12" "participants.score...13"
[66] "participants.forfeit...14" "participants.roster.id...15" "participants.winner...16" "participants.stats...17" "participants.seed...18"
[71] "participants.score...19" "participants.forfeit...20" "participants.roster.id...21" "participants.winner...22" "participants.stats...23"
[76] "matches.id" "casters.primary...28" "casters.caster.id...29" "casters.primary...30" "casters.caster.id...31"
[81] "casters.primary...32" "casters.caster.id...33"
.name_repair = "unique" 非常有用,因为如果没有它,我们会得到许多名为 V20、V21、V30、V32 等的列。但是,自动生成的名称仍然存在问题:
> colnames(output_df)[grepl('matches.id', colnames(output_df))]
[1] "matches.id...29" "matches.id...30" "matches.id...31" "matches.id...32" "matches.id...33" "matches.id"
如果这些列能命名为 matches.id, matches.id.1, matches.id.2, ... 会更可取。我们可以尝试事后再修改列名,但我们希望找到一种能够直接融入当前 rrapply() %>% map() %>% map() %>% map() %>% bind_rows() 管道的解决方案。这可能吗?
编辑:到目前为止,我们正在做这样的事情:
# fix messy / dupe columns with weird ...23 suffixes
# Collect, per repeated field, the names of all columns that match it.
# NOTE(review): the patterns are regular expressions, so "." matches any
# character rather than only a literal dot.
match_cols <- colnames(output_df)[grepl('matches.id', colnames(output_df))]
forfeit_cols <- colnames(output_df)[grepl('participants.forfeit', colnames(output_df))]
score_cols <- colnames(output_df)[grepl('participants.score', colnames(output_df))]
winner_cols <- colnames(output_df)[grepl('participants.winner', colnames(output_df))]
seed_cols <- colnames(output_df)[grepl('participants.seed', colnames(output_df))]
# Merge each group into a single column using unite()'s default separator;
# NA entries are skipped (na.rm = TRUE) and the source columns are dropped
# (remove = TRUE).
output_df <- output_df %>%
tidyr::unite('match_ids', all_of(match_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('is_forfeits', all_of(forfeit_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('scores', all_of(score_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('is_winners', all_of(winner_cols), remove = TRUE, na.rm = TRUE) %>%
tidyr::unite('seeds', all_of(seed_cols), remove = TRUE, na.rm = TRUE)
这会把这些重复的列合并成一列,所以接下来我们只需要再把它们拆分开即可。虽然不理想,但看起来可行。
我仍然不确定您要做什么,但您可以稍微清理一下 pipe:
# Single-pipe variant: flatten each element of `zed` into a one-row tibble,
# combine the rows, then merge the repeated columns. The result is not
# assigned here.
zed %>%
# `f` turns a NULL value into NA and leaves any other value unchanged.
rrapply::rrapply(f = function(x) replace(x, is.null(x), NA)) %>%
purrr::map_df(
# Per element: flatten to a named vector, transpose to one row, convert to
# a tibble with "unique" name repair.
~ unlist(.x) %>%
t() %>%
as_tibble(.name_repair = "unique")
) %>%
readr::type_convert() %>%
# starts_with() picks each group of repeated columns by prefix, so no helper
# vectors of column names are needed.
tidyr::unite('match_ids', starts_with('matches.id'), na.rm = TRUE) %>%
tidyr::unite('is_forfeits', starts_with('participants.forfeit'), na.rm = TRUE) %>%
tidyr::unite('scores', starts_with('participants.score'), na.rm = TRUE) %>%
tidyr::unite('is_winners', starts_with('participants.winner'), na.rm = TRUE) %>%
tidyr::unite('seeds', starts_with('participants.seed'), na.rm = TRUE) %>%
# NOTE(review): silencing the whole chain from its last step presumably
# relies on the pipe evaluating its left-hand side lazily — confirm with the
# installed magrittr version.
suppressMessages()
这里去掉了 as.data.frame() 这一步(我不愿使用它)。这样应当能得到相同的 output,同时无需离开 pipe,也不需要 match_cols 之类的辅助向量。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.