Converting this
g1 g2 desc val
A a 1 v1
A a 2 v2
A b 3 v3
To:
desc val
A
a
1 v1
2 v2
b
3 v3
I've converted a hierarchical data frame with two grouping levels into a structured list using a for loop. This displayed descriptions with an associated variable in a list interspersed with the group levels in order.
The purpose is to present the hierarchical data as a list so that it can be printed with formatting to distinguish the different grouping levels, using openxlsx.
Is there a more efficient base R, tidyverse or other approach to achieve this?
For loop code
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
I believe you can simplify and slightly optimize your code like this :
library(dplyr)
library(tidyr)
library(microbenchmark)
microbenchmark(
old = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
}
,
new_simple = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_group <- function(sub_df){
tibble(
desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
, val = c(NA, NA, sub_df$val)
)
}
res_df <- bind_rows(lapply(tib_list, convert_group))
}
,
new_fast = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_desc <- function(sub_df){
c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
}
convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
res_df <- tibble(
desc = sapply(tib_list, convert_desc)
, val = sapply(tib_list, convert_val)
)
}
)
This gives me the following output:
Unit: milliseconds
expr min lq mean median uq max neval
old 41.06535 43.52606 49.42744 47.29305 52.74399 76.98021 100
new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893 100
new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442 100
Using a few packages from the tidyverse
, we could do:
library(tidyverse)
# or explicitly load what you need
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
transpose(df) %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^v"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values)) %>%
drop_na(values) %>%
select(values, ind) %>%
replace_na(list(ind = ""))
Returns:
values ind 1 A 2 a 3 1 v1 5 2 v2 7 b 8 3 v3
Using tib
data set, my solution seems to be a little slower than Plamen's:
Unit: milliseconds expr min lq mean median uq max neval old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223 100 new_simple 6.742158 7.013732 7.638155 7.190095 7.759104 12.640237 100 new_fast 4.064907 4.266243 4.837131 4.507865 4.871533 9.442904 100 tidyverse 4.980664 5.326694 6.004602 5.552611 6.215129 9.923524 100
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.