简体   繁体   中英

dplyr::across performance and dplyr::summarise to data.table efficiency

dplyr is not liking my large dataset so I'm try to convert the following simple code to the most efficient data.table equivalent:

library(tidyverse)
data(iris)
iris$year <- rep(c(2000, 3000), each = 25) 
iris$color <- rep(c("red", "green","blue"), each = 50) 
iris$letter <- as.factor(rep(c("A", "B", "C"), each = 50)) 
head(iris, 3)

iris %>% 
  group_by(Species, year) %>% 
  summarise(across(c(-Sepal.Length, -Sepal.Width), dplyr::first), 
            across(c(Sepal.Length, Sepal.Width), dplyr::last)) %>% 
  ungroup

However my effort is giving me the wrong solution and is also not naming columns:

library(data.table)
final <- setDT(iris)[, c(
  lapply(setdiff(names(iris), c("Sepal.Length", "Sepal.Width")), head, 1), 
  lapply(c("Sepal.Length", "Sepal.Width"), tail, 1)
), by = c("Species", "year")]
final

Maybe there is a quicker/better data.table approach?

thanks

EDIT

When I let the above dplyr code run on my real data (~3million rows, 80 columns), I ran into memory problems. It ran in rstudio for about 15hours before aborting. summarise with across has been known to be slower than summarise_at (see here ) but I thought they should be the same now . Using the solutions below and taking the first 20000 rows of my real dataset, I microbenchmark times = 10 and got:

#NOTE this is on my real dataset so not reproducible 
microbenchmark(datatable <- as.data.table(real_data)[, c(lapply(.SD[, nm1, with = FALSE], first),
                                                         lapply(.SD[, nm2, with = FALSE], last)), .( Species, year)],
               
               collapse_package <- collap(real_data, ~  Species + year, custom = list(ffirst = nm1, flast = nm2)),
               
               sqldf_df <- fn$sqldf("
 with first_data as (select min(rowid), $byVar, $firstVar from real_data group by $byVar),
      last_data as (select max(rowid), $byVar, $lastVar from real_data group by $byVar)
 select $byVar, $firstVar, $lastVar from first_data left join last_data using($byVar)
", dbname = tempfile()),
               sqldf_df_no_dbname <- fn$sqldf("
 with first_data as (select min(rowid), $byVar, $firstVar from real_data group by $byVar),
      last_data as (select max(rowid), $byVar, $lastVar from real_data group by $byVar)
 select $byVar, $firstVar, $lastVar from first_data left join last_data using($byVar)
"),
               dplyr_sum_across <- real_data %>% 
                 group_by(Species, year) %>%  
                 summarise(
                   across(c(-Sepal.Length, -Sepal.Width), dplyr::first), 
                   across(c(Sepal.Length, Sepal.Width), dplyr::last)), times = 10)

#                         min         lq        mean     median         uq        max neval cld
# datatable         9664.3822  9974.6145 10211.00909 10130.2571 10438.7439 10872.2079    10  b 
# collapse_package     4.9311     5.0039     5.10331     5.0677     5.1597     5.5432    10  a  
# sqldf_df           394.3706   395.7660   403.82425   399.2484   401.9162   450.3884    10  a  
# sqldf_df_no_dbname 374.9822   380.2022   385.52904   382.6653   387.7198   402.9556    10  a  
# dplyr_sum_across 23969.3657 25055.5517 25800.82757 25653.1470 26262.3583 27616.5212    10  c

library(collapse) run times are very impressive! Nice overview here .

With data.table , we can use

nm1 <- c("Petal.Length", "Petal.Width", "color", "letter")
nm2 <- c("Sepal.Length", "Sepal.Width")
as.data.table(iris)[, c(lapply(.SD[, nm1, with = FALSE], first),
     lapply(.SD[, nm2, with = FALSE], last)), .(Species, year)]

-output

#      Species year Petal.Length Petal.Width color letter Sepal.Length Sepal.Width
#1:     setosa 2000          1.4         0.2   red      A          4.8         3.4
#2:     setosa 3000          1.6         0.2   red      A          5.0         3.3
#3: versicolor 2000          4.7         1.4 green      B          6.4         2.9
#4: versicolor 3000          4.4         1.4 green      B          5.7         2.8
#5:  virginica 2000          6.0         2.5  blue      C          6.7         3.3
#6:  virginica 3000          6.0         1.8  blue      C          5.9         3.0

Or another option is collapse

library(collapse)
collap(iris, ~ Species + year, custom = list(ffirst = nm1, flast = nm2))
#   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species year color letter
#1          4.8         3.4          1.4         0.2     setosa 2000   red      A
#2          5.0         3.3          1.6         0.2     setosa 3000   red      A
#3          6.4         2.9          4.7         1.4 versicolor 2000 green      B
#4          5.7         2.8          4.4         1.4 versicolor 3000 green      B
#5          6.7         3.3          6.0         2.5  virginica 2000  blue      C
#6          5.9         3.0          6.0         1.8  virginica 3000  blue      C
 

1) sqldf sqldf can perform the calculation outside of R by specifying an external database name (see dbname= argument used below) so that R memory limitations for intermediate calculations do not affect it. You can also try it without the dbname= argument just in case you actually do have enough memory.

library(sqldf)

# enclose each argument in [...] and then create comma separated string
varString <- function(...) toString(sprintf("[%s]", c(...)))

firstVar <- varString("Petal.Length", "Petal.Width", "color", "letter")
lastVar <- varString("Sepal.Length", "Sepal.Width")
byVar <- varString("Species", "year")

fn$sqldf("
 with first_data as (select min(rowid), $byVar, $firstVar from iris group by $byVar),
      last_data as (select max(rowid), $byVar, $lastVar from iris group by $byVar)
 select $byVar, $firstVar, $lastVar from first_data left join last_data using($byVar)
", dbname = tempfile())

giving:

     Species year Petal.Length Petal.Width color letter Sepal.Length Sepal.Width
1     setosa 2000          1.4         0.2   red      A          4.8         3.4
2     setosa 3000          1.6         0.2   red      A          5.0         3.3
3 versicolor 2000          4.7         1.4 green      B          6.4         2.9
4 versicolor 3000          4.4         1.4 green      B          5.7         2.8
5  virginica 2000          6.0         2.5  blue      C          6.7         3.3
6  virginica 3000          6.0         1.8  blue      C          5.9         3.0

2) Base R This solution only uses base R. The line computing a unique key for each combination of Species and year might need to be modified for other data.

key <- as.integer(iris$Species) + as.integer(iris$year)
i <- !duplicated(key)
j <- !duplicated(key, fromLast = TRUE)
data.frame(
  iris[i, c("Species", "year", "Petal.Length", "Petal.Width")], 
  iris[j, c("Sepal.Length", "Sepal.Width")]
)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM