简体   繁体   中英

R: Summarize groups of rows in one data.frame with groups based on whether a date column's value falls in a time range in another data.frame

I am trying to find the rows sums for each column in data frame df_count (cars, buses,trucks) between the time frames given in each row in the data frame start_end

So for example, row 1 of start_end ranges from 2021-06-12 00:15:00 to 2021-06-12 00:55:00.

I want to find the row sum of cars (for example) between these timestamps in column 1 of df_count (rows 5 to 12)

 df_count <- structure(list(date = structure(c(1623456000, 1623456300, 1623456600, 
    1623456900, 1623457200, 1623457500, 1623457800, 1623458100, 1623458400, 
    1623458700, 1623459000, 1623459300, 1623459600, 1623459900, 1623460200, 
    1623460500, 1623460800, 1623461100, 1623461400, 1623461700, 1623462000, 
    1623462300, 1623462600, 1623462900, 1623463200, 1623463500, 1623463800, 
    1623464100, 1623464400, 1623464700), tzone = "UTC", class = c("POSIXct", 
    "POSIXt")), cars = c(45, 45, 45, 52, 52, 52, 46, 46, 46, 34, 
    34, 34, 29, 29, 29, 36, 36, 36, 17, 17, 17, 18, 18, 18, 14, 14, 
    14, 3, 3, 3), buses = c(4, 4, 4, 7, 7, 7, 5, 5, 5, 4, 4, 4, 5, 
    5, 5, 4, 4, 4, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1), trucks = c(3, 
    3, 3, 2, 2, 2, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 
    2, 2, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -30L), class = c("tbl_df", 
    "tbl", "data.frame"))
    
    start_end <- structure(list(start_co2_plume = c("2021-06-12 00:15:00", "2021-06-12 00:55:00", 
    "2021-06-12 01:15:00", "2021-06-12 01:30:00", "2021-06-12 02:00:00", 
    "2021-06-12 02:25:00", "2021-06-12 03:00:00", "2021-06-12 03:20:00", 
    "2021-06-12 03:45:00", "2021-06-12 03:55:00", "2021-06-12 04:20:00", 
    "2021-06-12 04:35:00", "2021-06-12 04:50:00", "2021-06-12 05:10:00", 
    "2021-06-12 05:40:00", "2021-06-12 05:50:00", "2021-06-12 06:00:00", 
    "2021-06-12 06:10:00", "2021-06-12 06:25:00", "2021-06-12 06:35:00", 
    "2021-06-12 06:45:00", "2021-06-12 06:55:00", "2021-06-12 08:10:00", 
    "2021-06-12 08:30:00", "2021-06-12 08:55:00", "2021-06-12 09:45:00", 
    "2021-06-12 10:05:00", "2021-06-12 10:35:00", "2021-06-12 11:05:00", 
    "2021-06-12 11:25:00"), end_co2_plume = c("2021-06-12 00:55:00", 
    "2021-06-12 01:15:00", "2021-06-12 01:30:00", "2021-06-12 02:00:00", 
    "2021-06-12 02:25:00", "2021-06-12 03:00:00", "2021-06-12 03:20:00", 
    "2021-06-12 03:35:00", "2021-06-12 03:55:00", "2021-06-12 04:10:00", 
    "2021-06-12 04:35:00", "2021-06-12 04:50:00", "2021-06-12 05:10:00", 
    "2021-06-12 05:30:00", "2021-06-12 05:50:00", "2021-06-12 06:00:00", 
    "2021-06-12 06:10:00", "2021-06-12 06:25:00", "2021-06-12 06:35:00", 
    "2021-06-12 06:45:00", "2021-06-12 06:55:00", "2021-06-12 07:10:00", 
    "2021-06-12 08:30:00", "2021-06-12 08:55:00", "2021-06-12 09:10:00", 
    "2021-06-12 10:05:00", "2021-06-12 10:25:00", "2021-06-12 10:50:00", 
    "2021-06-12 11:25:00", "2021-06-12 11:45:00")), row.names = c(NA, 
    30L), class = "data.frame")

The below produces the desired output. It is necessary to assume the time zones for the dates, so I assumed they came from the same time zone.

library(purrr)
library(dplyr)

# Convert dates in start_end from character vectors to date classes
# Assumes the times are in the same time zone
start_end <- start_end %>% mutate(start_date = as.POSIXct(start_co2_plume, tz = "UTC"),
                                  end_date = as.POSIXct(end_co2_plume, tz = "UTC"))

# For each row in start_end, subset df_count to the rows whose dates fall in
# the the interval defined by the start_date and end_date values for that row.
# For each automobile column, sum the values and add an index to tell us which
# interval it came from. 
results <-
  pmap(list(start_end$start_date, start_end$end_date, 1:nrow(start_end)),
       function(start, end, ind) {
         df_count %>%
           filter((date >= start) & (date < end)) %>%
           select(-date) %>%
           summarise(across(everything(), sum)) %>%
           mutate(interval_id = ind,
                  start = start,
                  end = end)
       })

# Combine into a single data.frame
results %>% bind_rows()
#> # A tibble: 30 x 6
#>     cars buses trucks interval_id start               end                
#>    <dbl> <dbl>  <dbl>       <int> <dttm>              <dttm>             
#>  1   362    44     26           1 2021-06-12 00:15:00 2021-06-12 00:55:00
#>  2   121    19     13           2 2021-06-12 00:55:00 2021-06-12 01:15:00
#>  3   108    12      9           3 2021-06-12 01:15:00 2021-06-12 01:30:00
#>  4   105    12     15           4 2021-06-12 01:30:00 2021-06-12 02:00:00
#>  5    48     5      5           5 2021-06-12 02:00:00 2021-06-12 02:25:00
#>  6     3     1      1           6 2021-06-12 02:25:00 2021-06-12 03:00:00
#>  7     0     0      0           7 2021-06-12 03:00:00 2021-06-12 03:20:00
#>  8     0     0      0           8 2021-06-12 03:20:00 2021-06-12 03:35:00
#>  9     0     0      0           9 2021-06-12 03:45:00 2021-06-12 03:55:00
#> 10     0     0      0          10 2021-06-12 03:55:00 2021-06-12 04:10:00
#> # ... with 20 more rows

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM