简体   繁体   中英

R dplyr: dealing with NA values and empty/missing rows when summarizing data by group

Here is my data set:

library(dplyr)
library(lubridate)

# Example data, one row per recorded test:
#   individual - subject identifier
#   date       - reference date, repeated on every row of an individual
#   test       - test result (1 or 0); NA for the row without a test
#   date_test  - date on which the test was taken
#   x          - covariate, repeated on every row of an individual
d <- data.frame(
  individual = rep(c("A", "B", "C"), times = c(1, 2, 5)),
  date = as_date(
    rep(c("2020-02-01", "2020-03-02", "2020-04-01"), times = c(1, 2, 5))
  ),
  test = c(NA, 0, 1, 1, 1, 0, 0, 0),
  # c() with a leading bare NA drops the Date class, so this column holds day
  # counts until it is converted back to Date below.
  date_test = c(
    NA,
    as_date("2020-03-04"), as_date("2020-04-01"),
    seq(as_date("2020-01-01"), as_date("2020-06-01"), length.out = 5)
  ),
  x = rep(c(1.1, 2.1, 4), times = c(1, 2, 5))
)

d <- d %>%
  as_tibble() %>%
  mutate(
    date_test = as_date(as.numeric(date_test)),
    # TRUE when the test was taken before the individual's reference date;
    # NA when there is no test date.
    before = date_test < date
  )


# A tibble: 8 x 6
  individual date        test date_test      x before
  <fct>      <date>     <dbl> <date>     <dbl> <lgl> 
1 A          2020-02-01    NA NA           1.1 NA    
2 B          2020-03-02     0 2020-03-04   2.1 FALSE 
3 B          2020-03-02     1 2020-04-01   2.1 FALSE 
4 C          2020-04-01     1 2020-01-01   4   TRUE  
5 C          2020-04-01     1 2020-02-08   4   TRUE  
6 C          2020-04-01     0 2020-03-17   4   TRUE  
7 C          2020-04-01     0 2020-04-24   4   FALSE 
8 C          2020-04-01     0 2020-06-01   4   FALSE 

I would like to create a new variable (which I call "test_any") that indicates whether there was any test == 1 within each group defined by two variables: "individual" and "before". I thus want to reduce the dataset to one row per combination of "individual" and "before". In addition, when test_any = 1, I would like to keep the last date_test on which test = 1. Finally, I want to keep the "x" variable.

To be clearer, here is the desired result that I want to obtain using dplyr in R:

  individual before test_any  date_test   x
1          A  FALSE        0       <NA> 1.1
2          A   TRUE        0       <NA> 1.1
3          B  FALSE        1 2020-04-01 2.1
4          B   TRUE        0       <NA> 2.1
5          C  FALSE        0       <NA> 4.0
6          C   TRUE        1 2020-02-08 4.0

If I understood you correctly, this should work. I first create a skeleton with all the combinations that should exist (before = TRUE/FALSE for every individual). Then I reduce the data frame to the values you are interested in (the latest test date, if there is any positive test), and finally I join this onto the skeleton.

library(dplyr)
library(lubridate)
library(tidyverse)

# Example data, one row per recorded test (same data as in the question).
d <- data.frame(
  individual = c(rep("A", 1), rep("B", 2), rep("C", 5)),
  date = as_date(
    c(rep("2020-02-01", 1), rep("2020-03-02", 2), rep("2020-04-01", 5))
  ),
  test = c(NA, 0, 1, 1, 1, 0, 0, 0),
  # c() with a leading bare NA drops the Date class, so this column holds day
  # counts until it is converted back to Date below.
  date_test = c(
    NA,
    as_date("2020-03-04"), as_date("2020-04-01"),
    seq(as_date("2020-01-01"), as_date("2020-06-01"), length.out = 5)
  ),
  x = c(1.1, rep(2.1, 2), rep(4, 5))
)
d <- d %>%
  as_tibble() %>%
  mutate(
    date_test = as_date(as.numeric(date_test)),
    before = date_test < date
  )

# Skeleton: one row per individual x before (FALSE/TRUE) combination, each
# carrying the individual's x.
sceleton <- d %>%
  distinct(individual, before, x) %>%
  # Spell out both levels of `before` so the skeleton does not depend on TRUE
  # and FALSE both happening to occur in the data. Rows whose `before` is NA
  # are kept by complete() and serve as a source for filling x below.
  complete(before = c(FALSE, TRUE), individual) %>%
  group_by(individual) %>%
  # Newly created combinations have x = NA; copy it from the individual's
  # existing rows (assumes x is constant within an individual).
  fill(x, .direction = "downup") %>%
  ungroup() %>%
  filter(!is.na(before))

# For every individual x before group with a positive test, keep the row of
# the latest positive test.
only_tests <- d %>%
  filter(test == 1) %>%
  select(individual, before, date_test, test_any = test) %>%
  group_by(individual, before) %>%
  # with_ties = FALSE guarantees a single row per group even when two positive
  # tests share the latest date; duplicates would multiply rows in the join.
  slice_max(order_by = date_test, n = 1, with_ties = FALSE) %>%
  ungroup()

# Attach the positive-test summary to the skeleton; groups without a positive
# test get test_any = 0 and keep date_test = NA.
full <- sceleton %>%
  left_join(only_tests, by = c("before", "individual")) %>%
  replace_na(list(test_any = 0))

Here is another dplyr approach: turn individual and before into factors and then use group_by(.drop = FALSE), which keeps empty factor-level combinations as groups. I assume that x is constant per individual, so we can use mean(x) when summarising by individual.

library(dplyr)
library(lubridate)

# Collapse `d` to one row per individual x before combination, including
# combinations that never occur in the data.
d %>%
  # As factors, every level combination is kept as a group by `.drop = FALSE`,
  # even when it has no rows.
  mutate(across(c(individual, before), as.factor)) %>%
  group_by(individual, before, .drop = FALSE) %>%
  summarise(
    # 1 if the group contains at least one positive test. A missing test
    # result is not a positive test, so such groups get 0 rather than NA.
    test_any = as.numeric(any(test == 1, na.rm = TRUE)),
    # Latest date of a positive test. A scalar if/else evaluates max() only
    # for groups that have one, and both branches are Date so the results
    # combine cleanly. which() drops rows whose test is NA.
    date_test = if (test_any == 1) {
      max(date_test[which(test == 1)], na.rm = TRUE)
    } else {
      as.Date(NA)
    },
    # NaN for empty groups; repaired per individual below.
    x = mean(x)
  ) %>%
  group_by(individual, .drop = FALSE) %>%
  # Spread each individual's x onto the combinations that had no rows
  # (assumes x is constant within an individual).
  mutate(x = mean(x, na.rm = TRUE)) %>%
  ungroup() %>%
  # Rows without a test date have before = NA; they only served to carry x.
  filter(!is.na(before))

#> `summarise()` regrouping output by 'individual' (override with `.groups` argument)
#> # A tibble: 6 x 5
#>   individual before test_any date_test      x
#>   <fct>      <fct>     <dbl> <date>     <dbl>
#> 1 A          FALSE         0 NA           1.1
#> 2 A          TRUE          0 NA           1.1
#> 3 B          FALSE         1 2020-04-01   2.1
#> 4 B          TRUE          0 NA           2.1
#> 5 C          FALSE         0 NA           4  
#> 6 C          TRUE          1 2020-02-08   4

Created on 2021-04-12 by the reprex package (v0.3.0)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM