简体   繁体   中英

Rolling windows and selfjoin using data.table

I have a timeseries dataset that requires a self-join to check max number of "days past due" (DPD), 12 months from the date of observation for each customer.

Sample data shown below

  > data
   Cust_ID   Obs_Date DPD
1        1 2014-01-31  37
2        1 2014-02-28  79
3        1 2014-03-31 101
4        1 2014-04-30  90
5        1 2014-05-31  50
6        1 2014-06-30  80
7        1 2014-07-31  37
8        1 2014-08-31  15
9        1 2014-09-30  84
10       1 2014-10-31  45
11       1 2014-11-30  78
12       1 2014-12-31  73
13       1 2015-01-31  82
14       1 2015-02-28  83
15       1 2015-03-31  78
16       1 2015-04-30  88
17       1 2015-05-31  77
18       1 2015-06-30 101
19       1 2014-01-31  40
20       1 2014-02-28  82
21       1 2014-03-31 110
22       1 2014-04-30  90
23       1 2014-05-31  50
24       1 2014-06-30  80
25       1 2014-07-31  37
26       1 2014-08-31  15
27       1 2014-09-30  84
28       1 2014-10-31  45
29       1 2014-11-30  78
30       1 2014-12-31  73
31       1 2015-01-31  82
32       1 2015-02-28  83
33       1 2015-03-31  78
34       1 2015-04-30  78
35       1 2015-05-31  78
36       1 2015-06-30  78
37       2 2014-01-31  74
38       2 2014-02-28  74
    data<-data.frame(stringsAsFactors=FALSE,
     Cust_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
                 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
                 3L, 3L, 3L, 3L, 3L),

    Obs_Date = c("31-Jan-14", "28-Feb-14", "31-Mar-14", "30-Apr-14",
                 "31-May-14", "30-Jun-14", "31-Jul-14", "31-Aug-14",
                 "30-Sep-14", "31-Oct-14", "30-Nov-14", "31-Dec-14", "31-Jan-15",
                 "28-Feb-15", "31-Mar-15", "30-Apr-15", "31-May-15", "30-Jun-15",
                 "31-Jan-14", "28-Feb-14", "31-Mar-14", "30-Apr-14", "31-May-14",
                 "30-Jun-14", "31-Jul-14", "31-Aug-14", "30-Sep-14", "31-Oct-14",
                 "30-Nov-14", "31-Dec-14", "31-Jan-15", "28-Feb-15", "31-Mar-15",
                 "30-Apr-15", "31-May-15", "30-Jun-15", "31-Jan-14", "28-Feb-14",
                 "31-Mar-14", "30-Apr-14", "31-May-14", "30-Jun-14", "31-Jul-14",
                 "31-Aug-14", "30-Sep-14", "31-Oct-14", "30-Nov-14", "31-Dec-14",
                 "31-Jan-15", "28-Feb-15", "31-Mar-15", "30-Apr-15", "31-May-15",
                 "30-Jun-15", "31-Jan-14", "28-Feb-14", "31-Mar-14", "30-Apr-14",
                 "31-May-14", "30-Jun-14", "31-Jul-14", "31-Aug-14", "30-Sep-14",
                 "31-Oct-14", "30-Nov-14", "31-Dec-14", "31-Jan-15",
                 "28-Feb-15", "31-Mar-15", "30-Apr-15", "31-May-15", "30-Jun-15",
                 "31-Jan-14", "28-Feb-14", "31-Mar-14", "30-Apr-14", "31-May-14",
                 "30-Jun-14", "31-Jul-14", "31-Aug-14", "30-Sep-14", "31-Oct-14",
                 "30-Nov-14", "31-Dec-14", "31-Jan-15", "28-Feb-15", "31-Mar-15",
                 "30-Apr-15", "31-May-15", "30-Jun-15", "31-Jan-14", "28-Feb-14",
                 "31-Mar-14", "30-Apr-14", "31-May-14", "30-Jun-14", "31-Jul-14",
                 "31-Aug-14", "30-Sep-14"),
     DPD = c(37L, 79L, 101L, 90L, 50L, 80L, 37L, 15L, 84L, 45L, 78L, 73L,
                 82L, 83L, 78L, 88L, 77L, 101L, 40L, 82L, 110L, 90L, 50L, 80L,
                 37L, 15L, 84L, 45L, 78L, 73L, 82L, 83L, 78L, 78L, 78L, 78L, 74L,
                 74L, 70L, 96L, 73L, 88L, 86L, 94L, 90L, 99L, 71L, 100L, 93L,
                 94L, 90L, 85L, 96L, 85L, 77L, 97L, 100L, 80L, 94L, 72L, 92L, 98L,
                 97L, 78L, 79L, 74L, 83L, 94L, 78L, 99L, 81L, 83L, 68L, 65L, 67L,
                 38L, 111L, 66L, 62L, 74L, 93L, 48L, 66L, 64L, 102L, 86L, 48L,
                 54L, 53L, 61L, 114L, 33L, 93L, 53L, 75L, 59L, 120L, 78L, 31L)
)
data<-data %>% mutate(Obs_Date = dmy(Obs_Date))

Dplyr solution to the problem is shown below

max_dpd_data<-data %>%
  left_join(data,data,by="Cust_ID") %>% 
  filter(Obs_Date.y > Obs_Date.x & Obs_Date.y<=(Obs_Date.x %m+% months(12))) %>%
  group_by(Cust_ID, Obs_Date.x) %>% summarise(Max_DPD_12_M = max(DPD.y)) %>% 
  rename(Obs_Date='Obs_Date.x')

but the data set I am using has 15Million rows, and I am wondering if there is a more efficient data.table solution to the problem. Desired solution is shown below

max_dpd_data
# A tibble: 51 x 3
# Groups:   Cust_ID [3]
   Cust_ID Obs_Date   Max_DPD_12_M
     <int> <date>            <dbl>
 1       1 2014-01-31          110
 2       1 2014-02-28          110
 3       1 2014-03-31           90
 4       1 2014-04-30           88
 5       1 2014-05-31           88
 6       1 2014-06-30          101
 7       1 2014-07-31          101
 8       1 2014-08-31          101
 9       1 2014-09-30          101
10       1 2014-10-31          101
# ... with 41 more rows

Here's a data.table approach which will add a new column with the max_dpd :

library(data.table)
setDT(data)
data[, max_date := Obs_Date %m+% months(12)]
data[data, on = .(Cust_ID, Obs_Date >= Obs_Date , Obs_Date <= max_date), 
     max_dpd := max(DPD), by = .EACHI]

head(data)
#   Cust_ID   Obs_Date DPD   max_date max_dpd
#1:       1 2014-01-31  37 2015-01-31     110
#2:       1 2014-02-28  79 2015-02-28     110
#3:       1 2014-03-31 101 2015-03-31     110
#4:       1 2014-04-30  90 2015-04-30      90
#5:       1 2014-05-31  50 2015-05-31      88
#6:       1 2014-06-30  80 2015-06-30     101

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM