簡體   English   中英

根據具有范圍的另外兩列擴展兩列中的日期

[英]expand dates in two columns based on another two columns with ranges

我有下面 Rcode 中給出的表 1。 此表給出了日期的開始和結束范圍。

我有另一個表,表 2,它給出了日期范圍的外端,應該包含表 1 的開始和結束范圍。

我的決賽桌應該看起來像下面的 Rcode 中給出的那樣。

最終表應具有與表 1 和表 2 中的確切范圍,應進行調整,以便結束日期與后續行和前面的行連續。 換句話說,決賽桌應該有非重疊的間隔。 我一直未能成功解決的復雜日期問題。 希望我已經解釋清楚了。

以下代碼將給出表 1 和表 2。

table1 <- read.table(text="
id  start         end      var1
A   03/15/1992  03/20/1992  1
A   03/24/1992  03/26/1992  2
A   03/28/1992  03/31/1992  5
B   06/06/1994  06/06/1994  1
", header=T, stringsAsFactors=F)

start   <- as.Date(start)
end     <- as.Date(end)
table1      <- data.frame(id,start, end, var1) 
setDT(table1)


table2 <- read.table(text="
id  t1            t2       var2
A   01/01/1992  03/16/1992  3
A   03/17/1992  03/19/1992  4
A   03/20/1992  05/25/1992  6
B   06/06/1994  06/06/1994  8
", header=T, stringsAsFactors=F)

t1   <- as.Date(t1)
t2     <- as.Date(t2)
table2     <- data.frame(id,t1, t2, var2)
setDT(table2)



finaltable <- read.table(text="
id  t1             t2     var1  var2
A   01/01/1992  03/14/1992      3
A   03/15/1992  03/20/1992  1   
A   03/21/1992  03/23/1992      6
A   03/24/1992  03/26/1992  2   
A   03/27/1992  03/27/1992      6
A   03/28/1992  03/31/1992  5   
A   04/01/1992  05/25/1992      6
B   06/06/1994  06/06/1994  1   8
  ", header=T, stringsAsFactors=F)

這是一個使用data.table的選項:

#get first and last rows by id for each table
d1 <- table1[, .SD[c(1L, .N)], id][, ri := rowid(id)][]
d2 <- table2[, .SD[c(1L, .N)], id][, ri := rowid(id)][]

#create the earliest and latest intervals to row bind to original table1
morerows <- d1[d2, on=.(id, ri)][, 
    .(id, start=fifelse(ri==1L, t1, end+1L), end=fifelse(ri==1L, start-1L, t2))][
        start<=end]
DT1 <- rbindlist(list(table1, morerows), use.names=TRUE, fill=TRUE)
setkey(DT1, id, start, end)

#add in missing intervals
ans <- rbindlist(list(DT1, DT1[, .(start=end[-.N]+1L, end=start[-1L]-1L), id]), 
    use.names=TRUE, fill=TRUE)[start<=end]
setkey(ans, id, start, end)

ans[is.na(var1), var2 := table2[.SD, on=.(id, t1<=start, t2>=start), var2]]
ans[is.na(var2), var2 := table2[.SD, on=.(id, t1=start, t2=start), var2]]

output:

   id      start        end var1 var2
1:  A 1992-01-01 1992-03-14   NA    3
2:  A 1992-03-15 1992-03-20    1   NA
3:  A 1992-03-21 1992-03-23   NA    6
4:  A 1992-03-24 1992-03-26    2   NA
5:  A 1992-03-27 1992-03-27   NA    6
6:  A 1992-03-28 1992-03-31    5   NA
7:  A 1992-04-01 1992-05-25   NA    6
8:  B 1994-06-06 1994-06-06    1    8

數據:

library(data.table)
table1 <- fread("id  start         end      var1
A   03/15/1992  03/20/1992  1
A   03/24/1992  03/26/1992  2
A   03/28/1992  03/31/1992  5
B   06/06/1994  06/06/1994  1")
cols <- c("start", "end")
table1[, (cols) := lapply(.SD, as.Date, format="%m/%d/%Y"), .SDcols=cols]

table2 <- fread("id  t1            t2       var2
A   01/01/1992  03/16/1992  3
A   03/17/1992  03/19/1992  4
A   03/20/1992  05/25/1992  6
B   06/06/1994  06/06/1994  8")
cols <- c("t1", "t2")
table2[, (cols) := lapply(.SD, as.Date, format="%m/%d/%Y"), .SDcols=cols]

finaltable <- fread("id  t1             t2     var1  var2
A   01/01/1992  03/14/1992  NA    3
A   03/15/1992  03/20/1992  1   NA
A   03/21/1992  03/23/1992  NA    6
A   03/24/1992  03/26/1992  2   NA
A   03/27/1992  03/27/1992  NA    6
A   03/28/1992  03/31/1992  5   NA
A   04/01/1992  05/25/1992  NA    6
B   06/06/1994  06/06/1994  1   8")
cols <- c("t1", "t2")
finaltable[, (cols) := lapply(.SD, as.Date, format="%m/%d/%Y"), .SDcols=cols]

這是一個可能的解決方案,使用包dplyranytime

library(dplyr)

table1 <- read.table(text="
id  start         end      var1
A   03/15/1992  03/20/1992  1
A   03/24/1992  03/26/1992  2
A   03/28/1992  03/31/1992  5
B   06/06/1994  06/06/1994  1
", header=T, stringsAsFactors=F) %>%
  data.frame()

library(anytime)

table1$t1 <- anydate(table1$start)
table1$t2 <- anydate(table1$end)

table2 <- read.table(text="
id  t1            t2       var2
A   01/01/1992  03/16/1992  3
A   03/17/1992  03/19/1992  4
A   03/20/1992  05/25/1992  6
B   06/06/1994  06/06/1994  8
", header=T, stringsAsFactors=F) %>%
  data.frame()

table2$t1 <- anydate(table2$t1)
table2$t2 <- anydate(table2$t2)

finaltable <- merge(table1, table2, by = c("id", "t1", "t2"), all = T) %>%
  select(-c(start, end))
finaltable
#>   id         t1         t2 var1 var2
#> 1  A 1992-01-01 1992-03-16   NA    3
#> 2  A 1992-03-15 1992-03-20    1   NA
#> 3  A 1992-03-17 1992-03-19   NA    4
#> 4  A 1992-03-20 1992-05-25   NA    6
#> 5  A 1992-03-24 1992-03-26    2   NA
#> 6  A 1992-03-28 1992-03-31    5   NA
#> 7  B 1994-06-06 1994-06-06    1    8

reprex package (v0.3.0) 創建於 2020-07-28

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM