简体   繁体   中英

R Identifying pairs in irregular dataset

I need to identify pairs of arrivals and departures. The movement variable identifies ARR and DEP but sometimes among ARR - DEP pairs there are other rows that mess things up. I would like to keep ARR - DEP pairs and discard the rest.

Data looks mostly like this:

  id    time            movement    origin  dest
   1    10/06/2011 15:54    ARR        15    15
   1    10/06/2011 16:14    DEP        15    29
   2    10/06/2011 17:59    ARR        73    73
   2    10/06/2011 18:10    DEP        73    75
   2    10/06/2011 21:10    ARR        75    75
   2    10/06/2011 21:20    DEP        75    73

If you load the data below, you will see "misbehaved" cases at id Id5: Standalone movement without pair to match. Id 6: Extra DEP record (that I would discard) and Id 8: DEP first instead of ARR .

I have tried the following:

dfru$test <- FALSE
dfru$test[which(dfru$movement == "ARR")] <- TRUE

dfru$test[which(dfru$test[-1] =="TRUE")] <- 1 #Doesn't work (ie assigns TRUE or 1 to id 5 and not to the last record of id 4.

If I change the last line for dfru$test[which(dfru$test[-1] =="TRUE" & dfru$movement == "DEP")] <- 1 it doesn't work either to match trips.

Any ideas? Commands/packages that I could use?

Data:

dfru <- structure(list(time = structure(c(7L, 16L, 8L, 11L, 18L, 20L, 
10L, 12L, 3L, 6L, 15L, 19L, 9L, 4L, 5L, 14L, 1L, 2L, 13L, 17L
), .Label = c("10/06/2011 09:08", "10/06/2011 10:54", "10/06/2011 11:38", 
"10/06/2011 12:41", "10/06/2011 12:54", "10/06/2011 14:26", "10/06/2011 14:33", 
"10/06/2011 14:59", "10/06/2011 17:12", "10/06/2011 17:14", "10/06/2011 17:23", 
"10/06/2011 18:56", "10/06/2011 19:03", "10/06/2011 19:04", "10/06/2011 19:16", 
"10/06/2011 19:24", "10/06/2011 20:12", "10/06/2011 21:10", "10/06/2011 22:28", 
"10/06/2011 23:40"), class = "factor"), movement = structure(c(1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 1L, 2L, 2L, 1L, 
2L, 2L, 3L), .Label = c("ARR", "DEP", "ITZ"), class = "factor"), 
    origin = c(15L, 15L, 73L, 73L, 75L, 75L, 17L, 17L, 49L, 49L, 
    15L, 15L, 32L, 10L, 10L, 17L, 76L, 76L, 76L, 76L), dest = c(15L, 
    29L, 73L, 75L, 75L, 73L, 17L, 48L, 49L, 15L, 15L, 49L, 32L, 
    10L, 17L, 10L, 76L, 65L, 76L, 65L), id = c(1L, 1L, 2L, 2L, 
    2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 6L, 6L, 6L, 7L, 7L, 8L, 
    8L)), .Names = c("time", "movement", "origin", "dest", "id"
), row.names = c(NA, -20L), class = c("data.table", "data.frame"
))

This produces the desired result in your question and may be a bit simpler.

library(data.table)
codes <- c(ARR=1,DEP=-1,ITZ=0)
dfru[,keep:=ifelse(abs(c(2,diff(codes[movement])))==2,TRUE,FALSE),by=id]
dfru[!(movement %in% c("ARR","DEP")),keep:=FALSE]
# result <- dfru[(keep)]  # remove rows flagged for deletion...
dfru
#                 time movement origin dest id  keep
#  1: 10/06/2011 14:33      ARR     15   15  1  TRUE
#  2: 10/06/2011 19:24      DEP     15   29  1  TRUE
#  3: 10/06/2011 14:59      ARR     73   73  2  TRUE
#  4: 10/06/2011 17:23      DEP     73   75  2  TRUE
#  5: 10/06/2011 21:10      ARR     75   75  2  TRUE
#  6: 10/06/2011 23:40      DEP     75   73  2  TRUE
#  7: 10/06/2011 17:14      ARR     17   17  3  TRUE
#  8: 10/06/2011 18:56      DEP     17   48  3  TRUE
#  9: 10/06/2011 11:38      ARR     49   49  4  TRUE
# 10: 10/06/2011 14:26      DEP     49   15  4  TRUE
# 11: 10/06/2011 19:16      ARR     15   15  4  TRUE
# 12: 10/06/2011 22:28      DEP     15   49  4  TRUE
# 13: 10/06/2011 17:12      ITZ     32   32  5 FALSE
# 14: 10/06/2011 12:41      ARR     10   10  6  TRUE
# 15: 10/06/2011 12:54      DEP     10   17  6  TRUE
# 16: 10/06/2011 19:04      DEP     17   10  6 FALSE
# 17: 10/06/2011 09:08      ARR     76   76  7  TRUE
# 18: 10/06/2011 10:54      DEP     76   65  7  TRUE
# 19: 10/06/2011 19:03      DEP     76   76  8  TRUE
# 20: 10/06/2011 20:12      ITZ     76   65  8 FALSE

This approach uses diff(...) on coded movement (ARR=1, DEP=-1, ITZ=0) to produce a vector that will be either 2 or -2 if ARR is followed by DEP. If there is an extra DEP that element will be 0 and should be flagged for deletion. Then we flag any element that is not ARR or DEP for deletion. Then, optionally, we delete the flagged rows.

Your question is still a bit vague as there are many possibilities, in principle, not accounted for. For instance, what should be done if an id sequence starts with DEP? What if there is a ARR only (no DEPs)?

Here I define some helper functions to get the job done

gapply<-function(x, y, f) unsplit(lapply(split(x, y), f), y)
markpair<-function(x) {
    arr <- cumsum(x$movement=="ARR")
    dep <- gapply(x$movement, arr, function(x) x=="DEP" & cumsum(x=="DEP")==1)
    dep <- dep $ (arr<0)
    hasdep <- gapply(dep, arr, any) 
    arr <- x$movement=="ARR" & hasdep
    cbind(x, keep = dep | arr)
}
gapply(dfru, dfru$id, markpair)

which returns

               time movement origin dest id  keep
1  10/06/2011 14:33      ARR     15   15  1  TRUE
2  10/06/2011 19:24      DEP     15   29  1  TRUE
3  10/06/2011 14:59      ARR     73   73  2  TRUE
4  10/06/2011 17:23      DEP     73   75  2  TRUE
5  10/06/2011 21:10      ARR     75   75  2  TRUE
6  10/06/2011 23:40      DEP     75   73  2  TRUE
7  10/06/2011 17:14      ARR     17   17  3  TRUE
8  10/06/2011 18:56      DEP     17   48  3  TRUE
9  10/06/2011 11:38      ARR     49   49  4  TRUE
10 10/06/2011 14:26      DEP     49   15  4  TRUE
11 10/06/2011 19:16      ARR     15   15  4  TRUE
12 10/06/2011 22:28      DEP     15   49  4  TRUE
13 10/06/2011 17:12      ITZ     32   32  5 FALSE
14 10/06/2011 12:41      ARR     10   10  6  TRUE
15 10/06/2011 12:54      DEP     10   17  6  TRUE
16 10/06/2011 19:04      DEP     17   10  6 FALSE
17 10/06/2011 09:08      ARR     76   76  7  TRUE
18 10/06/2011 10:54      DEP     76   65  7  TRUE
19 10/06/2011 19:03      DEP     76   76  8 FALSE
20 10/06/2011 20:12      ITZ     76   65  8 FALSE

which seems to mark the bad rows based on your description

Here's an attempt at the answer. If it doesn't answer the question, hopefully it provides an approach to get to the answer you are looking for. The algorithm is to 1) split dfru by id.
2) For each id, 2a) determine the arr and dep rows.
2b) Find the match between arr$dest and dep$origin. 2c) Return a list containing the matching dep and arr

forEachID<- function(id) {
  # print(id)  
  id_arr <- which(id$movement=='ARR')
  id_dep <- which(id$movement=='DEP')
  arr_dest <- id[id_arr,'dest']
  dep_origin <- id[id_dep,'origin']
  # print(arr_dest)
  # print(dep_origin)
  m<-match(arr_dest, dep_origin)
  # print(m)
  tMatch<-NULL
  if (length(m)>0) {
    arr <- id[id_arr[m],]
    dep <- id[id_dep[m],]
    tMatch<-list(arr=arr, dep=dep)
  }
}
paths <- by(dfru,dfru$id,forEachID)
print(paths)

Based on the comment, here's an updated answer to match based on time sequence

matchByDestOrigin <- function(id,id_arr,id_dep) {
  m<-match(arr_dest, dep_origin)
  tMatch<-NULL  
  if (length(m)>0) {
    arr <- id[id_arr[m],]
    dep <- id[id_dep[m],]
    tMatch<-list(arr=arr, dep=dep)
  }
}
matchByDestOrigin <- function(id,id_arr,id_dep) {
  tarr <- id[id_arr,]
  tarr <- tarr[order(tarr$time),]
  tdep <- id[id_dep,]
  tdep <- tdep[order(tdep$time),]
  nrows <- min(nrow(tarr),nrow(tdep))
  tMatch <- NULL
  if (nrows>0) {
    arr <- tarr[nrows,]
    dep <- tdep[nrows,]
    tMatch<-list(arr=arr, dep=dep)
  }
}


forEachIDMatchSequence<- function(id) {
  # print(id)  
  id_arr <- which(id$movement=='ARR')
  id_dep <- which(id$movement=='DEP')
  return(matchByDestOrigin(id,id_arr,id_dep))
}
forEachIDMatchDestOrigin<- function(id) {
  # print(id)  
  id_arr <- which(id$movement=='ARR')
  id_dep <- which(id$movement=='DEP')
  return(matchByTimeSequence(id,id_arr,id_dep))
}
destOriginPaths <- by(dfru,dfru$id,forEachIDMatchDestOrigin)
print(destOriginPaths)
seqPaths <- by(dfru,dfru$id,forEachIDMatchSequence)
print(seqPaths)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM