简体   繁体   中英

How do I select values from a lookup table based on their time stamp?

I have a data frame called dayfile containing the following variables:

Period   SubCode            EchoTime  PosX  PosY  PosZ
1   8753      13 2016-06-19 00:06:27 16.81 39.21 12.66
2   8753      13 2016-06-19 00:06:35 16.67 39.08 12.66
3   8753      13 2016-06-19 00:06:36 16.33 39.60 13.03
4   8753      13 2016-06-19 00:06:45 17.14 38.14 12.23
5   8753      13 2016-06-19 00:06:53 16.95 38.21 12.38
6   8753      13 2016-06-19 00:06:53 17.44 37.67 11.95

And I have another data frame called probe.DOT1 that looks like this:

  DO.time.1m           DO.1m Temp.1m
1 2016-06-18 10:24:50  7.69   18.04
2 2016-06-18 11:24:50  7.54   19.12
3 2016-06-18 11:54:50  7.57   18.98
4 2016-06-18 12:24:50  9.51   19.88
5 2016-06-18 12:54:50  9.30   11.62
6 2016-06-18 13:24:50  8.81   11.54

I want to create a new column in dayfile called O2 from probe.DOT1$DO.1m where dayfile$EchoTime is within the time range between two consecutive values of probe.DOT1$Do.time.1m , ie it should look something like this:

Period   SubCode            EchoTime  PosX  PosY  PosZ  O2
1   8753      13 2016-06-19 00:06:27 16.81 39.21 12.66  7.54
2   8753      13 2016-06-19 00:06:35 16.67 39.08 12.66  7.54
3   8753      13 2016-06-19 00:06:36 16.33 39.60 13.03  7.59
4   8753      13 2016-06-19 00:06:45 17.14 38.14 12.23  7.59
5   8753      13 2016-06-19 00:06:53 16.95 38.21 12.38  7.59
6   8753      13 2016-06-19 00:06:53 17.44 37.67 11.95  7.59

I have tried this:

dayfile$O2 <- probe.DOT1[dayfile$EchoTime < probe.DOT1$DO.time.1m &
dayfile$EchoTime > diff(probe.DOT1$DO.time.1m, lag = 1) , 'DO.1m']

But it gives me errors. Does anyone have any suggestions?

Thanks.

EDIT

Here is some dput so you can recreate small portions of my files:

dayfile :

structure(list(Period = c(7017, 7017, 7017, 7017, 7017, 7017, 
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 
7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 7017, 
7017, 7017), SubCode = c(20, 20, 20, 20, 20, 20, 20, 20, 20, 
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
), EchoTime = structure(c(1466249003, 1466249010, 1466249017, 
1466249025, 1466249032, 1466249039, 1466249046, 1466249053, 1466249060, 
1466249067, 1466249074, 1466249081, 1466249088, 1466249095, 1466249102, 
1466249109, 1466249116, 1466249123, 1466249130, 1466249137, 1466249144, 
1466249151, 1466249158, 1466249165, 1466249172, 1466249179, 1466249186, 
1466249193, 1466249200, 1466249207, 1466249214, 1466249221, 1466249228, 
1466249235, 1466249242, 1466249249, 1466249256, 1466249263, 1466249270, 
1466249277, 1466249284), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    PosX = c(14.69, 14.69, 14.69, 16.31, 14.94, 14.94, 14.85, 
    14.73, 16.64, 16.65, 16.7, 16.36, 16.38, 16.37, 16.41, 16.39, 
    16.12, 15.8, 16.33, 16.17, 16.32, 15.61, 15.5, 15.82, 16.68, 
    16.68, 16.68, 16.61, 16.68, 16.68, 11.39, 11.39, 16.68, 16.6, 
    16.6, 15.08, 15.08, 14.9, 14.81, 15.08, 15.15), PosY = c(36.98, 
    36.98, 36.98, 37.68, 36.9, 36.9, 36.96, 37.09, 37.19, 37.19, 
    37.01, 37.49, 37.47, 37.47, 37.45, 37.46, 37.75, 38.08, 38.06, 
    38.49, 38.06, 38.54, 38.39, 38.06, 37.16, 37.16, 37.16, 37.36, 
    37.16, 37.16, 40.09, 40.09, 37.16, 37.37, 37.37, 36.79, 36.79, 
    36.98, 36.94, 36.79, 36.73), PosZ = c(14.68, 14.68, 14.68, 
    15.67, 15.03, 15.03, 14.66, 14.88, 15.12, 15.22, 14.84, 15.28, 
    15.58, 15.48, 15.88, 15.68, 16.05, 15.91, 15.37, 15.64, 15.27, 
    16.1, 16.3, 16.33, 15.61, 15.61, 15.61, 15.8, 15.61, 15.61, 
    18.06, 18.06, 15.61, 15.7, 15.7, 15.13, 15.13, 15.3, 15.38, 
    15.13, 15.19)), .Names = c("Period", "SubCode", "EchoTime", 
"PosX", "PosY", "PosZ"), row.names = c(213387L, 213389L, 213391L, 
213393L, 213395L, 213397L, 213399L, 213401L, 213403L, 213405L, 
213407L, 213409L, 213411L, 213413L, 213415L, 213417L, 213419L, 
213421L, 213423L, 213425L, 213427L, 213429L, 213431L, 213433L, 
213435L, 213437L, 213439L, 213441L, 213443L, 213445L, 213447L, 
213449L, 213450L, 213452L, 213454L, 213456L, 213458L, 213460L, 
213462L, 213464L, 213466L), class = "data.frame")

probe.DOT1 :

structure(list(DO.time.1m = structure(c(1466245490, 1466249090, 
1466250890, 1466252690, 1466254490, 1466256290, 1466258090, 1466259890, 
1466261690, 1466263490, 1466265290, 1466267090, 1466268890, 1466270690, 
1466272490, 1466274290, 1466276090, 1466277890, 1466279690, 1466281490
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), DO.1m = c(7.69, 
7.54, 7.57, 9.51, 9.3, 8.81, 8.71, 8.66, 8.19, 8.52, 8.79, 9.01, 
9.19, 9.39, 9.79, 9.82, 9.48, 9.5, 9.29, 9.38), Temp.1m = c(18.04, 
19.12, 18.98, 19.88, 11.62, 11.54, 12.24, 12.22, 12.32, 12.2, 
12.18, 12.16, 12.14, 11.8, 11.62, 11.68, 11.52, 11.32, 11.34, 
11.14)), .Names = c("DO.time.1m", "DO.1m", "Temp.1m"), row.names = c(NA, 
20L), class = "data.frame")

I would use a double loop for this

dayfile$O2 <- NA
for (i in 1:nrow(dayfile)) {
  for (j in 2:nrow(probe.DOT1)) {
    # between previous do time and before current do time
    if (dayfile$EchoTime[i] > probe.DOT1$DO.time.1m[j - 1] & dayfile$EchoTime[i] < probe.DOT1$DO.time.1m[j]) {
      dayfile$O2[i] <- probe.DOT1$DO.1m[j]
    }
  }
}

Ah man... fuzzyjoin almost can solve this (which would be infinitely "eleganter"), but o well.

Here's my pipeline to do this w/out loops... but the dput you gave us is a bit crappy in that there was only 1 value properly between ranges... but let's see how it does on your real data.

## First create a df that contains all the timestamp info:
all <- data.frame(ts = unique(unlist(c(dayfile$EchoTime,probe.DOT1$DO.time.1m))))

## Now join both tables to this master record, fill in the blanks in 
## Period,SubCode,PosX,Y,Z, remove Temp.1m, remove useless rows, rename O2
library(dplyr)
library(tidyr)
all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) %>%
  arrange(ts) %>%
  fill(Period,SubCode,PosX,PosY,PosZ) %>%
  select(-Temp.1m) %>%
  filter(!is.na(DO.1m)) %>%
  rename(O2=DO.1m)

# ts Period SubCode  PosX  PosY  PosZ   O2
# 1  2016-06-18 10:24:50     NA      NA    NA    NA    NA 7.69
# 2  2016-06-18 11:24:50   7017      20 16.38 37.47 15.58 7.54
# 3  2016-06-18 11:54:50   7017      20 15.15 36.73 15.19 7.57
# 4  2016-06-18 12:24:50   7017      20 15.15 36.73 15.19 9.51
# 5  2016-06-18 12:54:50   7017      20 15.15 36.73 15.19 9.30
# 6  2016-06-18 13:24:50   7017      20 15.15 36.73 15.19 8.81
# 7  2016-06-18 13:54:50   7017      20 15.15 36.73 15.19 8.71
# 8  2016-06-18 14:24:50   7017      20 15.15 36.73 15.19 8.66
# 9  2016-06-18 14:54:50   7017      20 15.15 36.73 15.19 8.19
# 10 2016-06-18 15:24:50   7017      20 15.15 36.73 15.19 8.52
# 11 2016-06-18 15:54:50   7017      20 15.15 36.73 15.19 8.79
# 12 2016-06-18 16:24:50   7017      20 15.15 36.73 15.19 9.01
# 13 2016-06-18 16:54:50   7017      20 15.15 36.73 15.19 9.19
# 14 2016-06-18 17:24:50   7017      20 15.15 36.73 15.19 9.39
# 15 2016-06-18 17:54:50   7017      20 15.15 36.73 15.19 9.79
# 16 2016-06-18 18:24:50   7017      20 15.15 36.73 15.19 9.82
# 17 2016-06-18 18:54:50   7017      20 15.15 36.73 15.19 9.48
# 18 2016-06-18 19:24:50   7017      20 15.15 36.73 15.19 9.50
# 19 2016-06-18 19:54:50   7017      20 15.15 36.73 15.19 9.29
# 20 2016-06-18 20:24:50   7017      20 15.15 36.73 15.19 9.38

EDIT:

To see the whole dataset with all the data together, use:

all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) 

In R, you can (and should) run every row one at a time...just to see what each row does. Do this by selecting "blocks of text"... from all to whatever. See how running what I have above shows you all the steps so far? You can do that with every other line in the pipeline too... so just select the block everything including the fill, then run:

all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) %>%
  arrange(ts) %>%
  fill(Period,SubCode,PosX,PosY,PosZ)

Here is the solution to my problem based on the code from @Amit

all <- data.frame(ts = unique(unlist(c(dayfile$EchoTime, probe.DOT1$DO.time.1m))))

library(dplyr)
library(tidyr)
all <- all %>%
  left_join(dayfile,   by=c("ts"="EchoTime")) %>%
  left_join(probe.DOT1,by=c("ts" = "DO.time.1m")) %>%
  arrange(ts) %>%
  fill(DO.1m) %>%
  select(-Temp.1m) %>%
  filter(!is.na(PosX)) %>%
  rename(O2=DO.1m) %>%
  arrange(Period, ts) 

dayfile$O2 <- all$O2

I changed @Amit's code slightly by filling Do.1m instead of the dayfile columns and then filtering out NAs from PosX instead of DO.1m . It works a treat!

Here is the output:

       Period SubCode            EchoTime  PosX  PosY  PosZ   O2
213387   7017      20 2016-06-18 11:23:23 14.69 36.98 14.68 7.69
213389   7017      20 2016-06-18 11:23:30 14.69 36.98 14.68 7.69
213391   7017      20 2016-06-18 11:23:37 14.69 36.98 14.68 7.69
213393   7017      20 2016-06-18 11:23:45 16.31 37.68 15.67 7.69
213395   7017      20 2016-06-18 11:23:52 14.94 36.90 15.03 7.69
213397   7017      20 2016-06-18 11:23:59 14.94 36.90 15.03 7.69
213399   7017      20 2016-06-18 11:24:06 14.85 36.96 14.66 7.69
213401   7017      20 2016-06-18 11:24:13 14.73 37.09 14.88 7.69
213403   7017      20 2016-06-18 11:24:20 16.64 37.19 15.12 7.69
213405   7017      20 2016-06-18 11:24:27 16.65 37.19 15.22 7.69
213407   7017      20 2016-06-18 11:24:34 16.70 37.01 14.84 7.69
213409   7017      20 2016-06-18 11:24:41 16.36 37.49 15.28 7.69
213411   7017      20 2016-06-18 11:24:48 16.38 37.47 15.58 7.69
213413   7017      20 2016-06-18 11:24:55 16.37 37.47 15.48 7.54
213415   7017      20 2016-06-18 11:25:02 16.41 37.45 15.88 7.54
213417   7017      20 2016-06-18 11:25:09 16.39 37.46 15.68 7.54
213419   7017      20 2016-06-18 11:25:16 16.12 37.75 16.05 7.54
213421   7017      20 2016-06-18 11:25:23 15.80 38.08 15.91 7.54
213423   7017      20 2016-06-18 11:25:30 16.33 38.06 15.37 7.54
213425   7017      20 2016-06-18 11:25:37 16.17 38.49 15.64 7.54

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM