簡體   English   中英

按R中指定的時間長度擴展時間序列的最快方法

[英]Fastest way to expand a time series by specified time lengths in R

我有兩種活動記錄器的數據。 第一個記錄器記錄記錄器處於潮濕或干燥狀態的秒數 (請參見act1 )。 第二個記錄器每3秒采樣一次濕/干,並每10分鍾記錄一次濕采樣的總數。 給定3秒的采樣間隔,每10分鍾結束時記錄的值的范圍從零(始終干燥)到200(始終潮濕),請參見act2

我想重塑和重采樣來自第一個記錄器的數據,以使用盡可能高效的方法復制第二個記錄器的格式。

我在此處提供的示例使用數據的子樣本(6行),但是我的實際數據集包含一年多的觀察值(40,000多行),目前在3天后仍在運行。

act1 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")

act2 <- structure(list(
Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
class = c("POSIXct", "POSIXt"), tzone = ""),
Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
Wet = c("wet", "dry", "wet", "dry", "wet", "dry")), row.names = c("2", "3", "4", "5", "6", "7"),
class = "data.frame")

使用lapply,我已根據“ 活動”列中指定的間隔在act1數據框中擴展了“ 日期”列(POSIXct格式),並在“ 濕”列中保留了對相應狀態的引用。

act1  <-  lapply(1:nrow(act1),  function(x){
  data.frame(
    Valid = rep(act1[x, 1], act1[x, 3]), 
    Date = strptime(act1[x, 2], format = "%Y-%m-%d%H:%M:%S")+(seq_len(act1[x, 3])-1), 
    Activity = rep(1, act1[x, 3]), 
    Wet = rep(act1[x, 4], act1[x, 3])
  )})
act1 <- as.data.frame(do.call(rbind, act1))

然后,我已使用dplyrlubridate將每個觀察結果分組為3個第二個容器 ,並確定每個容器中的最后一個觀察是否潮濕。 我將其余的濕觀測值分組為10分鍾的箱,並總結了多少個樣本濕。

library(dplyr)
library(lubridate)

act1 <- act1 %>%
  mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>% 
  group_by(interval) %>%
  summarise(Valid = "ok",
            Wet = Wet[which(Date==max(Date))]=="wet") %>%
  mutate(int10 = floor_date(interval, unit="hour") +
           minutes(floor(minute(interval)/10)*10) +
           (min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>% 
  group_by(int10) %>%
  summarise(Valid = "ok",
            Activity = sum(Wet)) %>%
  rename(Date = int10) %>%
  select(Valid,Date,Activity)

我在此處提供的示例使用原始數據集的一個子集(6行),但是我的實際數據集包含超過一年的觀察值(40,000+行),目前經過3天仍在運行!

Vectorization, repcutseq應該在此任務的工具框中。

您可以縮短涉及lapply第一條主要聲明-您只是在嘗試重復行。 例如, act1[c(1,1), ]act1返回act1的第一行。 在循環中,您訪問act1[x, 3] 4次。 下面的這一行將復制我們所需的行數:

act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']]), ]

> nrow(act1_copy)
[1] 6
> seq_len(nrow(act1_copy))
[1] 1 2 3 4 5 6
> act1_copy[['Activity']]
[1] 78  6 39  9 15  9
> rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']])
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [60] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[119] 3 3 3 3 3 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6

# or if you're into external packages, this is a lot nicer looking:
tidyr::uncount(act1_copy, weights = Activity)

下一步是更正秒數,並將“ Activity重做為1

act1a[['Date']] <- act1a[['Date']] + sequence(act1_copy[['Activity']]) - 1
act1a[['Activity']] <- 1L

現在,要使舊的日志記錄數據與更新的日志記錄數據相匹配(即,每3秒記錄一次)的最后一步是對3秒進行分組。 重要的是要注意,每3秒一次等效於每3行一次。 因此,根據對act$Date完成的信心程度,我們可以執行以下兩種方法之一:

act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]

# or if you're sure there is one reading per second, you can just do once every three rows 
act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]

# what cut() looks like for reference
cut(act1a$Date, '3 sec', labels = F)
  [1]  1  1  1  2  2  2  3  3  3  4  4  4  5  5  5  6  6  6  7  7  7  8  8  8  9  9  9 10 10 10 11 11 11 12 12 12 13 13 13
 [40] 14 14 14 15 15 15 16 16 16 17 17 17 18 18 18 19 19 19 20 20 20 21 21 21 22 22 22 23 23 23 24 24 24 25 25 25 26 26 26
 [79] 27 27 27 28 28 28 29 29 29 30 30 30 31 31 31 32 32 32 33 33 33 34 34 34 35 35 35 36 36 36 37 37 37 38 38 38 39 39 39
[118] 40 40 40 41 41 41 42 42 42 43 43 43 44 44 44 45 45 45 46 46 46 47 47 47 48 48 48 49 49 49 50 50 50 51 51 51 52 52 52

#or with labels:
cut(act1a$Date, '3 sec')
  [1] 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:33 2015-03-05 13:11:36 2015-03-05 13:11:36
  [6] 2015-03-05 13:11:36 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:39 2015-03-05 13:11:42
 [11] 2015-03-05 13:11:42 2015-03-05 13:11:42 2015-03-05 13:11:45 2015-03-05 13:11:45 2015-03-05 13:11:45
 [16] 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:48 2015-03-05 13:11:51 2015-03-05 13:11:51
 [21] 2015-03-05 13:11:51 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:54 2015-03-05 13:11:57 
# truncated for brevity.

最后一步是匯總數據。 就像最后一步一樣,我們可以使用cut()對使用時間進行分組,也可以再次使用rep(seq())使分組更快一些。

aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
#or if you know there is one reading per second,
aggregate(act1b$Wet,
          list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
                          , each = 10 * 60 / 3
                          , length.out = nrow(act1b)))
               , FUN = function(x) sum(x == 'wet'))

放在一起,您將獲得:

act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']]), ]
act1a[['Date']] <- act1a[['Date']] + sequence(act1_copy[['Activity']]) - 1
act1a[['Activity']] <- 1L

act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]

aggregate(act1b$Wet,
          list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
                          , each = 10 * 60 / 3
                          , length.out = nrow(act1b)))
               , FUN = function(x) sum(x == 'wet'))

library(tidyr)
library(dplyr)
  tidyr::uncount(act1_copy, weights = Activity)%>%
    mutate(Activity = 1L
           , Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
    slice(seq(from = 1, to = nrow(.), by = 3))%>%
    group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
                           , each = 10 * 60 / 3
                           , length.out = nrow(.)))%>%
    summarize(Wet = sum(Wet == 'wet'))

# A tibble: 1 x 2
  Date                  Wet
  <dttm>              <int>
1 2015-03-05 13:11:33    44

有一些性能和代碼-請注意data.table方式,我在10分鍾的總結中遇到了問題,因此這並不是完全正確的:

Unit: milliseconds
         expr       min        lq      mean    median        uq       max neval
    cole_base  2.044400  2.174451  2.328061  2.253251  2.340801  6.424400   100
   cole_dplyr  3.152901  3.359501  3.502880  3.428101  3.515302  8.248401   100
      cole_dt  3.308601  3.541151  3.884475  3.698201  3.796652 13.155701   100
 original_all 32.626601 33.061152 34.531462 33.392151 34.237601 50.499501   100
library(microbenchmark)
library(data.table)
library(dplyr)
library(tidyr)
library(lubridate)

act1_copy <- structure(list(
  Valid = c("ok", "ok", "ok", "ok", "ok", "ok"),
  Date = structure(c(1425579093, 1425579171, 1425579177, 1425579216, 1425579225, 1425579240),
                   class = c("POSIXct", "POSIXt"), tzone = ""),
  Activity = c(78L, 6L, 39L, 9L, 15L, 9L),
  Wet = c("wet", "dry", "wet", "dry", "wet", "dry")),
  row.names = c("2", "3", "4", "5", "6", "7"),
  class = "data.frame")

dt <- as.data.table(act1_copy)
microbenchmark( cole_base = {
  act1a <- act1_copy[rep(seq_len(nrow(act1_copy)), act1_copy[['Activity']]), ]
  act1a[['Date']] <- act1a[['Date']] + sequence(act1_copy[['Activity']]) - 1
  # if you know there is definitly one reading per second
  # act1a[['Date']] <- act1a[['Date']] + seq_len(nrow(act1a)) - 1
  act1a[['Activity']] <- 1L

  # act1b <- act1a[!duplicated(cut(act1a$Date, '3 sec', labels = F)), ]
  # or if you're sure there is one reading per second, you can just do once every three rows 
  act1b <- act1a[seq(from = 1, to = nrow(act1a), by = 3), ]

  # aggregate(act1b$Wet, list(Date = cut(act1b$Date, '10 min')), FUN = function(x) sum(x == 'wet'))
  #or if you know there is one reading per second,
  aggregate(act1b$Wet,
            list(Date = rep(act1b$Date[seq(from = 1, to = nrow(act1b), by = 10 * 60 / 3)]
                            , each = 10 * 60 / 3
                            , length.out = nrow(act1b)))
                 , FUN = function(x) sum(x == 'wet'))

}
, cole_dplyr = {
  tidyr::uncount(act1_copy, weights = Activity)%>%
    mutate(Activity = 1L
           , Date = Date + sequence(act1_copy[['Activity']]) - 1)%>%
    # filter(!duplicated(cut(Date, '3 sec', labels = F)))%>%
    slice(seq(from = 1, to = nrow(.), by = 3))%>%
    # group_by(Date = cut(Date, '10 min'))%>%
    group_by(Date =rep(Date[seq(from = 1, to = nrow(.), by = 10 * 60 / 3)]
                           , each = 10 * 60 / 3
                           , length.out = nrow(.)))%>%
    summarize(Wet = sum(Wet == 'wet'))

}
, cole_dt = {
  copy(dt)[rep(seq_len(.N), Activity)
           , .(Date = Date + sequence(act1_copy[['Activity']]) - 1
               ,Wet, Valid, Activity = 1L) 
           ][seq(from = 1, to = .N, by = 3)
             , .(Wet = sum(Wet == 'wet'))
             , by = cut(Date, '10 min')]
}
, original_all = {
  act1  <-  lapply(1:nrow(act1_copy),  function(x){
    data.frame(
      Valid = rep(act1_copy[x, 1], act1_copy[x, 3]),
      Date = strptime(act1_copy[x, 2], format = "%Y-%m-%d%H:%M:%S")+(seq_len(act1_copy[x, 3])-1),
      Activity = rep(1, act1_copy[x, 3]),
      Wet = rep(act1_copy[x, 4], act1_copy[x, 3])
    )})
  act1 <- as.data.frame(do.call(rbind, act1))

  act1 <- act1 %>%
    mutate(interval = floor_date(Date, unit="minutes") + seconds(floor(second(Date)/3)*3)) %>%
    group_by(interval) %>%
    summarise(Valid = "ok",
              Wet = Wet[which(Date==max(Date))]=="wet") %>%
    mutate(int10 = floor_date(interval, unit="hour") +
             minutes(floor(minute(interval)/10)*10) +
             (min(interval) - min(floor_date(interval, unit="hour") + minutes(floor(minute(interval)/10)*10)))) %>%
    group_by(int10) %>%
    summarise(Valid = "ok",
              Activity = sum(Wet)) %>%
    rename(Date = int10) %>%
    select(Valid,Date,Activity)
}
)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM