[英]Create a startime and endtime column from one timestamp column in R
我正在尝试确定事件(集群,1 或 2)何时开始和停止,因此我可以使用 timeDiff 并将 latlon 数据与这些停止/开始日期的天气数据相关联。 现在排除位置数据,我的数据看起来像这样(8000 多行,时间戳间隔 15 分钟,27 个唯一 ID,簇在 ID 内变化 50 多次):
dput(head(mydata,80))structure(list(id = c(43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L, 43486L,
43486L, 43486L, 43486L), cluster = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), idswitch = c("43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0",
"43486 2 0", "43486 2 0", "43486 2 0", "43486 2 0", "43486 1 1",
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2",
"43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2", "43486 2 2",
"43486 2 2", "43486 2 2", "43486 2 2"), t_ = structure(c(1661663122,
1661664035, 1661664948, 1661665900, 1661666812, 1661667724, 1661668637,
1661669550, 1661670463, 1661671376, 1661672289, 1661673202, 1661674114,
1661675026, 1661675939, 1661676852, 1661677764, 1661678676, 1661679588,
1661680501, 1661681414, 1661682326, 1661683239, 1661684151, 1661685063,
1661685975, 1661686888, 1661687800, 1661688712, 1661689624, 1661690536,
1661691449, 1661692362, 1661693275, 1661694187, 1661695099, 1661696011,
1661696923, 1661697835, 1661698747, 1661699659, 1661700571, 1661701483,
1661702976, 1661703888, 1661704800, 1661705714, 1661706626, 1661707541,
1661708454, 1661709366, 1661710279, 1661711191, 1661712104, 1661713017,
1661713930, 1661714842, 1661715755, 1661716669, 1661717583, 1661718496,
1661719410, 1661720324, 1661721236, 1661722149, 1661723061, 1661723973,
1661724885, 1661725798, 1661726713, 1661727629, 1661728542, 1661729454,
1661730370, 1661731283, 1661732196, 1661733109, 1661734022, 1661734934,
1661735848), tzone = "", class = c("POSIXct", "POSIXt")), mode.switches = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)), row.names = c(NA,
-80L), class = c("tbl_df", "tbl", "data.frame"))
这是一个(稍微修改过的)块来说明我的问题:
id cluster idswitch t_ mode.switches
<int> <fct> <chr> <dttm> <dbl>
1 43486 2 43486 2 0 2022-08-28 00:05:22 0
2 43486 2 43486 2 0 2022-08-28 00:20:35 0
3 43486 2 43486 2 0 2022-08-28 00:35:48 0
4 43486 2 43486 2 0 2022-08-28 00:51:40 0
5 43486 2 43486 2 0 2022-08-28 01:06:52 0
6 43486 1 43486 2 0 2022-08-28 01:22:04 0
我想要一个如下所示的 output:
idcluster StartTime EndTime
\<chr\> \<dttm\> \<dttm\>
1 43486 2 2022-08-28 00:05:22 2022-08-28 01:22:04
2 43486 1 2022-08-28 01:22:04 2022-08-28 01:37:17
3 43486 2 2022-08-28 01:37:17 2022-08-28 02:07:43
4 43486 1 2022-08-28 02:07:43 2022-08-31 19:29:55
事件 2 在事件 1 开始的同时结束。
我使用 paste 将 id 和 cluster 归为一列,并创建了一个新的 col,mode.switches,它总结了每个 id 事件变化的次数,认为这可能有帮助(剧透,事实并非如此)。 我的一些尝试如下:
mydata <- mydata %>%
group_by(id) %>%
mutate(mode.switches = cumsum(c(0,as.numeric(diff(cluster))!=0))) %>%
ungroup()
#First try---------
test <- all %>%
group_by(idswitch) %>%
summarise(
first = min(t_),
last = max(t_)
)
#This is not what I want. An event stops and starts at the same time, and start/end times are wrong. Events need to start when the event before them stops.
idswitch first last
<chr> <dttm> <dttm>
1 43486 1 1 2022-08-28 16:59:33 2022-08-28 16:59:33
2 43486 1 11 2022-08-30 14:12:25 2022-08-30 20:47:56
3 43486 1 13 2022-08-30 21:18:20 2022-08-30 23:04:47
4 43486 1 15 2022-08-31 14:40:47 2022-08-31 19:29:55
5 43486 1 17 2022-08-31 20:00:22 2022-08-31 21:46:53
#lead/lag - it still respects idcluster so won't use t_ +1 or -1 of another idcluster, but I need the times related to id and cluster....---------
test2 <- mydata %>%
group_by(idcluster) %>%
mutate(start=t_, stop=lead(t_, n=1))
sum(is.na(test$stop)) #606 NAs :(
#same problems------------------
test7<- all %>%
filter(cluster == "1" | lag(cluster) == "1") %>%
group_by(mode.switches, id, cluster, gr = cumsum(lag(cluster != "1", default = TRUE))) %>%
summarise(StartTime = first(t_),
EndTime = last(t_)) %>%
mutate(EndTime = replace(EndTime, StartTime == EndTime, NA)) %>%
dplyr::select(-gr)
sum(is.na(test7$EndTime)) #378 NAs, lame.
存在很多类似的问题,但似乎没有一个能解决我的问题: Identify start and stop sequences and length of run in R Create start and end time column from a single datetime column in R R Start Column & End Column Time Log Obtaining the start and单列数据的结束日期如何使用lubridate从R的大型数据集中的单列中提取开始和结束日期? R 尝试从单列创建开始和停止时间根据 R 中的多个条件创建开始和结束时间列(dplyr、lubridate)
感谢您添加数据。
您可以先group_by
id
。 然后, filter
并保留给定id
的第一行,或者cluster
发生变化的地方。
然后,您可以添加一个end
列,它是时间的后续行lead
值。
library(dplyr)
df %>%
group_by(id) %>%
filter(row_number() == 1 | cluster != lag(cluster)) %>%
mutate(end = lead(t_))
Output
id cluster idswitch t_ mode.switches end
<int> <fct> <chr> <dttm> <dbl> <dttm>
1 43486 2 43486 2 0 2022-08-28 01:05:22 0 2022-08-28 17:59:33
2 43486 1 43486 1 1 2022-08-28 17:59:33 1 2022-08-28 18:14:45
3 43486 2 43486 2 2 2022-08-28 18:14:45 2 NA
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.