[英]R merging rows in a dataframe
這是一個大數據框的head
head(Hdata_soil)
X_id timestamp address rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1 565846060dd8e408e3817c58 2015-11-27 12:01:10 A8 -65 NA NA NA NA NA
2 565846070dd8e408e3817c59 2015-11-27 12:01:11 A8 NA NA 9.73 -273.15 14.63 647
3 565846cf0dd8e408e3817caf 2015-11-27 12:04:31 A7 -64 NA NA NA NA NA
4 565846cf0dd8e408e3817cb0 2015-11-27 12:04:31 A7 NA NA 8.56 9.46 9.64 660
5 565847650dd8e408e3817cf5 2015-11-27 12:07:01 A8 -64 NA NA NA NA NA
6 565847660dd8e408e3817cf6 2015-11-27 12:07:02 A8 NA NA 9.82 -273.15 14.29 643
可以從Dropbox訪問完整數據集
如您所見,每個address
有2個連續觀察值, timestamps
大約相隔1秒。 變量在這兩個觀察之間分開。 我怎樣才能將它們合並到一行,保留第一個timestamp
?
確保只有來自同一address
連續2次觀察才會發生這種情況。
如果有人能指出我正在使用的包/功能方向,我真的很感激。
查看以下代碼,以滿足您的需求。 首先,將timestamps列轉換為類'POSIXlt'的對象,該對象允許確定單個觀察之間的時間差。 然后我使用foreach
並行循環遍歷所有行,並跳過在前一次迭代期間已經合並到另一行的所有記錄(保存在向量'used'中)。 which
與difftime
結合允許識別連續觀察(例如,在距當前處理的觀察5s內)。 最后(並且僅當候選記錄中存在當前觀察的'地址'時),合並線,用連續觀察的值替換當前處理的線中的缺失值。
## load 'foreach' package
library(foreach)
## import and reformat data
Hdata_soil <- read.csv("Hdata_soil.csv", header = TRUE,
stringsAsFactors = FALSE)
## reformat timestamps
timestamps <- strptime(Hdata_soil$timestamp, format = "%Y-%m-%d %H:%M:%S")
## vector with information about merged lines
used <- integer()
dat_out <- foreach(i = 1:length(timestamps), .combine = "rbind") %do% {
## skip current iteration if line has already been merged into another line
if (i %in% used)
return(NULL)
## identify consecutive observation (<5s)
x <- timestamps[i]
y <- timestamps[(i+1):length(timestamps)]
# (subset same or consecutive days to reduce
# computation time of 'difftime')
id_day <- which(as.Date(y) == as.Date(x) |
as.Date(y) == (as.Date(x) + 1))
y <- y[id_day]
# (subset records within 5s from current observation)
id_sec <- which(difftime(y, x, units = "secs") < 5)
id <- id_day[id_sec]
## if consecutive observation(s) exist(s) and include address of
## current observation, perform merge
if (length(id) > 0 &
any(Hdata_soil[i+id, "address"] == Hdata_soil[i, "address"])) {
for (j in 1:length(id)) {
Hdata_soil_x <- data.frame(Hdata_soil[i, ])
Hdata_soil_y <- data.frame(Hdata_soil[i+id[j], ])
# overwrite all missing values in current line with values
# from consecutive line
Hdata_soil_x[which(is.na(Hdata_soil_x) & !is.na(Hdata_soil_y))] <-
Hdata_soil_y[which(is.na(Hdata_soil_x) & !is.na(Hdata_soil_y))]
# update information about merged lines
used <- c(used, i, i+id[j])
}
# return merged line
return(Hdata_soil_x)
## else return current line as is
} else {
used <- c(used, i)
return(data.frame(Hdata_soil[i, ]))
}
}
但是,代碼需要很長時間才能執行,這似乎與difftime
有關。
> user system elapsed
> 2209.504 99.389 2311.996
我認為以下內容應該有效。 制作唯一地址標簽的向量。 然后,對於每個地址標簽,提取相關行並使用各種函數來選擇所需的行(例如,最小時間戳,非NA的rssi值等)。 使用rbind.data.frame
從最后的列表重建。
unad <- unique(Hdata_soil$address)
lst <- lapply(unad, function(ad){
recs <- Hdata_soil[Hdata_soil$address == ad,]
X_id <- recs$X_id[1]
ts <- min(recs$timestamp)
rssi <- recs$rssi[!is.na(recs$rssi)]
if(length(rssi) == 0L) rssi <- NA else if(length(rssi) >= 2L) rssi <- mean(rssi) # or something - ensure end up with length 1
## remaining observations like rssi
## ...
return(data.frame(X_id = X_id, timestamp = ts, address = ad, rssi = rssi, ...))
})
result <- do.call(rbind.data.frame, lst)
適應評論:
mtx <- matrix(1:nrow(Hdata_soil), nrow(Hdata_soil), 2)
col.names(mtx) <- c("startR", "endR")
# identifies consecutive duplicate addresses and groups together into subsets
for(r in 1:(nrow(mtx) - 1)){
with(Hdata_soil, if(identical(address[r], address[r + 1])){
mtx[r, 2] <- mtx[r, 2] + 1
mtx[r + 1,] <- c(NA, NA)
})
}
#remove nas - essentially noting that duplicate addresses have been grouped
mtx <- mtx[!is.na(mtx[, 1]),]
lst <- lapply(1:nrow(mtx), function(r){
datsubset <- Hdata_soil[mtx[r, "startR"]:mtx[r, "endR"],, drop = FALSE]
# aggregate the subset of rows into one row as you please
})
result <- do.call(rbind.data.frame, lst)
請注意,如果有任何地址連續出現三次,則需要進行一些調整。
首先,我認為您的數據需要一個額外的id列,因為每個行對的address
不是唯一的,也不考慮您想要如何對它們進行分組。 為簡單起見,我將在此定義id
列:
df$id <- as.character(c(1,1,2,2,3,3))
然后我們可以做以下事情
# Replace NA's by 0
df[is.na(df)] <- 0
# Extract numeric columns
tokeep <- which(sapply(df,is.numeric))
# Sum numeric columns per id
setDT(df)[,lapply(.SD,sum),by=id,.SDcols = tokeep]
產量:
id rssi soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1: 1 -65 9.73 -273.15 14.63 647
2: 2 -64 8.56 9.46 9.64 660
3: 3 -64 9.82 -273.15 14.29 643
因此,您可以將其與原始df的其余非數字列合並,選擇要在X_id
和timestamp
刪除的唯一值。
如果您確信將“時間戳”舍入到最近的分鍾將提供與“地址”相關的唯一標識符,則使用dplyr的此解決方案可能有效:
library(readr) # Required only for recreating your data frame
library(dplyr)
Hdata_soil <- readr::read_csv("X_id,timestamp,address,rssi,batt_v,soil_temp_1,soil_temp_2,soil_temp_3,soil_moisture_1
565846060dd8e408e3817c58,27/11/2015 12:01:10,A8,-65,NA,NA,NA,NA,NA
565846070dd8e408e3817c59,27/11/2015 12:01:11,A8,NA,NA,9.73,-273.15,14.63,647
565846cf0dd8e408e3817caf,27/11/2015 12:04:31,A7,-64,NA,NA,NA,NA,NA
565846cf0dd8e408e3817cb0,27/11/2015 12:04:31,A7,NA,NA,8.56,9.46,9.64,660
565847650dd8e408e3817cf5,27/11/2015 12:07:01,A8,-64,NA,NA,NA,NA,NA
565847660dd8e408e3817cf6,27/11/2015 12:07:02,A8,NA,NA,9.82,-273.15,14.29,643")
# Dplyr chain to create new vars, group then summarise
Hdata_soil <- dplyr::mutate(
Hdata_soil,
# Convert timestamp to POSIXct
timestamp = as.POSIXct(strptime(timestamp, format = "%d/%m/%Y %H:%M:%S"))
# Round to nearest minute
, timestamp_round = as.POSIXct(round(timestamp, units = "mins"))
) %>%
# Group by nearest minute timestamps and address
dplyr::group_by(timestamp_round, address) %>%
# Take minimum non-NA value
dplyr::summarise_each(
funs(min(., na.rm = TRUE))
)
產量:
> # Print
> Hdata_soil
Source: local data frame [3 x 10]
Groups: timestamp_round [?]
timestamp_round address X_id timestamp rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
(time) (chr) (chr) (time) (int) (lgl) (dbl) (dbl) (dbl) (int)
1 2015-11-27 12:01:00 A8 565846060dd8e408e3817c58 2015-11-27 12:01:10 -65 NA 9.73 -273.15 14.63 647
2 2015-11-27 12:05:00 A7 565846cf0dd8e408e3817caf 2015-11-27 12:04:31 -64 NA 8.56 9.46 9.64 660
3 2015-11-27 12:07:00 A8 565847650dd8e408e3817cf5 2015-11-27 12:07:01 -64 NA 9.82 -273.15 14.29 643
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.