[英]Imputing Missing Values Using Values from a Different Row in the Same Column in R
可能有更好的解决方案,但我使用data.table编写了此解决方案:
library(data.table)

# Example data; in practice this would come from read.csv(), read.xlsx(), etc.
raw <- data.table(
  Event = paste0("e", 1:10),
  TransactionID = c("t1", NA, NA, "t4", NA, "t5", "t6", NA, NA, "t8"),
  UserId = c(rep("kenn1", 4), rep("kenn2", 6)),
  EventTime = as.POSIXct(
    c(
      "2017-05-20 9:00", "2017-05-20 9:30", "2017-05-20 9:45",
      "2017-05-20 9:50", "2017-05-20 10:01", "2017-05-20 10:02",
      "2017-05-20 10:03", "2017-05-20 10:04", "2017-05-20 10:05",
      "2017-05-20 10:06"
    ),
    format = "%Y-%m-%d %H:%M"
  )
)

# Fill each missing TransactionID with the ID of the nearest (in time) event
# of the same user that has one, looking only at the closest known row above
# and the closest known row below.
#
# The nearest known rows are located by carrying row positions forward ("locf")
# and backward ("nocb") with data.table::nafill(), so no extra package is
# needed and the row order of `raw` is left untouched (no merge/re-sort).
#
# NOTE(review): "above"/"below" follow the row order within each user, so this
# presumably expects rows to be ordered by EventTime per user -- confirm for
# real data.
raw[, TransactionID := {
  # Position of each row that has an ID; NA where the ID is missing.
  known_pos <- replace(seq_len(.N), is.na(TransactionID), NA_integer_)
  above_pos <- nafill(known_pos, type = "locf")
  below_pos <- nafill(known_pos, type = "nocb")

  # Work in plain seconds so both gaps are always in the same unit.
  secs <- as.numeric(EventTime)
  above_gap <- secs - secs[above_pos]
  below_gap <- secs[below_pos] - secs

  # Take the row above when it exists and is at least as close as the row
  # below (ties go to the row above); otherwise take the row below. If neither
  # exists for this user, the result stays NA.
  use_above <- !is.na(above_gap) & (is.na(below_gap) | above_gap <= below_gap)
  nearest_pos <- fifelse(use_above, above_pos, below_pos)

  # Never overwrite an ID that was already present.
  fifelse(is.na(TransactionID), TransactionID[nearest_pos], TransactionID)
}, by = UserId]
使用data.table
另一种解决方案。
library(data.table)

# Example data; in practice this would come from read.csv(), read.xlsx(), etc.
raw <- data.table(
  Event = paste0("e", 1:10),
  TransactionID = c("t1", NA, NA, "t4", NA, "t5", "t6", NA, NA, "t8"),
  UserId = c(rep("kenn1", 4), rep("kenn2", 6)),
  EventTime = as.POSIXct(
    c(
      "2017-05-20 9:00", "2017-05-20 9:30", "2017-05-20 9:45",
      "2017-05-20 9:50", "2017-05-20 10:01", "2017-05-20 10:02",
      "2017-05-20 10:03", "2017-05-20 10:04", "2017-05-20 10:05",
      "2017-05-20 10:06"
    ),
    format = "%Y-%m-%d %H:%M"
  )
)

# Candidate rows: the events that already carry a TransactionID.
raw_notNA <- raw[!is.na(TransactionID)]

# Pair every event with every candidate event of the same user. Each original
# row is repeated once per candidate (hence allow.cartesian); sort = FALSE
# keeps the rows in the order they have in `raw`.
merged <- merge(
  raw, raw_notNA,
  by = "UserId", all.x = TRUE, allow.cartesian = TRUE, sort = FALSE
)

# Absolute time distance between each event and its candidate, in seconds.
merged[, DiffTime := abs(as.numeric(
  difftime(EventTime.x, EventTime.y, units = "secs")
))]

# Collapse back to one row per event. An event keeps its own TransactionID if
# it has one; otherwise it takes the ID of the closest candidate. which.min()
# returns a single position even when two candidates are equally close, and
# the trailing [1L] yields NA when the user has no candidate at all.
output <- merged[, .(
  TransactionID = if (!is.na(TransactionID.x[1L])) {
    TransactionID.x[1L]
  } else {
    TransactionID.y[which.min(DiffTime)][1L]
  },
  UserId = UserId[1L],
  EventTime = EventTime.x[1L]
), by = .(Event = Event.x)]

print(output)
受此问题答案的启发(您的问题不是重复的,只是相似的)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.