简体   繁体   中英

R merging rows in a dataframe

Here is the head of a large data frame

head(Hdata_soil)
                      X_id           timestamp address rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1 565846060dd8e408e3817c58 2015-11-27 12:01:10      A8  -65     NA          NA          NA          NA              NA
2 565846070dd8e408e3817c59 2015-11-27 12:01:11      A8   NA     NA        9.73     -273.15       14.63             647
3 565846cf0dd8e408e3817caf 2015-11-27 12:04:31      A7  -64     NA          NA          NA          NA              NA
4 565846cf0dd8e408e3817cb0 2015-11-27 12:04:31      A7   NA     NA        8.56        9.46        9.64             660
5 565847650dd8e408e3817cf5 2015-11-27 12:07:01      A8  -64     NA          NA          NA          NA              NA
6 565847660dd8e408e3817cf6 2015-11-27 12:07:02      A8   NA     NA        9.82     -273.15       14.29             643

The full data set can be accessed from dropbox

As you can see, there are 2 consecutive observations for each address, with timestamps approximately 1 s apart. The variables are split between these 2 observations. How can I go about merging them into one row, conserving the first timestamp?

It would also be great to make sure that this only happens with 2 consecutive observations from the same address.

I would really appreciate it if someone could point me in the right direction with regards to packages /functions to use.

Check out the following code, which should meet your needs. First, the timestamp column is converted to an object of class 'POSIXlt', which allows the time difference between single observations to be determined. Then foreach is used to loop over all lines sequentially (via %do%), skipping any records that have already been merged into another line during a previous iteration (tracked in the vector 'used'). In combination with difftime, this identifies consecutive observations (e.g., within 5 s of the currently processed observation). Finally — and only if the 'address' of the current observation is present among the candidate records — the lines are merged, replacing missing values in the currently processed line with values from the consecutive observation.

## load 'foreach' package (kept so downstream code relying on it still
## works; the merge itself now uses a plain loop -- see note below)
library(foreach)

## import data; keep strings as character so rows can be merged
Hdata_soil <- read.csv("Hdata_soil.csv", header = TRUE, 
                       stringsAsFactors = FALSE)

## reformat timestamps so time differences can be computed
timestamps <- strptime(Hdata_soil$timestamp, format = "%Y-%m-%d %H:%M:%S")

n_obs <- length(timestamps)

## indices of rows already merged into an earlier row
## NOTE: inside 'foreach(...) %do%' every iteration evaluates in its own
## environment, so 'used <- c(used, ...)' never persisted between
## iterations and merged rows were emitted a second time; a plain for
## loop fixes that (and loses nothing, since %do% is sequential anyway)
used <- integer()

## preallocate the result list instead of rbind-ing per iteration
merged <- vector("list", n_obs)

for (i in seq_len(n_obs)) {

  ## skip rows that have already been merged into an earlier row
  if (i %in% used)
    next

  ## the last row has no later candidate; the old '(i+1):length(...)'
  ## built a *decreasing* sequence here and compared against earlier rows
  if (i == n_obs) {
    merged[[i]] <- Hdata_soil[i, , drop = FALSE]
    next
  }

  ## identify consecutive observation(s) (< 5 s after the current one)
  x <- timestamps[i]
  later <- (i + 1):n_obs
  y <- timestamps[later]

  # (subset same or consecutive days to reduce
  # computation time of 'difftime')
  id_day <- which(as.Date(y) == as.Date(x) | 
                    as.Date(y) == (as.Date(x) + 1))
  y <- y[id_day]

  # (subset records within 5 s from the current observation)
  id_sec <- which(difftime(y, x, units = "secs") < 5)
  id <- id_day[id_sec]

  ## keep only candidates from the SAME address (the old code merged
  ## every candidate as soon as *any* of them matched the address)
  if (length(id) > 0) {
    id <- id[Hdata_soil[i + id, "address"] == Hdata_soil[i, "address"]]
  }

  if (length(id) > 0) {

    row_x <- Hdata_soil[i, , drop = FALSE]
    for (j in seq_along(id)) {
      row_y <- Hdata_soil[i + id[j], , drop = FALSE]

      # overwrite all missing values in the current line with values
      # from the consecutive line
      fill <- which(is.na(row_x) & !is.na(row_y))
      row_x[fill] <- row_y[fill]

      # remember that this later row has been consumed
      used <- c(used, i + id[j])
    }
    merged[[i]] <- row_x

  ## else keep the current line as is
  } else {
    merged[[i]] <- Hdata_soil[i, , drop = FALSE]
  }
}

## drop skipped slots and bind once (far cheaper than rbind per row)
dat_out <- do.call(rbind, merged[!vapply(merged, is.null, logical(1))])

However, the code takes quite long to run, which seems to be related to difftime.

 >     user   system  elapsed 
 > 2209.504   99.389 2311.996 

I think the following should work. Make a vector of unique address labels. Then for each address label, extract the relevant rows and use various functions to pick the row you want (eg the minimum timestamp, the rssi value that isn't NA etc.). Use rbind.data.frame to rebuild from the list at the end.

## one output row per unique address label
unad <- unique(Hdata_soil$address)

## collapse a numeric column of a row pair to length 1:
## NA if all values missing, the single non-missing value if there is
## exactly one, otherwise the mean (ensures result has length 1)
collapse_num <- function(v) {
  v <- v[!is.na(v)]
  if (length(v) == 0L) NA else if (length(v) >= 2L) mean(v) else v
}

lst <- lapply(unad, function(ad){
    recs <- Hdata_soil[Hdata_soil$address == ad, ]
    data.frame(
      X_id            = recs$X_id[1],         # id of the first record
      timestamp       = min(recs$timestamp),  # conserve earliest time
      address         = ad,
      rssi            = collapse_num(recs$rssi),
      batt_v          = collapse_num(recs$batt_v),
      soil_temp_1     = collapse_num(recs$soil_temp_1),
      soil_temp_2     = collapse_num(recs$soil_temp_2),
      soil_temp_3     = collapse_num(recs$soil_temp_3),
      soil_moisture_1 = collapse_num(recs$soil_moisture_1),
      stringsAsFactors = FALSE
    )
})

result <- do.call(rbind.data.frame, lst)

adaptation re comment:

## start/end row of each group; initially every row is its own group
mtx <- matrix(seq_len(nrow(Hdata_soil)), nrow(Hdata_soil), 2)
colnames(mtx) <- c("startR", "endR")  # 'col.names()' does not exist in R

# identifies consecutive duplicate addresses and groups them together
# NOTE: the assignments must NOT be wrapped in with(): with() evaluates
# in a temporary environment, so 'mtx[r, 2] <- ...' inside it was
# silently discarded and mtx never changed
for (r in seq_len(nrow(mtx) - 1)) {
  if (identical(Hdata_soil$address[r], Hdata_soil$address[r + 1])) {
    mtx[r, "endR"] <- mtx[r, "endR"] + 1
    mtx[r + 1, ] <- c(NA, NA)
  }
}

# remove NAs - essentially noting that duplicate addresses have been
# grouped (drop = FALSE keeps a matrix even if only one group remains)
mtx <- mtx[!is.na(mtx[, "startR"]), , drop = FALSE]

lst <- lapply(seq_len(nrow(mtx)), function(r){
    datsubset <- Hdata_soil[mtx[r, "startR"]:mtx[r, "endR"], , drop = FALSE]
    # aggregate the subset of rows into one row as you please
    datsubset
})

result <- do.call(rbind.data.frame, lst)

Note this will need some adaptation if any address occurs three or more times consecutively.

First of all, I think your data needs an extra id column, because address is not unique per row pair, neither is any other column considering how you want to group them. For the sake of simplicity I will define the id column here as:

## Toy pair identifier: consecutive row pairs share an id ("1","1","2","2","3","3")
df$id <- rep(as.character(1:3), each = 2)

Then we can do the following

# Turn missing values into zeros so the per-id sums are unaffected by NA
df[is.na(df)] <- 0

# Positions of the numeric columns that should be aggregated
tokeep <- which(vapply(df, is.numeric, logical(1)))

# Collapse to one row per id by summing every numeric column in each group
setDT(df)[, lapply(.SD, sum), by = id, .SDcols = tokeep]

Which yields:

   id rssi soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1:  1  -65        9.73     -273.15       14.63             647
2:  2  -64        8.56        9.46        9.64             660
3:  3  -64        9.82     -273.15       14.29             643

You can consequently merge this with the remaining non numeric columns of your original df, choosing which unique values you want to drop in X_id and timestamp .

This solution using dplyr may work if you are confident that rounding the "timestamp" to the nearest minute provides a unique identifier in conjunction with "address":

library(readr) # Required only for recreating your data frame
library(dplyr)

Hdata_soil <- readr::read_csv("X_id,timestamp,address,rssi,batt_v,soil_temp_1,soil_temp_2,soil_temp_3,soil_moisture_1
565846060dd8e408e3817c58,27/11/2015 12:01:10,A8,-65,NA,NA,NA,NA,NA
565846070dd8e408e3817c59,27/11/2015 12:01:11,A8,NA,NA,9.73,-273.15,14.63,647
565846cf0dd8e408e3817caf,27/11/2015 12:04:31,A7,-64,NA,NA,NA,NA,NA
565846cf0dd8e408e3817cb0,27/11/2015 12:04:31,A7,NA,NA,8.56,9.46,9.64,660
565847650dd8e408e3817cf5,27/11/2015 12:07:01,A8,-64,NA,NA,NA,NA,NA
565847660dd8e408e3817cf6,27/11/2015 12:07:02,A8,NA,NA,9.82,-273.15,14.29,643")

# Dplyr chain to create new vars, group, then summarise.
# NOTE: summarise_each()/funs() have been removed from dplyr;
# summarise(across(...)) is the current equivalent.
Hdata_soil <- dplyr::mutate(
  Hdata_soil,
  # Convert timestamp to POSIXct
  timestamp = as.POSIXct(strptime(timestamp, format = "%d/%m/%Y %H:%M:%S"))
  # Round to nearest minute
  , timestamp_round = as.POSIXct(round(timestamp, units = "mins"))
) %>% 
  # Group by nearest-minute timestamps and address
  dplyr::group_by(timestamp_round, address) %>% 
  # Take minimum non-NA value of every remaining column
  # (an all-NA column such as batt_v still follows min(., na.rm = TRUE)
  # semantics, warning on empty input, exactly as before)
  dplyr::summarise(
    dplyr::across(dplyr::everything(), ~ min(.x, na.rm = TRUE)),
    .groups = "drop"
  )

Which yields:

> # Print
> Hdata_soil
Source: local data frame [3 x 10]
Groups: timestamp_round [?]

      timestamp_round address                     X_id           timestamp  rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
               (time)   (chr)                    (chr)              (time) (int)  (lgl)       (dbl)       (dbl)       (dbl)           (int)
1 2015-11-27 12:01:00      A8 565846060dd8e408e3817c58 2015-11-27 12:01:10   -65     NA        9.73     -273.15       14.63             647
2 2015-11-27 12:05:00      A7 565846cf0dd8e408e3817caf 2015-11-27 12:04:31   -64     NA        8.56        9.46        9.64             660
3 2015-11-27 12:07:00      A8 565847650dd8e408e3817cf5 2015-11-27 12:07:01   -64     NA        9.82     -273.15       14.29             643

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM