简体   繁体   中英

Split a vector and summing values

I'm a R newbie. I've got a vector

vec <- c(105,29,41,70,77,0,56,49,63,0,105)

and i would like to sum values till "0" occurs and then create a vector with such values, such as:

vec2 <- c(322,168,105)

But i really don't know where to start! Any suggestion?

Starting with this vector...

> vec
 [1] 105  29  41  70  77   0  56  49  63   0 105

We can compute a logical TRUE/FALSE vector of where the zeroes are:

> vec == 0
 [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE

When you add FALSE and TRUE, FALSE is zero and TRUE is one, so if we add that vector up every time we get to a TRUE the value increases. So using cumsum for the cumulative sum, we get:

> cumsum(vec==0)
 [1] 0 0 0 0 0 1 1 1 1 2 2

Now that result defines the groups that we want to add up within, so let's split vec by that result:

> split(vec, cumsum(vec==0))
$`0`
[1] 105  29  41  70  77

$`1`
[1]  0 56 49 63

$`2`
[1]   0 105

So apart from the zeroes in the second and subsequent parts of the list, that's the numbers we want to add up. Because we are adding we can add the zeroes and it doesn't make any difference (but if you wanted the mean you would have to drop the zeroes). Now we use sapply to iterate over list elements and compute the sum:

> sapply(split(vec, cumsum(vec==0)),sum)
  0   1   2 
322 168 105 

Job done. Ignore the 0 1 2 labels.

Another option is by

as.numeric(by(vec, cumsum(vec == 0), sum))
#[1] 322 168 105

Benchmark

Benchmark comparison of the methods for a larger vector based on microbenchmark

# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0

library(microbenchmark)
res <- microbenchmark(
    vapply = {
        I <- which(vec == 0)
        vapply(1:(length(I)+1),
            function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
            numeric(1))
   },
   by = {
       as.numeric(by(vec, cumsum(vec == 0), sum))
   },
   aggregate = {
       aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
   },
   split = {
       sapply(split(vec, cumsum(vec == 0)), sum)
   },
   Reduce = {
       ans <- numeric(0)
       s <- n <- 0
       Reduce(f = function (y,x) {
           if(x == 0) {
               ans <<- c(ans,s)
               s <<- 0
           }
           n <<- n+1
           s <<- x+s
           if (n == length(vec))
               ans <<- c(ans,s)
           s
       }, vec, init = 0, accumulate = TRUE)
       ans
   },
   for_loop = {
       I   <- which(vec == 0)
       n   <- length(vec)
       N   <- length(I) + 1
       res <- numeric(N)
       for(k in seq_along(res)) {
           if (k == 1) {
               res[k] <- sum(vec[1:I[1]])
               next
           }
           if (k == N) {
               res[k] <- sum(vec[I[N-1]:n])
               next
           }
           res[k] <- sum(vec[I[k-1]:I[k]])
       }
       res
   }
)
res
#    Unit: microseconds
#      expr       min         lq       mean     median         uq       max
#    vapply   435.658   487.4230   621.6155   511.3625   607.2005  6175.039
#        by  3897.401  4187.2825  4721.3168  4436.5850  4936.2900 12365.351
# aggregate  4817.032  5392.0620  6002.2579  5831.2905  6310.3665  9782.524
#     split   611.175   758.4485   895.2201   838.7665   957.0085  1516.556
#    Reduce 21372.054 22169.9110 25363.8684 23022.6920 25503.6145 49255.714
#  for_loop 15172.255 15846.5735 17252.6895 16445.7900 17572.7535 34401.827

library(ggplot2)
autoplot(res)

在此输入图像描述

The aggregate function is useful for this kind of thing. You create a grouping variable with cumsum (similar to how @Spacedman explained). Using the sum function as the aggregating operation. The [[2]] at the end just extracts what you want from what aggregate returns:

aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]

[1] 322 168 105

With vapply

Here is an option with vapply

I <- which(vec == 0)
vapply(1:(length(I)+1), 
       function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]), 
       numeric(1))
# [1] 322 168 105

With Reduce

Here is a solution using Reduce

ans <- numeric(0)
s <- n <- 0
Reduce(f = function (y,x) {
            if(x == 0) {
              ans <<- c(ans,s)
              s <<- 0
            }
            n <<- n+1
            s <<- x+s
            if(n == length(vec))
              ans <<- c(ans,s)
            s
       }, vec, init = 0, accumulate = TRUE)
ans
# [1] 322 168 105

With A Loop

Or maybe an old fashioned loop

I   <- which(vec == 0)
n   <- length(vec)
N   <- length(I) + 1
res <- numeric(N)
for(k in seq_along(res)) {
  if (k == 1) {
    res[k] <- sum(vec[1:I[1]])
    next
  }
  if (k == N) {
    res[k] <- sum(vec[I[N-1]:n])
    next
  }
  res[k] <- sum(vec[I[k-1]:I[k]])
}
res
# [1] 322 168 105

Benchmarking

Data

Here is the data used for benchmarking

# c.f. @MauritsEvers
# Create sample vector with N entries
set.seed(2018)
N <- 10000
vec <- sample(100, N, replace = T)
vec[sample(length(vec), 100)] <- 0

Functions

Here are the functions for the second benchmarking figures:

reduce <- function(vec) {
  ans <- numeric(0)
  s <- n <- 0
  Reduce(f = function (y,x) {
    if(x == 0) {
      ans <<- c(ans,s)
      s <<- 0
    }
    n <<- n+1
    s <<- x+s
    if(n == length(vec))
     ans <<- c(ans,s)
     s 
  }, vec, init = 0, accumulate = TRUE)
  ans
}
Vapply <- function (vec) {
  I <- which(vec == 0)
  vapply(1:(length(I)+1), 
         function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]), 
         numeric(1))
}
By <- function (vec) as.numeric(by(vec, cumsum(vec == 0), sum))
Split <- function (vec) sapply(split(vec, cumsum(vec==0)),sum)
Aggregate <- function (vec) aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
for_loop <- function(vec) {
  I <- which(vec == 0)
  n <- length(vec)
  N <- length(I)+1
  res <- numeric(N)
  for(k in seq_along(res)) {
    if (k == 1) {
      res[k] <- sum(vec[1:I[1]])
      next
    }
    if (k == N) {
      res[k] <- sum(vec[I[N-1]:n])
      next
    }
    res[k] <- sum(vec[I[k-1]:I[k]])
  }
  res
}
Rowsum <- function (vec) rowsum(vec, cumsum(vec == 0))

Benchmarking

Here are the two benchmarking processes combined:

# c.f. @MauritsEvers
resBoth <- microbenchmark::microbenchmark(
  Vapply = {
    I <- which(vec == 0)
    vapply(1:(length(I)+1),
           function(k) sum(vec[max(I[k-1],1):min(I[k], length(vec), na.rm = TRUE)]),
           numeric(1))
  },
  Vapply(vec),
  By = {
    as.numeric(by(vec, cumsum(vec == 0), sum))
  },
  By(vec),
  Aggregate = {
    aggregate(vec, by = list(cumsum(vec == 0)), FUN = sum)[[2]]
  },
  Aggregate(vec),
  Split = {
    sapply(split(vec, cumsum(vec == 0)), sum)
  },
  Split(vec),
  reduce = {
    ans <- numeric(0)
    s <- n <- 0
    Reduce(f = function (y,x) {
      if(x == 0) {
        ans <<- c(ans,s)
        s <<- 0
      }
      n <<- n+1
      s <<- x+s
      if (n == length(vec))
        ans <<- c(ans,s)
      s
    }, vec, init = 0, accumulate = TRUE)
    ans
  },
  reduce(vec),
  for_loop = {
    I   <- which(vec == 0)
    n   <- length(vec)
    N   <- length(I) + 1
    res <- numeric(N)
    for(k in seq_along(res)) {
      if (k == 1) {
        res[k] <- sum(vec[1:I[1]])
        next
      }
      if (k == N) {
        res[k] <- sum(vec[I[N-1]:n])
        next
      }
      res[k] <- sum(vec[I[k-1]:I[k]])
    }
    res
  },
  for_loop(vec),
  Rowsum = {rowsum(vec, cumsum(vec == 0))},
  Rowsum(vec),
  times = 10^3
 )

Results

Here are the benchmarking results

resBoth
# Unit: microseconds
#           expr       min         lq       mean     median         uq       max neval     cld
#         Vapply   234.121   281.5280   358.0708   311.7955   343.5215  4775.018  1000 ab     
#    Vapply(vec)   234.850   278.6100   376.3956   306.3260   334.4050 14564.278  1000 ab     
#             By  1866.029  2108.7175  2468.1208  2209.0025  2370.5520 23316.045  1000   c    
#        By(vec)  1870.769  2120.5695  2473.1643  2217.3900  2390.6090 21039.762  1000   c    
#      Aggregate  2738.324  3015.6570  3298.0863  3117.9480  3313.2295 13328.404  1000    d   
# Aggregate(vec)  2733.583  2998.1530  3295.6874  3109.1955  3349.1500  8277.694  1000    d   
#          Split   359.202   412.0800   478.0553   444.1710   492.3080  4622.220  1000  b     
#     Split(vec)   366.131   410.4395   475.2633   444.1715   490.3025  4601.799  1000  b     
#         reduce 10862.491 13062.3755 15353.2826 14465.0870 16559.3990 76305.463  1000       g
#    reduce(vec) 10403.004 12448.9965 14658.4035 13825.9995 15893.3255 67337.080  1000      f 
#       for_loop  6687.724  7429.4670  8518.0470  7818.0250  9023.9955 27541.136  1000     e  
#  for_loop(vec)   123.624   145.8690   187.2201   157.5390   177.4140  9928.200  1000 a      
#         Rowsum   235.579   264.3880   305.7516   282.2570   322.7360   792.068  1000 ab     
#    Rowsum(vec)   239.590   264.9350   307.2508   284.8100   322.0060  1778.143  1000 ab  

rowsum() is known to be quite fast. We can use cumsum(vec == 0) for the grouping.

c(rowsum(vec, cumsum(vec == 0)))
# [1] 322 168 105

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM