簡體   English   中英

如何在R中找到累積方差或標准差

[英]How to find cumulative variance or standard deviation in R

我在數據框中有一列X ,我需要找到累積的標准偏差。

X  Cumulative.SD
1     -
4    2.12
5    2.08
6    2.16
9    2.91

基本的想法

x <- c(1, 4, 5, 6, 9)

## cumulative sample size
n <- seq_along(x)

## cumulative mean
m <- cumsum(x) / n
#[1] 1.000000 2.500000 3.333333 4.000000 5.000000

## cumulative squared mean
m2 <- cumsum(x * x) / n
#[1]  1.0  8.5 14.0 19.5 31.8

## cumulative variance
v <- (m2 - m * m) * (n / (n - 1))
#[1]      NaN 4.500000 4.333333 4.666667 8.500000

## cumulative standard deviation
s <- sqrt(v)
#[1]      NaN 2.121320 2.081666 2.160247 2.915476

效用函數

cummean <- function (x) cumsum(x) / seq_along(x)

cumvar <- function (x, sd = FALSE) {
  x <- x - x[sample.int(length(x), 1)]  ## see Remark 2 below
  n <- seq_along(x)
  v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
  if (sd) v <- sqrt(v)
  v
  }

## Rcpp version: `cumvar_cpp`
library(Rcpp)

cppFunction('NumericVector cumvar_cpp(NumericVector x, bool sd) {
  int n = x.size();
  NumericVector v(n);
  srand(time(NULL));
  double pivot = x[rand() % n];
  double *xx = &x[0], *xx_end = &x[n], *vv = &v[0];
  int i = 0; double xi, sum2 = 0.0, sum = 0.0, vi;
  for (; xx < xx_end; xx++, vv++, i++) {
    xi = *xx - pivot;
    sum += xi; sum2 += xi * xi;
    vi = (sum2 - (sum * sum) / (i + 1)) / i;
    if (sd) vi = sqrt(vi);
    *vv = vi;
    }
  return v;
  }')

速度

cumvarcumvar_cpp很快有兩個原因:

  1. 它們是“矢量化的”;
  2. 它們具有O(n)復雜度而不是O(n ^ 2)復雜度。

對於越來越長的矢量,它們比簡單的滾動計算越來越快。

x <- rnorm(1e+3)
library(microbenchmark)
library(TTR)
microbenchmark("zheyuan" = cumvar(x, TRUE),
               "zheyuan_cpp" = cumvar_cpp(x, TRUE),
               "Rich" = vapply(seq_along(x), function(i) sd(x[1:i]), 1),
               "akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
#        expr       min         lq        mean     median         uq        max
#     zheyuan   101.261   105.2505   118.85214   121.0040   128.5925    157.702
# zheyuan_cpp    69.749    72.7190    81.38878    82.2335    84.2820    213.193
#        Rich 74595.329 75201.9420 77533.38803 75814.6945 77465.9945 136099.832
#       akrun  4329.892  4436.0145  4710.82440  4669.8380  4715.6035   6908.231

x <- rnorm(1e+4)
microbenchmark("zheyuan" = cumvar(x, TRUE),
               "zheyuan_cpp" = cumvar_cpp(x, TRUE),
               "akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
#        expr        min         lq        mean      median         uq
#     zheyuan    842.844    863.676    997.9585    880.2245    968.077
# zheyuan_cpp    618.823    632.254    709.1971    639.2990    657.366
#       akrun 147279.410 148200.370 150839.8161 149599.6170 151981.069

x <- rnorm(1e+5)
microbenchmark("zheyuan" = cumvar(x, TRUE),
               "zheyuan_cpp" = cumvar_cpp(x, TRUE),
               "akrun" = runSD(x, n = 1, cumulative = TRUE),
               times = 10)
#Unit: milliseconds
#        expr          min           lq         mean       median           uq
#     zheyuan     8.446502     8.657557    22.531637     9.431389    11.082594
# zheyuan_cpp     6.189955     6.305053     6.698292     6.365656     6.812374
#       akrun 14477.847050 14559.844609 14787.200820 14755.526655 15021.524429

備注1

我用cumstats搜索“累積方差R”,發現了一個小包的cumstats 它有一個cumvar函數,但是用sapply編寫(如Rich Scriven的答案 ),因此我無需進行實驗。


備注2

感謝Benjamin Christoffersen的專業闡述 我已經添加到我的原始cumvar以下行以增強數值穩定性。

x <- x - x[sample.int(length(x), 1)]

然后它與Ben的roll_var相比返回正確的值。

## using Ben's example
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
all.equal(cumvar_cpp(x, FALSE), base::c(roll_var(x)))
#[1] TRUE

計算累積方差的速度比較:

x <- rnorm(1e+6)
microbenchmark("zheyuan" = cumvar(x, TRUE),
               "zheyuan_cpp" = cumvar_cpp(x, FALSE),
               "Ben_cpp" = roll_var(x),
               times = 20)
#Unit: milliseconds
#        expr      min       lq     mean   median       uq       max neval
#     zheyuan 85.47676 87.36403 91.05656 89.64444 93.99912 102.04964    20
# zheyuan_cpp 42.27983 42.41443 44.29919 42.65548 46.43293  51.24379    20
#     Ben_cpp 46.99105 47.12178 49.48072 47.76016 50.44587  60.11491    20

x <- rnorm(1e+7)
microbenchmark("zheyuan" = cumvar(x, TRUE),
               "zheyuan_cpp" = cumvar_cpp(x, FALSE),
               "Ben_cpp" = roll_var(x),
               times = 10)
#Unit: milliseconds
#        expr       min        lq      mean    median        uq       max neval
#     zheyuan 1171.3624 1215.8870 1278.3862 1266.9576 1330.6168 1486.7895    10
# zheyuan_cpp  463.6257  473.2711  479.8156  476.8822  482.4766  512.0520    10
#     Ben_cpp  571.7481  583.2694  587.9993  584.1206  602.0050  605.1515    10

我們可以使用runSD

TTR::runSD(df1$X, n = 1, cumulative = TRUE)
#[1]       NA 2.121320 2.081666 2.160247 2.915476

數據

df1 <- data.frame(X = c(1, 4, 5, 6, 9))

您還可以在維基站點查看算法以計算方差,並使用Rcpp實現Welford的在線算法 ,如下所示

library(Rcpp)
func <- cppFunction(
  "arma::vec roll_var(arma::vec &X){
    const arma::uword n_max = X.n_elem;
    double xbar = 0, M = 0; 
    arma::vec out(n_max);
    double *x = X.begin(), *o = out.begin();

    for(arma::uword n = 1; n <= n_max; ++n, ++x, ++o){
      double tmp = (*x - xbar);
      xbar += (*x - xbar) / n;
      M += tmp * (*x - xbar);
      if(n > 1L)
        *o = M / (n - 1.);
    }

    if(n_max > 0)
      out[0] = std::numeric_limits<double>::quiet_NaN();

    return out;
  }", depends = "RcppArmadillo")

# it gives the same
x <- c(1, 4, 5, 6, 9)
drop(func(x))
#R [1]  NaN 4.50 4.33 4.67 8.50
sapply(seq_along(x), function(i) var(x[1:i]))
#R [1]   NA 4.50 4.33 4.67 8.50

# it is fast
x <- rnorm(1e+3)
microbenchmark::microbenchmark(
  func = func(x), 
  sapply = sapply(seq_along(x), function(i) var(x[1:i])))
#R Unit: microseconds
#R    expr      min       lq    mean  median      uq   max neval
#R    func     9.09     9.88    30.7    20.5    21.9  1189   100
#R  sapply 43183.49 45040.29 47043.5 46134.4 47309.7 80345   100

取平方根給出標准偏差。


這種方法的一個主要優點是取消沒有問題。 比如,比較

# there are no issues with cancellation
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
cumvar <- function (x, sd = FALSE) {
  n <- seq_along(x)
  v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
  if (sd) v <- sqrt(v)
  v
}
z1 <- drop(func(x))[-1]
z2 <- cumvar(x)[-1]
plot(z1, ylim = range(z1, z2), type = "l", lwd  = 2)
lines(seq_along(z2), z2, col = "DarkBlue")

在此輸入圖像描述

藍線是您從平方均值中減去平方值的算法。

你可以使用sapply()

sapply(seq_along(X), function(i) sd(X[1:i]))
# [1]       NA 2.121320 2.081666 2.160247 2.915476

或稍快的vapply()

vapply(seq_along(X), function(i) sd(X[1:i]), 1)
# [1]       NA 2.121320 2.081666 2.160247 2.915476

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM