![](/img/trans.png)
[英]How does r calculate the standard deviation and variance for non numeric data?
[英]How to find cumulative variance or standard deviation in R
我在數據框中有一列X
,我需要找到累積的標准偏差。
X Cumulative.SD
1 -
4 2.12
5 2.08
6 2.16
9 2.91
基本的想法
x <- c(1, 4, 5, 6, 9)
## cumulative sample size
n <- seq_along(x)
## cumulative mean
m <- cumsum(x) / n
#[1] 1.000000 2.500000 3.333333 4.000000 5.000000
## cumulative squared mean
m2 <- cumsum(x * x) / n
#[1] 1.0 8.5 14.0 19.5 31.8
## cumulative variance
v <- (m2 - m * m) * (n / (n - 1))
#[1] NaN 4.500000 4.333333 4.666667 8.500000
## cumulative standard deviation
s <- sqrt(v)
#[1] NaN 2.121320 2.081666 2.160247 2.915476
效用函數
cummean <- function (x) cumsum(x) / seq_along(x)
cumvar <- function (x, sd = FALSE) {
x <- x - x[sample.int(length(x), 1)] ## see Remark 2 below
n <- seq_along(x)
v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
if (sd) v <- sqrt(v)
v
}
## Rcpp version: `cumvar_cpp`
library(Rcpp)
cppFunction('NumericVector cumvar_cpp(NumericVector x, bool sd) {
int n = x.size();
NumericVector v(n);
srand(time(NULL));
double pivot = x[rand() % n];
double *xx = &x[0], *xx_end = &x[n], *vv = &v[0];
int i = 0; double xi, sum2 = 0.0, sum = 0.0, vi;
for (; xx < xx_end; xx++, vv++, i++) {
xi = *xx - pivot;
sum += xi; sum2 += xi * xi;
vi = (sum2 - (sum * sum) / (i + 1)) / i;
if (sd) vi = sqrt(vi);
*vv = vi;
}
return v;
}')
速度
cumvar
和cumvar_cpp
很快有兩個原因:
O(n)
復雜度而不是O(n ^ 2)
復雜度。 對於越來越長的矢量,它們比簡單的滾動計算越來越快。
x <- rnorm(1e+3)
library(microbenchmark)
library(TTR)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"Rich" = vapply(seq_along(x), function(i) sd(x[1:i]), 1),
"akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
# expr min lq mean median uq max
# zheyuan 101.261 105.2505 118.85214 121.0040 128.5925 157.702
# zheyuan_cpp 69.749 72.7190 81.38878 82.2335 84.2820 213.193
# Rich 74595.329 75201.9420 77533.38803 75814.6945 77465.9945 136099.832
# akrun 4329.892 4436.0145 4710.82440 4669.8380 4715.6035 6908.231
x <- rnorm(1e+4)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
# expr min lq mean median uq
# zheyuan 842.844 863.676 997.9585 880.2245 968.077
# zheyuan_cpp 618.823 632.254 709.1971 639.2990 657.366
# akrun 147279.410 148200.370 150839.8161 149599.6170 151981.069
x <- rnorm(1e+5)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"akrun" = runSD(x, n = 1, cumulative = TRUE),
times = 10)
#Unit: milliseconds
# expr min lq mean median uq
# zheyuan 8.446502 8.657557 22.531637 9.431389 11.082594
# zheyuan_cpp 6.189955 6.305053 6.698292 6.365656 6.812374
# akrun 14477.847050 14559.844609 14787.200820 14755.526655 15021.524429
備注1
我用cumstats
搜索“累積方差R”,發現了一個小包的cumstats
。 它有一個cumvar
函數,但是用sapply
編寫(如Rich Scriven的答案 ),因此我無需進行實驗。
備注2
感謝Benjamin Christoffersen的專業闡述 。 我已經添加到我的原始cumvar
以下行以增強數值穩定性。
x <- x - x[sample.int(length(x), 1)]
然后它與Ben的roll_var
相比返回正確的值。
## using Ben's example
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
all.equal(cumvar_cpp(x, FALSE), base::c(roll_var(x)))
#[1] TRUE
計算累積方差的速度比較:
x <- rnorm(1e+6)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, FALSE),
"Ben_cpp" = roll_var(x),
times = 20)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zheyuan 85.47676 87.36403 91.05656 89.64444 93.99912 102.04964 20
# zheyuan_cpp 42.27983 42.41443 44.29919 42.65548 46.43293 51.24379 20
# Ben_cpp 46.99105 47.12178 49.48072 47.76016 50.44587 60.11491 20
x <- rnorm(1e+7)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, FALSE),
"Ben_cpp" = roll_var(x),
times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zheyuan 1171.3624 1215.8870 1278.3862 1266.9576 1330.6168 1486.7895 10
# zheyuan_cpp 463.6257 473.2711 479.8156 476.8822 482.4766 512.0520 10
# Ben_cpp 571.7481 583.2694 587.9993 584.1206 602.0050 605.1515 10
我們可以使用runSD
TTR::runSD(df1$X, n = 1, cumulative = TRUE)
#[1] NA 2.121320 2.081666 2.160247 2.915476
df1 <- data.frame(X = c(1, 4, 5, 6, 9))
您還可以在維基站點查看算法以計算方差,並使用Rcpp
實現Welford的在線算法 ,如下所示
library(Rcpp)
func <- cppFunction(
"arma::vec roll_var(arma::vec &X){
const arma::uword n_max = X.n_elem;
double xbar = 0, M = 0;
arma::vec out(n_max);
double *x = X.begin(), *o = out.begin();
for(arma::uword n = 1; n <= n_max; ++n, ++x, ++o){
double tmp = (*x - xbar);
xbar += (*x - xbar) / n;
M += tmp * (*x - xbar);
if(n > 1L)
*o = M / (n - 1.);
}
if(n_max > 0)
out[0] = std::numeric_limits<double>::quiet_NaN();
return out;
}", depends = "RcppArmadillo")
# it gives the same
x <- c(1, 4, 5, 6, 9)
drop(func(x))
#R [1] NaN 4.50 4.33 4.67 8.50
sapply(seq_along(x), function(i) var(x[1:i]))
#R [1] NA 4.50 4.33 4.67 8.50
# it is fast
x <- rnorm(1e+3)
microbenchmark::microbenchmark(
func = func(x),
sapply = sapply(seq_along(x), function(i) var(x[1:i])))
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R func 9.09 9.88 30.7 20.5 21.9 1189 100
#R sapply 43183.49 45040.29 47043.5 46134.4 47309.7 80345 100
取平方根給出標准偏差。
這種方法的一個主要優點是取消沒有問題。 比如,比較
# there are no issues with cancellation
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
cumvar <- function (x, sd = FALSE) {
n <- seq_along(x)
v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
if (sd) v <- sqrt(v)
v
}
z1 <- drop(func(x))[-1]
z2 <- cumvar(x)[-1]
plot(z1, ylim = range(z1, z2), type = "l", lwd = 2)
lines(seq_along(z2), z2, col = "DarkBlue")
藍線是您從平方均值中減去平方值的算法。
你可以使用sapply()
。
sapply(seq_along(X), function(i) sd(X[1:i]))
# [1] NA 2.121320 2.081666 2.160247 2.915476
或稍快的vapply()
。
vapply(seq_along(X), function(i) sd(X[1:i]), 1)
# [1] NA 2.121320 2.081666 2.160247 2.915476
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.