简体   繁体   中英

R: Rolling mean on values below variable quantile threshold

I have a large data.table . I require:

  • the rolling quantile of a numeric column
  • the rolling mean of the numeric column applied on values below the (moving) quantile threshold
library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))

> data
    group      value
 1:     A -0.3260365
 2:     A  0.5524619
 3:     A -0.6749438
 4:     A  0.2143595
 5:     A  0.3107692
 6:     A  1.1739663
 7:     A  0.6187899
 8:     A -0.1127343
 9:     A  0.9170283
10:     A -0.2232594
11:     B  0.5264481
12:     B -0.7948444
13:     B  1.4277555
14:     B -1.4668197
15:     B -0.2366834
16:     B -0.1933380
17:     B -0.8497547

To determine the rolling quantile of the 'value' column I am using the function runquantile() from the package caTools which takes ~1 minute to execute on my data set.

For the same rolling window (here k=4 ), how to obtain the rolling mean value below the moving quantile, when computation time is a concern? In the example, the result should look something like column 'mean_below_q'.

library(caTools)

data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]

> data
    group      value   rolling_q mean_below_q
 1:     A -0.3260365          NA           NA
 2:     A  0.5524619          NA           NA
 3:     A -0.6749438          NA           NA
 4:     A  0.2143595 -0.21795730  -0.50049020
 5:     A  0.3107692  0.23364141  -0.23029220
 6:     A  1.1739663  0.23364141  -0.23029220
 7:     A  0.6187899  0.37237334   0.26256430
 8:     A -0.1127343  0.37237334   0.09901745
 9:     A  0.9170283  0.67843754   0.25302780
10:     A -0.2232594  0.03357052  -0.16799680
11:     B  0.5264481          NA           NA
12:     B -0.7948444          NA           NA
13:     B  1.4277555          NA           NA
14:     B -1.4668197 -0.53058593  -1.13083200
15:     B -0.2366834 -0.68321222  -1.13083200
16:     B -0.1933380 -0.22801430  -0.85175150
17:     B -0.8497547 -0.72714047  -1.15828700

Thank you very much.


As per @chinsoon12 brilliant script it solves the problem and takes roughly 15 minutes on the large data set.

# Option 1)

library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
sz <- 4L
prob <- 0.4

#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]

data[, rolling_q := frollapply(value, sz, function(x) {
        x <- sort(x, partial = unique(c(lo, hi)))
        qs <- x[lo]
        i <- which(index > lo & x[hi] != qs)
        h <- (index - lo)[i]
        qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
        qs
        #.(q, sum(x[1L:lo]) / lo)
    }), group]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

Lastly, I tried to use runquantile() from the caTools package in conjunction with the Option 1) code for the rolling mean below threshold. It seems to work fine and takes in total 2.5 minutes on the large data set.

# rolling_q via runquantile & mean_below_q via Option 1)

library(data.table)
library(caTools)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))

data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]

sz <- 4L
prob <- 0.4

index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

> data
    group      value   rolling_q sr er mean_below_q
 1:     A -0.3260365          NA -2  1           NA
 2:     A  0.5524619          NA -1  2           NA
 3:     A -0.6749438          NA  0  3           NA
 4:     A  0.2143595 -0.21795730  1  4  -0.50049017
 5:     A  0.3107692  0.23364141  2  5  -0.23029219
 6:     A  1.1739663  0.23364141  3  6  -0.23029219
 7:     A  0.6187899  0.37237334  4  7   0.26256434
 8:     A -0.1127343  0.37237334  5  8   0.09901745
 9:     A  0.9170283  0.67843754  6  9   0.25302777
10:     A -0.2232594  0.03357052  7 10  -0.16799684
11:     B  0.5264481          NA  8 11           NA
12:     B -0.7948444          NA  9 12           NA
13:     B  1.4277555          NA 10 13           NA
14:     B -1.4668197 -0.53058593 11 14  -1.13083206
15:     B -0.2366834 -0.68321222 12 15  -1.13083206
16:     B -0.1933380 -0.22801430 13 16  -0.85175154
17:     B -0.8497547 -0.72714047 14 17  -1.15828722
18:     C  0.0000000          NA 15 18           NA
19:     C  0.0000000          NA 16 19           NA
20:     C  0.0000000          NA 17 20           NA
21:     C -0.9000000  0.00000000 18 21  -0.22500000
22:     C  0.0000000  0.00000000 19 22  -0.22500000
23:     C  0.0000000  0.00000000 20 23  -0.22500000
24:     C  0.0000000  0.00000000 21 24  -0.22500000
25:     C  0.0000000  0.00000000 22 25   0.00000000
26:     C -0.5000000  0.00000000 23 26  -0.12500000
27:     C -0.8000000 -0.40000000 24 27  -0.65000000
28:     C  0.0000000 -0.40000000 25 28  -0.65000000
29:     C  0.0000000 -0.40000000 26 29  -0.65000000
30:     C  0.0000000  0.00000000 27 30  -0.20000000
    group      value   rolling_q sr er mean_below_q

I believe there is a typo as data is showing mean of values below percentile rather than above percentile.

Here are 2 options:

1) using frollapply (as it returns a value of length 1), hence you need another non-equi join to calculate your mean of values below percentile.

data[, rolling_q := frollapply(value, sz, function(x) {
        x <- sort(x, partial = unique(c(lo, hi)))
        qs <- x[lo]
        i <- which(index > lo & x[hi] != qs)
        h <- (index - lo)[i]
        qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
        qs
        #.(q, sum(x[1L:lo]) / lo)
    }), group]

data[, mean_below_q := 
    data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]

2) Using 1 non-equi join to calculate both values

data[, c("rolling_q", "mean_below_q") := 
    .SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
        by=.EACHI, {
            if (.N >= sz) {
                x <- x.value
                x <- sort(x, partial = unique(c(lo, hi)))
                qs <- x[lo]
                i <- which(index > lo & x[hi] != qs)
                h <- (index - lo)[i]
                qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
                qs

                .(qs, sum(x[1L:lo]) / lo)
            } else 
                .(NA_real_, NA_real_)
        }][, (1L:3L) := NULL]
]

output:

    group      value sr er   rolling_q mean_below_q
 1:     A -0.3260365 -2  1          NA           NA
 2:     A  0.5524619 -1  2          NA           NA
 3:     A -0.6749438  0  3          NA           NA
 4:     A  0.2143595  1  4 -0.21795730  -0.50049017
 5:     A  0.3107692  2  5  0.23364141  -0.23029219
 6:     A  1.1739663  3  6  0.23364141  -0.23029219
 7:     A  0.6187899  4  7  0.37237334   0.26256434
 8:     A -0.1127343  5  8  0.37237334   0.09901745
 9:     A  0.9170283  6  9  0.67843754   0.25302777
10:     A -0.2232594  7 10  0.03357052  -0.16799684
11:     B  0.5264481  8 11          NA           NA
12:     B -0.7948444  9 12          NA           NA
13:     B  1.4277555 10 13          NA           NA
14:     B -1.4668197 11 14 -0.53058593  -1.13083206
15:     B -0.2366834 12 15 -0.68321222  -1.13083206
16:     B -0.1933380 13 16 -0.22801430  -0.85175154
17:     B -0.8497547 14 17 -0.72714047  -1.15828722

data:

library(data.table)

set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4

#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM