滾動回歸多列

Question

我有一個問題是找到一種最有效的方法來計算具有多列的xts對象的滾動線性回歸。 我已經在stackoverflow上搜索並閱讀了之前的幾個問題。

這個問題和答案很接近，但在我看來還不夠，因為我想計算所有回歸中因變量不變的多元回歸。 我試圖用隨機數據重現一個例子：

require(xts)
require(RcppArmadillo)  # Load libraries

data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
data[1000:1500, 2] <- NA  # insert NAs to make it more similar to true data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

創建數組是為了隨時間和每個因子存儲多個變量（殘差，系數等）。

loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  } 
}

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime

由於循環顯示的想法是每次與其他因素之一運行30次觀察滾動回歸，其中data[, 1]作為因變量（因子）。 我必須將30個殘差存儲在臨時對象中，以便將它們標准化為fastLm不計算標准化殘差。

如果xts對象中的列數（因子）增加到100左右，那么循環非常慢並且變得很麻煩 - 1,000列將需要永恆。 我希望有一個更高效的代碼來創建大型數據集的滾動回歸。

Answer 1

如果你深入到線性回歸的數學水平，它應該很快。 如果X是自變量而Y是因變量。 系數由下式給出

Beta = inv(t(X) %*% X) %*% (t(X) %*% Y)

我有點困惑你想要哪個變量是依賴的，哪個是獨立的，但希望解決下面的類似問題對你也有幫助。

在下面的示例中，我采用1000個變量而不是原始的5個，並且不引入任何NA。

require(xts)

data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis

現在我們可以使用Joshua的TTR包計算系數。

library(TTR)

loop.begin.time <- Sys.time()

in.dep.var <- data[,1]
xx <- TTR::runSum(in.dep.var*in.dep.var, obs)
coeffs <- do.call(cbind, lapply(data, function(z) {
    xy <- TTR::runSum(z * in.dep.var, obs)
    xy/xx
}))

loop.end.time <- Sys.time()

print(loop.end.time - loop.begin.time)  # prints the loop runtime

時差3.934461秒

res.array = array(NA, dim=c(NC, NR, obs))
for(z in seq(obs)) {
  res.array[,,z] = coredata(data - lag.xts(coeffs, z-1) * as.numeric(in.dep.var))
}
res.sd <- apply(res.array, c(1,2), function(z) z / sd(z))

如果我在索引中沒有出現任何錯誤， res.sd應該給你標准化的殘差。 請隨時修復此解決方案以糾正任何錯誤。

Answer 2

使用rollRegres包這是一種更快捷的方法

library(xts)
library(RcppArmadillo)

#####
# simulate data
set.seed(50554709)
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
# data[1000:1500, 2] <- NA # only focus on the parts that are computed
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

#####
# setup for solution in OP
NR <- nrow(data)
NC <- ncol(data)
obs <- 30L
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

#####
# solve with rollRegres
library(rollRegres)

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
                  data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbaised estimator so we have to time by this factor
  # to get the same
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.03123808 secs

#####
# solve with original method
loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  }
}
#R Processing residuals for factor: 2
#R Processing residuals for factor: 3
#R Processing residuals for factor: 4
#R Processing residuals for factor: 5

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime
#R Time difference of 7.554767 secs

#####
# check that results are the same
all.equal(info[, "coef", 2L], out[[1]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 2L], out[[1]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 3L], out[[2]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 3L], out[[2]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 4L], out[[3]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 4L], out[[3]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 5L], out[[4]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 5L], out[[4]][, "res"])
#R [1] TRUE

在上述解決方案中注意這個評論

# are you sure you want the residual of the first and not the last
# observation in each window?

這是對Sameer的回答的比較

library(rollRegres)
require(xts)

data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
              data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbaised estimator so we have to time by this factor
  # to get the same
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.9019711 secs

時間包括用於計算標准化殘差的時間。

滾動回歸多列

問題描述

2 個解決方案

解決方案1
10 2012-08-09 14:12:29

解決方案2
0 2018-07-02 17:39:08

滾動回歸多列

問題描述

2 個解決方案

解決方案1 10 2012-08-09 14:12:29

解決方案2 0 2018-07-02 17:39:08

解決方案1
10 2012-08-09 14:12:29

解決方案2
0 2018-07-02 17:39:08