Rolling regression over multiple columns


Question


I have an issue finding the most efficient way to calculate a rolling linear regression over an xts object with multiple columns. I have searched and read several previous questions here on Stack Overflow.

This question and answer come close, but in my opinion not close enough, since I want to calculate multiple regressions with the dependent variable unchanged in all of them. I have tried to reproduce an example with random data:

require(xts)
require(RcppArmadillo)  # Load libraries

data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
data[1000:1500, 2] <- NA  # insert NAs to make it more similar to true data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

The array is created in order to store multiple variables (residuals, coefficients etc.) over time and per factor.
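
For reference, the layout is time x statistic x factor, so once the loop below has run, a single result can be read off like this (an illustrative sketch):

info[100, "coef", 3]  # slope for factor 3 from the window ending at row 100
info[100, "res", 3]   # standardized residual stored for factor 3 at row 100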

loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  } 
}

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime

As the loop shows, the idea is to run a 30-observation rolling regression with data[, 1] as the dependent variable (factor), each time against one of the other factors. I have to store the 30 residuals in a temporary object in order to standardize them, as fastLm does not calculate standardized residuals.

The loop is extremely slow and becomes cumbersome if the number of columns (factors) in the xts object increases to around 100; 1,000 columns would take an eternity. I hope someone has more efficient code to create rolling regressions over a large data set.


Answer 1:


It should be pretty quick if you go down to the level of the math behind linear regression. If X is the independent variable and Y is the dependent variable, the coefficients are given by

Beta = inv(t(X) %*% X) %*% (t(X) %*% Y)
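
As a quick sanity check on a single made-up window (a sketch with hypothetical data, not part of the rolling setup), the closed form matches lm():

x <- cbind(1, rnorm(30))                             # design matrix with an intercept column
y <- 2 + 3 * x[, 2] + rnorm(30)
beta <- solve(t(x) %*% x) %*% (t(x) %*% y)           # inv(t(X) %*% X) %*% (t(X) %*% Y)
all.equal(drop(beta), unname(coef(lm(y ~ x[, 2]))))  # TRUE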

I'm a little confused about which variable you want to be the dependent one and which the independent one, but hopefully solving a similar problem below will help you as well.

In the example below I use 1,000 variables instead of the original 5 and do not introduce any NAs.

require(xts)

data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis

Now we can calculate the coefficients using Joshua Ulrich's TTR package.

library(TTR)

loop.begin.time <- Sys.time()

in.dep.var <- data[,1]
xx <- TTR::runSum(in.dep.var*in.dep.var, obs)
coeffs <- do.call(cbind, lapply(data, function(z) {
    xy <- TTR::runSum(z * in.dep.var, obs)
    xy/xx
}))

loop.end.time <- Sys.time()

print(loop.end.time - loop.begin.time)  # prints the loop runtime

Time difference of 3.934461 secs
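
Note that runSum(x*y, obs) / runSum(x*x, obs) is the rolling slope of a regression through the origin (there is no intercept term). A quick check against lm() for one window, assuming the objects created above (t0 is an arbitrary row index chosen for illustration):

t0 <- 100                    # any row index >= obs
w  <- (t0 - obs + 1):t0      # the window ending at row t0
all.equal(as.numeric(coeffs[t0, 2]),
          unname(coef(lm(as.numeric(data[w, 2]) ~ as.numeric(in.dep.var[w]) - 1))))  # TRUE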

res.array <- array(NA, dim = c(NR, NC, obs))  # time x factor x lag, matching the NR x NC matrix assigned below
for(z in seq(obs)) {
  res.array[, , z] <- coredata(data - lag.xts(coeffs, z - 1) * as.numeric(in.dep.var))
}
res.sd <- apply(res.array, c(1, 2), function(z) z / sd(z))

If I haven't made any errors in the indexing, res.sd should give you the standardized residuals. Please feel free to fix this solution to correct any bugs.
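
Since the function passed to apply() returns a length-obs vector, apply() puts that dimension first: res.sd has dimensions obs x NR x NC rather than NR x NC x obs. For example:

dim(res.sd)       # obs x NR x NC
res.sd[, 200, 2]  # the obs scaled entries for row 200 of data, factor (column) 2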




Answer 2:


Here is a much faster way to do it with the rollRegres package:

library(xts)
library(RcppArmadillo)

#####
# simulate data
set.seed(50554709)
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
# data[1000:1500, 2] <- NA # only focus on the parts that are computed
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

#####
# setup for solution in OP
NR <- nrow(data)
NC <- ncol(data)
obs <- 30L
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

#####
# solve with rollRegres
library(rollRegres)

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
                  data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbiased estimator, so we have to multiply by this
  # factor to get the same result
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.03123808 secs

#####
# solve with original method
loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  }
}
#R Processing residuals for factor: 2
#R Processing residuals for factor: 3
#R Processing residuals for factor: 4
#R Processing residuals for factor: 5

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime
#R Time difference of 7.554767 secs

#####
# check that results are the same
all.equal(info[, "coef", 2L], out[[1]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 2L], out[[1]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 3L], out[[2]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 3L], out[[2]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 4L], out[[3]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 4L], out[[3]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 5L], out[[4]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 5L], out[[4]][, "res"])
#R [1] TRUE

Do notice this comment inside the above solution:

# are you sure you want the residual of the first and not the last
# observation in each window?
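
If the residual of the last observation in each window is wanted instead, the resids line inside the lapply call could be changed as sketched below (reusing fit, idx_tail, X and j from the answer's code):

resids <- c(rep(NA_real_, obs - 1L),
            data[idx_tail, j] - rowSums(fit$coefs[idx_tail, ] * X[idx_tail, ]))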

Here is a comparison to Sameer's answer:

library(rollRegres)
require(xts)

data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
              data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbiased estimator, so we have to multiply by this
  # factor to get the same result
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.9019711 secs

The time includes the time used to compute the standardized residuals.
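
If an array laid out like info is preferred over a list, the matrices returned by lapply() can be stacked, for example:

out.array <- simplify2array(out)  # NR x 2 x (NC - 1): time x c("coef", "res") x factor
out.array[100, , 1]               # coefficient and standardized residual for factor 2 at row 100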



Source: https://stackoverflow.com/questions/11873123/rolling-regression-over-multiple-columns
