Recursive regression in R


Say I have a data frame in R as follows:

> set.seed(1)
> X <- runif(50, 0, 1)
> Y <- runif(50, 0, 1)
> df <- data.frame(X,Y)
> head(df)

(output of head(df) omitted)

How can I regress Y on X recursively, i.e. fit the model on an initial window of observations and then re-fit it each time one more observation is added, keeping the coefficients and standard errors from every fit?
3 Answers
  • 2021-02-10 05:54

    The solution Greg proposes with biglm is faster than the one LyzandeR suggests with lm, but it is still quite slow. There is a lot of overhead that can be avoided with the function I show below. I figure you could make it considerably faster still by doing it all in C++ with Rcpp.

    # simulate data
    set.seed(101)
    n <- 1000
    X <- matrix(rnorm(10 * n), n, 10)
    y <- drop(10 + X %*% runif(10)) + rnorm(n)
    dat <- data.frame(y = y, X)
    
    # assign wrapper for biglm
    biglm_wrapper <- function(formula, data, min_window_size){
      mf <- model.frame(formula, data)
      X <- model.matrix(terms(mf), mf)
      y <- model.response(mf)
    
      n <- nrow(X)
      p <- ncol(X)
      storage.mode(X) <- "double"
      storage.mode(y) <- "double"
      w <- 1
      qr <- list(
        d = numeric(p), rbar = numeric(choose(p, 2)), 
        thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
      nrbar = length(qr$rbar)
      beta. <- numeric(p)
    
      out <- NULL
      for(i in 1:n){
        row <- X[i, ] # will be overwritten
        qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
          "INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i], 
          d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr, ier = i - 0L, 
          PACKAGE = "biglm")[
            c("d", "rbar", "thetab", "sserr")]
    
        if(i >= min_window_size){
          tmp <- .Fortran(
            "REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
            thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = i,
            PACKAGE = "biglm")
          Coef <- tmp$beta
    
          # compute vcov. See biglm/R/vcov.biglm.R
          R <- diag(p)
          R[row(R) > col(R)] <- qr$rbar
          R <- t(R)
          R <- sqrt(qr$d) * R
          ok <- qr$d != 0
          R[ok, ok] <- chol2inv(R[ok, ok, drop = FALSE])
          R[!ok, ] <- NA
          R[ ,!ok] <- NA
          out <- c(out, list(cbind(
            coef = Coef, 
            SE   = sqrt(diag(R * qr$sserr / (i - p + sum(!ok)))))))
        }
      }
    
      out
    }
    
    # assign function to compare with 
    library(biglm)
    f2 <- function(formula, data, min_window_size){
      fit1 <- biglm(formula, data = data[1:min_window_size, ])
      data.split <- 
        split(data, c(rep(NA,min_window_size),1:(nrow(data) - min_window_size)))
      out4 <- Reduce(update, data.split, init=fit1, accumulate=TRUE)
      lapply(out4, function(x) summary(x)$mat[, c("Coef", "SE")])
    }
    
    # show that the two give the same result
    frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
    r1 <- biglm_wrapper(frm, dat, 25)
    r2 <- f2(frm, dat, 25)
    all.equal(r1, r2, check.attributes = FALSE)
    #R> [1] TRUE
    
    # show run time
    microbenchmark::microbenchmark(
      r1 = biglm_wrapper(frm, dat, 25), 
      r2 = f2(frm, dat, 25), 
      r3 = lapply(
        25:nrow(dat), function(x) lm(frm, data = dat[1:x , ])),
      times = 5)
    #R> Unit: milliseconds
    #R>  expr        min         lq       mean    median         uq        max neval cld
    #R>    r1   43.72469   44.33467   44.79847   44.9964   45.33536   45.60124     5 a  
    #R>    r2 1101.51558 1161.72464 1204.68884 1169.4580 1246.74321 1344.00278     5  b 
    #R>    r3 2080.52513 2232.35939 2231.02060 2253.1420 2260.74737 2328.32908     5   c
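
    In case it is useful, here is a small usage sketch of how one might pull a single coefficient's path out of the list that biglm_wrapper returns. It assumes the rows of each returned matrix follow the model-matrix column order (intercept first, then X1, ..., X10), since the wrapper does not attach row names.

    # trace the X1 coefficient and its standard error as the window grows;
    # row 2 of each matrix is X1 (row 1 is the intercept)
    x1_path <- t(sapply(r1, function(m) m[2, ]))
    head(x1_path)
    matplot(x1_path, type = "l", lty = 1, ylab = "X1: coefficient and SE")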
    
  • 2021-02-10 06:09

    You may be interested in the biglm function in the biglm package. This allows you to fit a regression on a subset of the data, then update the regression model with additional data. The original idea was to use this for large datasets so that you only need part of the data in memory at any given time, but it fits the description of what you want to do perfectly (you can wrap the updating process in a loop). The summary for biglm objects gives confidence intervals in addition to standard errors (and coefficients of course).

    library(biglm)
    
    fit1 <- biglm( Sepal.Width ~ Sepal.Length + Species, data=iris[1:20,])
    summary(fit1)
    
    out <- list()
    out[[1]] <- fit1
    
    for(i in 1:130) {
      out[[i+1]] <- update(out[[i]], iris[i+20,])
    }
    
    out2 <- lapply(out, function(x) summary(x)$mat)
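    # columns 2:3 of summary()$mat hold the lower/upper bounds of the
    # approximate 95% CI; row 2 is the Sepal.Length coefficient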
    out3 <- sapply(out2, function(x) x[2,2:3])
    matplot(t(out3), type='l')
    

    If you don't want to use an explicit loop, then the Reduce function can help:

    fit1 <- biglm( Sepal.Width ~ Sepal.Length + Species, data=iris[1:20,])
    iris.split <- split(iris, c(rep(NA,20),1:130))
    out4 <- Reduce(update, iris.split, init=fit1, accumulate=TRUE)
    out5 <- lapply(out4, function(x) summary(x)$mat)
    out6 <- sapply(out5, function(x) x[2,2:3])
    
    all.equal(out3,out6)
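
    As a small sketch along the same lines, you can also pull the coefficient and standard-error path for Sepal.Length out of the accumulated fits, using the same "Coef" and "SE" columns of summary()$mat that appear in the other answer's code (row 2 is again the Sepal.Length coefficient):

    # coefficient and standard error of Sepal.Length after each update
    coef_se <- t(sapply(out4, function(x) summary(x)$mat[2, c("Coef", "SE")]))
    head(coef_se)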
    
  • 2021-02-10 06:17

    Maybe this will be of help:

    set.seed(1)
    X1 <- runif(50, 0, 1)
    X2 <- runif(50, 0, 10) # I included another variable just for a better demonstration
    Y <- runif(50, 0, 1)
    df <- data.frame(X1,X2,Y)
    
    
    rolling_lms <- lapply(seq(20, nrow(df)), function(x) lm(Y ~ X1 + X2, data = df[1:x, ]))
    

    Using the above lapply call, you get the recursive (expanding-window) regressions you want, with full information for each fit.

    For example, here is the first regression, fitted on the first 20 observations:

    > summary(rolling_lms[[1]])
    
    Call:
    lm(formula = Y ~ X1 + X2, data = df[1:x, ])
    
    Residuals:
         Min       1Q   Median       3Q      Max 
    -0.45975 -0.19158 -0.05259  0.13609  0.67775 
    
    Coefficients:
                Estimate Std. Error t value Pr(>|t|)   
    (Intercept)  0.61082    0.17803   3.431  0.00319 **
    X1          -0.37834    0.23151  -1.634  0.12060   
    X2           0.01949    0.02541   0.767  0.45343   
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    Residual standard error: 0.2876 on 17 degrees of freedom
    Multiple R-squared:  0.1527,    Adjusted R-squared:  0.05297 
    F-statistic: 1.531 on 2 and 17 DF,  p-value: 0.2446
    

    And it has all the info you need.

    > length(rolling_lms)
    [1] 31
    

    It performed 31 linear regressions, starting with the first 20 observations and expanding one observation at a time until all 50 were used. Each regression, with all of its information, is stored as an element of the rolling_lms list.

    EDIT

    As per Carl's comment below, to get a vector of the slopes from every regression (for the X1 variable in this case), this is a very good technique (as Carl suggested):

    all_slopes <- unlist(sapply(1:31, function(j) rolling_lms[[j]]$coefficients[2]))
    

    Output:

    > all_slopes
             X1          X1          X1          X1          X1          X1          X1          X1          X1          X1 
    -0.37833614 -0.23231852 -0.20465589 -0.20458938 -0.11796060 -0.14621369 -0.13861210 -0.11906724 -0.10149900 -0.14045509 
             X1          X1          X1          X1          X1          X1          X1          X1          X1          X1 
    -0.14331323 -0.14450837 -0.16214836 -0.15715630 -0.17388457 -0.11427933 -0.10624746 -0.09767893 -0.10111773 -0.06415914 
             X1          X1          X1          X1          X1          X1          X1          X1          X1          X1 
    -0.06432559 -0.04492075 -0.04122131 -0.06138768 -0.06287532 -0.06305953 -0.06491377 -0.01389334 -0.01703270 -0.03683358 
             X1 
    -0.02039574 
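
    If you also need the standard errors or confidence intervals from every fit, the same pattern works with summary() and confint() (a small sketch along the same lines):

    all_se <- sapply(rolling_lms, function(m) summary(m)$coefficients["X1", "Std. Error"])
    all_ci <- t(sapply(rolling_lms, function(m) confint(m)["X1", ]))
    head(cbind(SE = all_se, all_ci))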
    