Sum of subvectors of a vector in R

后端 未结 6 705
悲哀的现实
悲哀的现实 2021-01-04 19:01

Given a vector x of length k, I would like to obtain a k by k matrix X where X[i,j] is the sum of x[i] + ... + x[j]. The

相关标签:
6条回答
  • 2021-01-04 19:11

    We can use outer():

    mySum <- function(i,j) sum(x[i:j])
    outer(1:10, 1:10, Vectorize(mySum))
    

    EDIT: You could also go for a solution by foreach:

    library(foreach)
    mySum <- function(j) sum(x[i:j])
    mySum <- Vectorize(mySum)
    foreach(i = 1:10, .combine = 'rbind') %do% mySum(1:10)
    

    and maybe run it in parallel instead.

    0 讨论(0)
  • 2021-01-04 19:19

    Here's another approach which seems to be significantly faster than OP's for loop (by factor ~30) and faster than the other answers currently present (by factor >=18):

    n <- 5
    x <- 1:5
    z <- lapply(1:n, function(i) cumsum(x[i:n]))
    m <- mapply(function(y, l) c(rep(NA, n-l), y), z, lengths(z))
    m[upper.tri(m)] <- t(m)[upper.tri(m)]
    m
    
    #     [,1] [,2] [,3] [,4] [,5]
    #[1,]    1    3    6   10   15
    #[2,]    3    2    5    9   14
    #[3,]    6    5    3    7   12
    #[4,]   10    9    7    4    9
    #[5,]   15   14   12    9    5
    

    Benchmarks (scroll down for results)

    library(microbenchmark)
    n <- 100
    x <- 1:n
    
    f1 <- function() {
      X <- matrix(0,n,n)
      for(i in 1:n) {
        for(j in 1:n) {
          X[i,j] <- sum(x[i:j])
        }
      }
      X
    }
    
    f2 <- function() {
      mySum <- function(i,j) sum(x[i:j])
      outer(1:n, 1:n, Vectorize(mySum))
    }
    
    f3 <- function() {
      matrix(apply(expand.grid(1:n, 1:n), 1, function(y) sum(x[y[2]:y[1]])), n, n)
    }
    
    f4 <- function() {
      z <- lapply(1:n, function(i) cumsum(x[i:n]))
      m <- mapply(function(y, l) c(rep(NA, n-l), y), z, lengths(z))
      m[upper.tri(m)] <- t(m)[upper.tri(m)]
      m
    }
    
    f5 <- function() {
      X <- diag(x)
      for(i in 1:(n-1)) {
        for(j in 1:(n-i)){
          X[j+i,j] <- X[j,j+i] <- X[j+i,j+i] + X[j+i-1,j]
        }  
      }
      X
    }
    
    microbenchmark(f1(), f2(), f3(), f4(), f5(), times = 25L, unit = "relative")
    #Unit: relative
    # expr      min       lq     mean   median       uq      max neval
    # f1() 29.90113 29.01193 30.82411 31.15412 32.51668 35.93552    25
    # f2() 29.46394 30.93101 31.79682 31.88397 34.52489 28.74846    25
    # f3() 56.05807 53.82641 53.63785 55.36704 55.62439 45.94875    25
    # f4()  1.00000  1.00000  1.00000  1.00000  1.00000  1.00000    25
    # f5() 16.30136 17.46371 18.86259 17.87850 21.19914 23.68106    25
    
    all.equal(f1(), f2())
    #[1] TRUE
    all.equal(f1(), f3())
    #[1] TRUE
    all.equal(f1(), f4())
    #[1] TRUE
    all.equal(f1(), f5())
    #[1] TRUE
    

    Updated with the edited function by Neal Fultz.

    0 讨论(0)
  • 2021-01-04 19:22

    You don't need to repeatedly recalculate the sums in the inner loop, instead, you can build up the matrix by subdiagonal using the fact that a cell equals the cell above it plus the cell on the diagonal to the right. This should reduce the order of the algorithm from O(n^3) to O(n^2).

    Here's a quick and dirty implementation:

    X <- diag(x)
    
    for(i in 1:9) {
        for(j in 1:(10-i)){
            X[j+i,j] <- X[j,j+i] <- X[j+i,j+i] + X[j+i-1,j]
        }  
    }
    

    EDIT:

    As others have pointed out, you can get a little more speed and simplicity by using cumsum and vectorizing the inner loop:

    n <- length(x)
    X <- diag(x)
    for(i in 1:n) {
        X[i:n,i] <- X[i,i:n] <- cumsum(x[i:n])
    }
    
    0 讨论(0)
  • Here is an Rcpp function that is almost a literal translation of your code:

    set.seed(1)
    x <- rnorm(10)
    
    X <- matrix(0,10,10)
    for(i in 1:10) 
      for(j in 1:10)
        X[i,j] <- sum(x[i:j])
    
    library(inline)
    library(Rcpp)
    
    cppFunction(
      'NumericMatrix allSums(NumericVector x) {
            int n = x.length();
            NumericMatrix X(n, n);
            for (int i = 0; i < n; ++i) {
              for (int j = 0; j < n; ++j) {
                 for (int k = i; k <= j; ++k) {
                   X(i,j) += x(k);
                 }
                X(j,i) = X(i,j);
              }
            }
            return X;
        }')
    
    Y <- allSums(x)
    all.equal(X, Y)
    #[1] TRUE
    

    But of course, the algorithm can be improved:

    cppFunction(
      'NumericMatrix allSums2(NumericVector x) {
            int n = x.length();
            NumericMatrix X(n, n);
            X(0,0) = x(0);
            for (int j = 0; j < n; ++j) {
              if (j > 0) {
                X(0,j) = X(0, j-1) + x(j);
                X(j,0) = X(0,j);
              }
              for (int i = 1; i < n; ++i) {
                X(i,j) = X(i-1,j) - x(i-1); 
                X(j,i) = X(i,j);
                }
              }
            return X;
        }')
    
    Z <- allSums2(x)
    all.equal(X, Z)
    #[1] TRUE
    

    Some benchmarks:

    library(microbenchmark)
    n <- 100
    x <- 1:n
    
    f4 <- function(x, n) {
      z <- lapply(1:n, function(i) cumsum(x[i:n]))
      m <- mapply(function(y, l) c(rep(NA, n-l), y), z, lengths(z))
      m[upper.tri(m)] <- t(m)[upper.tri(m)]
      m
    }
    
    
    microbenchmark(f4(x, n), allSums(x), allSums2(x), times = 25)#
    #Unit: microseconds
    #       expr      min       lq      mean   median       uq      max neval cld
    #   f4(x, n)  933.441  938.061 1121.0901  975.633 1045.232 2635.561    25  b 
    # allSums(x) 1385.533 1391.693 1466.4784 1395.080 1408.630 2996.803    25   c
    #allSums2(x)  127.499  129.038  198.8475  133.965  139.201 1737.844    25 a  
    
    0 讨论(0)
  • 2021-01-04 19:27

    In addition to the excellent answers already provided, here is a super fast base R solution:

    subVecSum <- function(v, s) {
        t <- c(0L, cumsum(v))
        n1 <- s+1L
        m <- matrix(0L,s,s)
        for (i in 4L:n1) {
            m[i-2L,1L:(i-3L)] <- t[i-1L]-t[1L:(i-3L)]
            m[i-2L,i-2L] <- v[i-2L]
            m[i-2L,(i-1L):s] <- t[i:n1]-t[i-2L]
        }
        m[1L,] <- t[-1L]; m[s,] <- t[n1]-t[1L:s]
        m
    }
    

    In fact, according to the benchmarks below, it is the fastest base R solution (@Roland's Rcpp solution is still the fastest). It also gets faster, relatively speaking, as the size of the vector increases (I only compared f4 (provided by @docendo) as it is the fastest base R solution thus far and @Roland's Rcpp implementation. You will note that I'm using the modified f4 function as defined by @Roland).

    ## We first compile the functions.. no need to compile the Rcpp
    ## function as it is already done by calling cppFunction
    c.f4 <- compiler::cmpfun(f4)
    c.subVS1 <- compiler::cmpfun(subVecSum)
    
    n <- 100
    x <- 1:n
    microbenchmark(c.f4(x,n), c.subVS1(x,n), allSums2(x), times = 1000, unit = "relative")
    Unit: relative
              expr       min        lq     mean    median        uq       max neval cld
        c.f4(x, n) 11.355013 11.262663 9.231756 11.545315 12.074004 1.0819186  1000   c
    c.subVS1(x, n)  7.795879  7.592643 5.414135  7.624209  8.080471 0.8490876  1000  b 
       allSums2(x)  1.000000  1.000000 1.000000  1.000000  1.000000 1.0000000  1000 a  
    
    n <- 500
    x <- 1:n
    microbenchmark(c.f4(x,n), c.subVS1(x,n), allSums2(x), times = 500, unit = "relative")
    Unit: relative
              expr      min       lq     mean   median       uq       max neval cld
        c.f4(x, n) 6.231426 6.585118 6.442567 6.438163 6.882862 10.124428   500   c
    c.subVS1(x, n) 3.548766 3.271089 3.137887 2.881520 3.604536  8.854241   500  b 
       allSums2(x) 1.000000 1.000000 1.000000 1.000000 1.000000  1.000000   500 a  
    
    n <- 1000
    x <- 1:n
    microbenchmark(c.f4(x,n), c.subVS1(x,n), allSums2(x), times = 100, unit = "relative")
    Unit: relative
              expr      min        lq      mean    median        uq      max neval cld
        c.f4(x, n) 7.779537 16.352334 11.489506 15.529351 14.447210 3.639483   100   c
    c.subVS1(x, n) 2.637996  2.951763  2.937385  2.726569  2.692099 1.211545   100  b 
       allSums2(x) 1.000000  1.000000  1.000000  1.000000  1.000000 1.000000   100 a  
    
    identical(c.f4(x,n), c.subVS1(x,n), as.integer(allSums2(x)))  ## gives the same results
    [1] TRUE
    

    This algorithm takes advantage of only calculating cumsum(v) one time and utilizing indexing from there. For really large vectors, the efficiency is comparable to the Rcpp solution provided by @Roland. Observe:

    n <- 5000
    x <- 1:n
    microbenchmark(c.subVS1(x,n), allSums2(x), times = 10, unit = "relative")
    Unit: relative
              expr      min       lq     mean   median       uq      max neval cld
    c.subVS1(x, n) 1.900718 1.865304 1.854165 1.865396 1.769996 1.837354    10   b
       allSums2(x) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10  a 
    
    
    n <- 10000
    x <- 1:n
    microbenchmark(c.subVS1(x,n), allSums2(x), times = 10, unit = "relative")
    Unit: relative
              expr      min      lq     mean   median       uq     max neval cld
    c.subVS1(x, n) 1.503538 1.53851 1.493883 1.526843 1.496783 1.29196    10   b
       allSums2(x) 1.000000 1.00000 1.000000 1.000000 1.000000 1.00000    10  a 
    

    Not bad, for base R, however Rcpp stills rules the day!!!

    0 讨论(0)
  • 2021-01-04 19:33

    You can also try this:

    x <- 1:10
    
    matrix(apply(expand.grid(1:10, 1:10), 1, function(y) sum(x[y[2]:y[1]])), 10, 10)
          [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
     [1,]    1    3    6   10   15   21   28   36   45    55
     [2,]    3    2    5    9   14   20   27   35   44    54
     [3,]    6    5    3    7   12   18   25   33   42    52
     [4,]   10    9    7    4    9   15   22   30   39    49
     [5,]   15   14   12    9    5   11   18   26   35    45
     [6,]   21   20   18   15   11    6   13   21   30    40
     [7,]   28   27   25   22   18   13    7   15   24    34
     [8,]   36   35   33   30   26   21   15    8   17    27
     [9,]   45   44   42   39   35   30   24   17    9    19
    [10,]   55   54   52   49   45   40   34   27   19    10
    
    0 讨论(0)
提交回复
热议问题