Rowwise matrix multiplication in R

后端 未结 4 522
别跟我提以往
别跟我提以往 2021-01-18 23:05

I have a matrix with the dimension of 100 million records and 100 columns.

Now I want to compute the product of the values in each row of that matrix.

My sample code for matrix multiplic

相关标签:
4条回答
  • 2021-01-18 23:30

    Try package data.table with Reduce. That might avoid internal copies of a 1e10 length vector.

    library(data.table)
    # Convert to a data.table, keeping the row names as the first column ("rn").
    df <- data.table(df, keep.rownames = TRUE)
    # Multiply the columns together elementwise, one column at a time;
    # .SDcols = -1L leaves the first column (the row names) out of the product.
    df[, rowprods := Reduce(`*`, .SD), .SDcols = -1L]
    # Show each row name next to its product.
    df[, list(rn, rowprods)]
    #                     rn   rowprods
    # 1:           Mazda RX4          0
    # 2:       Mazda RX4 Wag          0
    # 3:          Datsun 710  609055152
    # 4:      Hornet 4 Drive          0
    # 5:   Hornet Sportabout          0
    # 6:             Valiant          0
    # 7:          Duster 360          0
    # 8:           Merc 240D          0
    # 9:            Merc 230          0
    #10:            Merc 280          0
    #11:           Merc 280C          0
    #12:          Merc 450SE          0
    #13:          Merc 450SL          0
    #14:         Merc 450SLC          0
    #15:  Cadillac Fleetwood          0
    #16: Lincoln Continental          0
    #17:   Chrysler Imperial          0
    #18:            Fiat 128  470578906
    #19:         Honda Civic  564655046
    #20:      Toyota Corolla  386281789
    #21:       Toyota Corona          0
    #22:    Dodge Challenger          0
    #23:         AMC Javelin          0
    #24:          Camaro Z28          0
    #25:    Pontiac Firebird          0
    #26:           Fiat X1-9  339825992
    #27:       Porsche 914-2          0
    #28:        Lotus Europa 1259677924
    #29:      Ford Pantera L          0
    #30:        Ferrari Dino          0
    #31:       Maserati Bora          0
    #32:          Volvo 142E 1919442833
    #                     rn   rowprods
    

    However, 8 GB RAM (minus what your OS and other software needs) is not much if you want to work with data of this size. R sometimes needs to make internal copies to use your data.

    0 讨论(0)
  • 2021-01-18 23:35

    The Rfast command "rowprods" accepts a matrix, not a data.frame. Secondly, any row or colprods command will have numerical overflow errors. So it is best to use Rfast::colprods(x, method = "expsumlog").

    0 讨论(0)
  • 2021-01-18 23:36

    If you have a matrix that is too large to fit in memory, you can use package bigstatsr (disclaimer: I'm the author) to use data stored on your disk (instead of the RAM). Using function big_apply enables you to apply standard R functions on data blocks (and to combine them).

    library(bigstatsr)
    fbm <- FBM(10e6, 100)

    # initialize with random numbers, one block of columns at a time
    fill_block <- function(X, ind) {
      print(min(ind))
      X[, ind] <- rnorm(nrow(X) * length(ind))
      NULL
    }
    system.time(
      big_apply(fbm, a.FUN = fill_block, a.combine = 'c')
    ) # 78 sec

    # compute row prods over blocks of rows, possibly in parallel
    row_prods_block <- function(X, ind) {
      print(min(ind))
      matrixStats::rowProds(X[ind, ])
    }
    system.time(
      prods <- big_apply(
        fbm,
        a.FUN = row_prods_block,
        a.combine = 'c',
        ind = rows_along(fbm),
        block.size = 100e3,
        ncores = nb_cores()
      )
    ) # 22 sec with 1 core and 18 sec with 6 cores
    
    0 讨论(0)
  • 2021-01-18 23:40

    Some timings for reference

    library(matrixStats)
    library(inline)
    library(data.table)
    #devtools::install_github("privefl/bigstatsr")
    library(bigstatsr)
    library(RcppArmadillo)
    library(microbenchmark)
    set.seed(20L)
    N <- 1e6
    # In-memory test matrix: N rows by 100 columns of random normal draws.
    dat <- matrix(rnorm(N * 100), ncol = 100)

    # Matrix of the same dimensions for bigstatsr, filled with random numbers
    # one block of columns at a time.
    fbm <- FBM(N, 100)
    fill_block <- function(X, ind) {
        print(min(ind))
        X[, ind] <- rnorm(nrow(X) * length(ind))
        NULL
    }
    big_apply(fbm, a.FUN = fill_block, a.combine = 'c')
    
    # Row products of fbm, computed with bigstatsr over blocks of rows.
    bigstatsrMtd <- function() {
        # Products of the rows indexed by `ind` within one block.
        block_row_prods <- function(X, ind) {
            print(min(ind))
            matrixStats::rowProds(X[ind, ])
        }
        # The per-block results are concatenated and returned invisibly.
        invisible(big_apply(
            fbm,
            a.FUN = block_row_prods,
            a.combine = 'c',
            ind = rows_along(fbm),
            block.size = 100e3,
            ncores = nb_cores()
        ))
    }
    
    df <- data.table(as.data.frame(dat), keep.rownames=TRUE)
    # Row products via data.table: multiply the value columns together
    # elementwise with Reduce and return the row names alongside the products.
    data.tableMtd <- function() {
        # Select the value columns by name rather than with .SDcols = -1:
        # `:=` adds rowprods to df by reference, so on every call after the
        # first, a positional "all but the first column" would also multiply in
        # the previous rowprods, giving wrong results and timing 101 columns.
        value_cols <- setdiff(names(df), c("rn", "rowprods"))
        df[, rowprods := Reduce("*", .SD), .SDcols = value_cols]
        df[, .(rn, rowprods)]
    }
    
    # C++ source for the body of the compiled function: converts the input to
    # an Armadillo matrix, then multiplies the entries of each row together
    # with a nested loop and returns the vector of row products.
    code <- '
      arma::mat prodDat = Rcpp::as<arma::mat>(dat);
      int m = prodDat.n_rows;
      int n = prodDat.n_cols;
      arma::vec res(m);
      for (int row=0; row < m; row++) {
        res(row) = 1.0;
        for (int col=0; col < n; col++) {
          res(row) *= prodDat(row, col);
        }
      }
      return Rcpp::wrap(res);
    '
    # Compile the snippet into an R-callable function with one argument, dat,
    # using the RcppArmadillo plugin.
    rcppProd <- cxxfunction(signature(dat="numeric"), code, plugin="RcppArmadillo")
    
    # Row products of dat from the compiled C++ routine.
    rcppMtd <- function() {
        # Returned invisibly, matching a body that ends in an assignment.
        invisible(rcppProd(dat))
    }
    
    # Row products of dat using base R only.
    baseMtd <- function() {
        # MARGIN = 1 applies prod to each row in turn.
        apply(X = dat, MARGIN = 1, FUN = prod)
    }
    
    # Time each approach five times; the expressions are left unnamed so the
    # result table labels each row with the call itself.
    microbenchmark(
        bigstatsrMtd(), data.tableMtd(), rcppMtd(), baseMtd(),
        times = 5L
    )
    

    Note: Compiling the function in cxxfunction seems to take some time

    Here are the timing results:

    # Unit: milliseconds
    #            expr       min        lq      mean    median        uq       max
    #  bigstatsrMtd() 4519.1861 4993.0879 5296.7000 5126.2282 5504.3981 6340.5995
    # data.tableMtd()  443.1946  444.9686  690.3703  493.2399  513.4787 1556.9695
    #       rcppMtd()  787.9488  799.1575  828.3647  809.0645  871.0347  874.6178
    #       baseMtd() 5658.1424 6208.5123 6232.0040 6331.7431 6458.6806 6502.9417
    
    0 讨论(0)
提交回复
热议问题