I have a matrix with 100 million rows and 100 columns. Now I want to multiply that matrix row-wise, i.e. compute the product of the entries in each row.
Try package data.table with Reduce. That might avoid internal copies of a 1e10-length vector.
library(data.table)
df <- data.table(mtcars, keep.rownames = TRUE)  # mtcars as a small example
df[, rowprods := Reduce("*", .SD), .SDcols = -1]
df[, .(rn, rowprods)]
# rn rowprods
# 1: Mazda RX4 0
# 2: Mazda RX4 Wag 0
# 3: Datsun 710 609055152
# 4: Hornet 4 Drive 0
# 5: Hornet Sportabout 0
# 6: Valiant 0
# 7: Duster 360 0
# 8: Merc 240D 0
# 9: Merc 230 0
#10: Merc 280 0
#11: Merc 280C 0
#12: Merc 450SE 0
#13: Merc 450SL 0
#14: Merc 450SLC 0
#15: Cadillac Fleetwood 0
#16: Lincoln Continental 0
#17: Chrysler Imperial 0
#18: Fiat 128 470578906
#19: Honda Civic 564655046
#20: Toyota Corolla 386281789
#21: Toyota Corona 0
#22: Dodge Challenger 0
#23: AMC Javelin 0
#24: Camaro Z28 0
#25: Pontiac Firebird 0
#26: Fiat X1-9 339825992
#27: Porsche 914-2 0
#28: Lotus Europa 1259677924
#29: Ford Pantera L 0
#30: Ferrari Dino 0
#31: Maserati Bora 0
#32: Volvo 142E 1919442833
# rn rowprods
However, 8 GB of RAM (minus what your OS and other software need) is not much if you want to work with data of this size, and R sometimes needs to make internal copies of your data.
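For a sense of scale, here is a quick back-of-the-envelope check, assuming a dense numeric matrix of doubles (8 bytes per entry) as in the question; the data alone far exceeds 8 GB:

# memory needed just to hold a dense double matrix of 1e8 x 100
n_rows <- 1e8
n_cols <- 100
n_rows * n_cols * 8 / 2^30   # ~74.5 GiB, before any internal copies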
Note that the Rfast function rowprods() accepts a matrix, not a data.frame. Secondly, any row- or column-products function is prone to numerical overflow, so it is best to use the log-space method, e.g. Rfast::colprods(x, method = "expsumlog").
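The "expsumlog" approach computes each product as the exponential of a sum of logarithms, which keeps intermediate values in a numerically safe range. A minimal base-R sketch of the same idea (the helper name rowprods_expsumlog is just for illustration):

# prod(x) == prod(sign(x)) * exp(sum(log(abs(x))))
# Summing logs avoids overflow as long as the log-sum fits in a double;
# rows containing a zero come out as 0 via sign(0) == 0.
rowprods_expsumlog <- function(x) {
  signs <- apply(sign(x), 1, prod)      # sign of each row product
  signs * exp(rowSums(log(abs(x))))     # magnitude computed in log space
}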
If you have a matrix that is too large to fit in memory, you can use package bigstatsr (disclaimer: I'm the author) to work with data stored on your disk instead of in RAM. Its function big_apply enables you to apply standard R functions on blocks of the data (and to combine the results).
library(bigstatsr)
fbm <- FBM(10e6, 100)  # on-disk matrix: 10 million rows x 100 columns
# initialize with random numbers, one block of columns at a time
system.time(
  big_apply(fbm, a.FUN = function(X, ind) {
    print(min(ind))
    X[, ind] <- rnorm(nrow(X) * length(ind))
    NULL
  }, a.combine = 'c')
) # 78 sec
# compute row products, possibly in parallel
system.time(
  prods <- big_apply(fbm, a.FUN = function(X, ind) {
    print(min(ind))
    matrixStats::rowProds(X[ind, ])
  }, a.combine = 'c', ind = rows_along(fbm),
  block.size = 100e3, ncores = nb_cores())
) # 22 sec with 1 core and 18 sec with 6 cores
Some timings for reference:
library(matrixStats)
library(inline)
library(data.table)
#devtools::install_github("privefl/bigstatsr")
library(bigstatsr)
library(RcppArmadillo)
library(microbenchmark)
set.seed(20L)
N <- 1e6
dat <- matrix(rnorm(N * 100), ncol = 100)

fbm <- FBM(N, 100)
big_apply(fbm, a.FUN = function(X, ind) {
  print(min(ind))
  X[, ind] <- rnorm(nrow(X) * length(ind))
  NULL
}, a.combine = 'c')

bigstatsrMtd <- function() {
  prods <- big_apply(fbm, a.FUN = function(X, ind) {
    print(min(ind))
    matrixStats::rowProds(X[ind, ])
  }, a.combine = 'c', ind = rows_along(fbm),
  block.size = 100e3, ncores = nb_cores())
}
df <- data.table(as.data.frame(dat), keep.rownames = TRUE)
data.tableMtd <- function() {
  df[, rowprods := Reduce("*", .SD), .SDcols = -1]
  df[, .(rn, rowprods)]
}
code <- '
arma::mat prodDat = Rcpp::as<arma::mat>(dat);
int m = prodDat.n_rows;
int n = prodDat.n_cols;
arma::vec res(m);
for (int row = 0; row < m; row++) {
    res(row) = 1.0;
    for (int col = 0; col < n; col++) {
        res(row) *= prodDat(row, col);
    }
}
return Rcpp::wrap(res);
'
rcppProd <- cxxfunction(signature(dat = "numeric"), code, plugin = "RcppArmadillo")
rcppMtd <- function() {
  rcppData <- rcppProd(dat)   # row products via the compiled C++ loop
}
baseMtd <- function() {
  apply(dat, 1, prod)
}
microbenchmark(bigstatsrMtd(),
               data.tableMtd(),
               rcppMtd(),
               baseMtd(),
               times = 5L)
Note: compiling the C++ function with cxxfunction takes some time; that one-off cost happens before the benchmark and is not included in the timings below.
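If you prefer Rcpp attributes over the inline package, the same kernel can be compiled with Rcpp::cppFunction; here is an alternative sketch (the function name rowProdsArma is just for illustration), using Armadillo's element-wise %= operator to fold one column at a time into the result:

library(Rcpp)
cppFunction(depends = "RcppArmadillo", '
arma::vec rowProdsArma(const arma::mat& dat) {
    // start from a vector of ones and multiply each column in element-wise
    arma::vec res(dat.n_rows, arma::fill::ones);
    for (arma::uword col = 0; col < dat.n_cols; col++)
        res %= dat.col(col);
    return res;
}')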
Here are the timing results:
# Unit: milliseconds
# expr min lq mean median uq max
# bigstatsrMtd() 4519.1861 4993.0879 5296.7000 5126.2282 5504.3981 6340.5995
# data.tableMtd() 443.1946 444.9686 690.3703 493.2399 513.4787 1556.9695
# rcppMtd() 787.9488 799.1575 828.3647 809.0645 871.0347 874.6178
# baseMtd() 5658.1424 6208.5123 6232.0040 6331.7431 6458.6806 6502.9417