I have a large matrix:
# Fix the RNG seed so the example matrix is reproducible.
set.seed(1)
# 9e7 uniform random numbers arranged in 300 columns, i.e. 300,000 rows.
a <- matrix(runif(9e+07),ncol=300)
I want to sort each row in the matrix:
> system.time(sorted <- t(apply(a,1,sort)))
Another excellent method that uses no external packages comes from Martin Morgan's answer to "Fastest way to select i-th highest value from row and assign to new column":
# order(row(a), a) lists the element indices row by row, each row's values in
# increasing order. The indexed vector is therefore laid out row after row,
# so the matrix must be filled with byrow = TRUE; the default column-wise fill
# would scatter each sorted row across the columns.
matrix(a[order(row(a), a)], ncol = ncol(a), byrow = TRUE)
There is also an equivalent for sorting by columns under comments in the same link.
Timing code using same data as Craig:
# Fix the RNG seed so the benchmark matrix is reproducible.
set.seed(1)
# 9e7 uniform random numbers arranged in 300 columns, i.e. 300,000 rows.
a <- matrix(runif(9e7),ncol=300)
#' Sort every row of a matrix with an explicit loop.
#'
#' @param x Matrix whose rows are sorted. Defaults to the global `a`, so
#'   existing zero-argument calls `use_for()` behave exactly as before.
#' @return A matrix with the same dimensions as `x` in which each row holds
#'   that row's values in increasing order.
#' @details Rows containing `NA` are not supported: `sort.int()` drops `NA`
#'   by default, so the sorted row would no longer fit the row it replaces.
use_for <- function(x = a) {
  # Start from a copy so the result keeps the input's dimensions.
  sorted3 <- x
  # seq_len() makes the loop a no-op for a zero-row matrix.
  for (i in seq_len(nrow(x))) {
    sorted3[i, ] <- sort.int(x[i, ], method = "quick")
  }
  sorted3
}
# Time six row-sorting strategies on the same matrix `a`, three runs each.
microbenchmark::microbenchmark(times = 3L,
  t(apply(a, 1, sort)),
  t(apply(a, 1, sort.int, method = "quick")),
  use_for(),
  Rfast::rowSort(a),
  t(apply(a, 1, grr::sort2)),
  # byrow = TRUE is required for a correct result: the indexed vector holds
  # row 1's sorted values, then row 2's, and so on, while matrix() fills
  # column-wise by default. The timing table recorded after this call was
  # produced without it (see the expression in its last row).
  matrix(a[order(row(a), a)], ncol = ncol(a), byrow = TRUE)
)
Timings:
Unit: seconds
expr min lq mean median uq max neval
t(apply(a, 1, sort)) 37.875665 40.143190 41.098627 42.410715 42.710108 43.009502 3
t(apply(a, 1, sort.int, method = "quick")) 26.406063 27.146861 27.714226 27.887659 28.368307 28.848955 3
use_for() 20.038295 20.140692 20.504223 20.243088 20.737187 21.231285 3
Rfast::rowSort(a) 6.105679 6.460003 6.836455 6.814326 7.201844 7.589361 3
t(apply(a, 1, grr::sort2)) 11.912422 13.035231 13.667377 14.158040 14.544854 14.931669 3
matrix(a[order(row(a), a)], ncol = ncol(a)) 10.307094 10.789946 11.294119 11.272797 11.787632 12.302466 3
And to present a more complete picture, another test for character class (excluding Rfast::rowSort
as it cannot handle character class):
set.seed(1)
# Character matrix: 9e6 letters sampled with replacement, 300 columns
# (30,000 rows).
a <- matrix(sample(letters, 9e6, TRUE), ncol = 300)
# Single run per expression; use_for() picks up the character matrix `a`.
microbenchmark::microbenchmark(times = 1L,
  t(apply(a, 1, sort)),
  t(apply(a, 1, sort.int, method = "quick")),
  use_for(),
  #Rfast::rowSort(a),
  t(apply(a, 1, grr::sort2)),
  # The radix method is requested explicitly here. byrow = TRUE is required
  # for a correct result: the indexed vector is laid out row after row, while
  # matrix() fills column-wise by default. The timing table recorded after
  # this call was produced without it (see the expression in its last row).
  matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a), byrow = TRUE)
)
Timings:
Unit: milliseconds
expr min lq mean median uq max neval
t(apply(a, 1, sort)) 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 1
t(apply(a, 1, sort.int, method = "quick")) 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 1
use_for() 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 1
t(apply(a, 1, grr::sort2)) 2539.1711 2539.1711 2539.1711 2539.1711 2539.1711 2539.1711 1
matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a)) 480.7405 480.7405 480.7405 480.7405 480.7405 480.7405 1
Head to head:
set.seed(1)
# Larger character matrix: 9e7 letters sampled with replacement, 300 columns
# (300,000 rows).
a <- matrix(sample(letters, 9e7, TRUE), ncol = 300)
# Compare only the two fastest character-capable approaches, one run each.
microbenchmark::microbenchmark(times = 1L,
  t(apply(a, 1, grr::sort2)),
  # byrow = TRUE is required for a correct result: the indexed vector is laid
  # out row after row, while matrix() fills column-wise by default. The timing
  # table recorded after this call was produced without it.
  matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a), byrow = TRUE)
)
Timings:
Unit: seconds
expr min lq mean median uq max neval
t(apply(a, 1, grr::sort2)) 29.098726 29.098726 29.098726 29.098726 29.098726 29.098726 1
matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a)) 7.067744 7.067744 7.067744 7.067744 7.067744 7.067744 1
Well, I'm not aware of that many ways to sort faster in R, and the problem is that you're only sorting 300 values, but many times. Still, you can eke some extra performance out of sort by directly calling sort.int and using method='quick':
# Reproducible test data: 9e7 uniform random numbers in 300 columns
# (300,000 rows).
set.seed(1)
a <- matrix(runif(9e+07),ncol=300)
# Your original code. apply() over rows returns each sorted row as a column,
# hence the t(). Trailing comments are the elapsed times the author observed.
system.time(sorted <- t(apply(a,1,sort))) # 31 secs
# sort.int with method='quick': calls the internal sort directly and selects
# the quick-sort method.
system.time(sorted2 <- t(apply(a,1,sort.int, method='quick'))) # 27 secs
# using a for-loop is slightly faster than apply (and avoids transpose):
# the result is written row by row into a copy of `a`.
system.time({sorted3 <- a; for(i in seq_len(nrow(a))) sorted3[i,] <- sort.int(a[i,], method='quick') }) # 26 secs
But a better way should be to use the parallel package to sort parts of the matrix in parallel. However, the overhead of transferring data seems to be too big, and on my machine it starts swapping since I "only" have 8 GB memory:
library(parallel)
# Start a cluster of four worker processes.
cl <- makeCluster(4)
# Sort the rows on the workers. The cluster is stopped in `finally` so the
# worker processes are shut down even if the call fails or is interrupted;
# otherwise they would linger and keep their memory.
tryCatch(
  system.time(sorted4 <- t(parApply(cl, a, 1, sort.int, method = "quick"))), # Forever...
  finally = stopCluster(cl)
)
The package grr contains an alternate sort method that can be used to speed up this particular operation (I have reduced the matrix size somewhat so that this benchmark doesn't take forever):
> set.seed(1)
> a <- matrix(runif(9e+06),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+ ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+ ,sorted3 <- t(apply(a,1,grr::sort2)),times=3,unit='s')
Unit: seconds
expr min lq mean median uq max neval
sorted <- t(apply(a, 1, sort)) 1.7699799 1.865829 1.961853 1.961678 2.057790 2.153902 3
sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 1.6162934 1.619922 1.694914 1.623551 1.734224 1.844898 3
sorted3 <- t(apply(a, 1, grr::sort2)) 0.9316073 1.003978 1.050569 1.076348 1.110049 1.143750 3
The difference becomes dramatic when the matrix contains characters:
> set.seed(1)
> a <- matrix(sample(letters,size = 9e6,replace = TRUE),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+ ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+ ,sorted3 <- t(apply(a,1,grr::sort2)),times=3)
Unit: seconds
expr min lq mean median uq max neval
sorted <- t(apply(a, 1, sort)) 15.436045 15.479742 15.552009 15.523440 15.609991 15.69654 3
sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 15.099618 15.340577 15.447823 15.581536 15.621925 15.66231 3
sorted3 <- t(apply(a, 1, grr::sort2)) 1.728663 1.733756 1.780737 1.738848 1.806774 1.87470 3
Results are identical for all three.
> # identical() compares only its first two arguments (a third positional
> # argument is taken as `num.eq`), so compare the results pairwise.
> identical(sorted, sorted2) && identical(sorted2, sorted3)
[1] TRUE