Fastest way to sort each row of a large matrix in R

后端 未结 3 1160
北恋
北恋 2020-12-10 15:50

I have a large matrix:

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

I want to sort each row in the matrix:

> system.time(sorted <- t(apply(a,1,sort)))


        
相关标签:
3条回答
  • 2020-12-10 16:10

    Another excellent method, from Martin Morgan's answer to "Fastest way to select i-th highest value from row and assign to new column", uses no external packages:

    # order(row(a), a) ranks every element by row index first and by value second,
    # so the permuted vector holds row 1's sorted values, then row 2's, and so on.
    # byrow=TRUE puts each run of ncol(a) values back into its own row; with the
    # default column-wise fill the sorted values would be scattered across rows.
    matrix(a[order(row(a), a)], ncol=ncol(a), byrow=TRUE)
    

    There is also an equivalent for sorting by columns under comments in the same link.

    Timing code using same data as Craig:

    # Fixed seed so every benchmark run sees the same data.
    set.seed(1)
    # 9e7 uniform random numbers arranged in 300 columns, i.e. 300,000 rows of 300 values.
    a <- matrix(runif(9e7),ncol=300)
    
    # Sort every row of the global matrix `a` with an explicit loop.
    # Takes no arguments; returns a matrix shaped like `a` in which each row
    # holds that row's values as ordered by sort.int(method = "quick").
    use_for <- function() {
        # Start from a copy of `a` so the result keeps its dimensions.
        result <- a
        row_ids <- seq_len(nrow(a))
        for (r in row_ids) {
            result[r, ] <- sort.int(a[r, ], method = "quick")
        }
        result
    }
    
    # Time six ways of sorting every row of `a`; each expression is evaluated 3 times.
    microbenchmark::microbenchmark(times=3L,
        # apply() over rows with sort(); apply() returns the sorted rows as
        # columns, hence the t().
        t(apply(a,1,sort)),
        # Same, but calling sort.int() directly with method='quick'.
        t(apply(a,1,sort.int, method='quick')),
        # Explicit row loop defined above (no transpose needed).
        use_for(),
        # Row sort from the Rfast package.
        Rfast::rowSort(a),
        # apply() over rows with the sort2() routine from the grr package.
        t(apply(a,1,grr::sort2)),
        # Base-R one-liner: order all elements by row index, then by value.
        # NOTE(review): the permuted vector is laid out row after row, but
        # matrix() fills column-wise by default, so this only yields the
        # row-sorted matrix if byrow=TRUE is added.
        matrix(a[order(row(a), a)], ncol=ncol(a))
    )
    

    Timings:

    Unit: seconds
                                            expr       min        lq      mean    median        uq       max neval
                            t(apply(a, 1, sort)) 37.875665 40.143190 41.098627 42.410715 42.710108 43.009502     3
      t(apply(a, 1, sort.int, method = "quick")) 26.406063 27.146861 27.714226 27.887659 28.368307 28.848955     3
                                       use_for() 20.038295 20.140692 20.504223 20.243088 20.737187 21.231285     3
                               Rfast::rowSort(a)  6.105679  6.460003  6.836455  6.814326  7.201844  7.589361     3
                      t(apply(a, 1, grr::sort2)) 11.912422 13.035231 13.667377 14.158040 14.544854 14.931669     3
     matrix(a[order(row(a), a)], ncol = ncol(a)) 10.307094 10.789946 11.294119 11.272797 11.787632 12.302466     3
    

    And to present a more complete picture, another test for character class (excluding Rfast::rowSort as it cannot handle character class):

    # Character test data: 9e6 letters sampled with replacement, 300 columns
    # (30,000 rows). The seed is fixed for reproducibility.
    set.seed(1)
    a <- matrix(sample(letters, 9e6, TRUE),ncol=300)
    
    # Each expression is evaluated once (times=1L).
    microbenchmark::microbenchmark(times=1L,
        t(apply(a,1,sort)),
        # NOTE(review): method='quick' is documented for numeric input; confirm
        # what sort.int() actually does with character rows here and in use_for().
        t(apply(a,1,sort.int, method='quick')),
        # use_for() is the row-loop helper defined earlier; it reads the global `a`.
        use_for(),
        # Left out of this run: Rfast::rowSort cannot handle character matrices.
        #Rfast::rowSort(a),
        t(apply(a,1,grr::sort2)),
        # Order by row index, then by value, using radix ordering.
        # NOTE(review): as written the result is filled column-wise; add
        # byrow=TRUE to obtain the row-sorted matrix.
        matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
    )
    

    Timings:

    Unit: milliseconds
                                                              expr        min         lq       mean     median         uq        max neval
                                              t(apply(a, 1, sort)) 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951     1
                        t(apply(a, 1, sort.int, method = "quick")) 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711     1
                                                         use_for() 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827     1
                                        t(apply(a, 1, grr::sort2))  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711     1
     matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))   480.7405   480.7405   480.7405   480.7405   480.7405   480.7405     1
    

    Head to head:

    # Larger character matrix: 9e7 letters in 300 columns (300,000 rows), fixed seed.
    set.seed(1)
    a <- matrix(sample(letters, 9e7, TRUE),ncol=300)
    # Compare only the two fastest candidates from the previous run, once each.
    microbenchmark::microbenchmark(times=1L,
        t(apply(a,1,grr::sort2)),
        # NOTE(review): as written the result is filled column-wise; add
        # byrow=TRUE to obtain the row-sorted matrix.
        matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
    )
    

    Timings:

    Unit: seconds
                                                              expr       min        lq      mean    median        uq       max neval
                                        t(apply(a, 1, grr::sort2)) 29.098726 29.098726 29.098726 29.098726 29.098726 29.098726     1
     matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))  7.067744  7.067744  7.067744  7.067744  7.067744  7.067744     1
    
    0 讨论(0)
  • 2020-12-10 16:18

    Well, I'm not aware of many ways to sort faster in R; the problem is that you're only sorting 300 values at a time, but doing so many times. Still, you can eke some extra performance out of sort by calling sort.int directly and using method='quick':

    # Reproducible test data: 9e7 uniform random numbers in 300 columns (300,000 rows).
    set.seed(1)
    a <- matrix(runif(9e+07),ncol=300)
    
    # Baseline from the question: sort each row with apply(). apply() returns the
    # sorted rows as columns, so the result is transposed back.
    system.time(sorted <- t(apply(a,1,sort))) # 31 secs
    
    # Same approach, but calling sort.int() directly with method='quick'.
    system.time(sorted2 <- t(apply(a,1,sort.int, method='quick'))) # 27 secs
    
    # An explicit for-loop that overwrites a copy of `a` row by row is slightly
    # faster than apply() and needs no transpose.
    system.time({sorted3 <- a; for(i in seq_len(nrow(a))) sorted3[i,] <- sort.int(a[i,], method='quick') }) # 26 secs
    

    But a better way should be to use the parallel package to sort parts of the matrix in parallel. However, the overhead of transferring data seems to be too big, and on my machine it starts swapping since I "only" have 8 GB memory:

    library(parallel)
    # Start a cluster of 4 workers.
    cl <- makeCluster(4)
    # Row-wise sort.int(method='quick') distributed over the cluster; the result
    # is transposed back as in the apply() versions. The author reports this
    # never finished on an 8 GB machine.
    system.time(sorted4 <- t(parApply(cl,a,1,sort.int, method='quick'))) # Forever...
    # Shut the workers down again.
    stopCluster(cl)
    
    0 讨论(0)
  • 2020-12-10 16:32

    The package grr contains an alternate sort method that can be used to speed up this particular operation (I have reduced the matrix size somewhat so that this benchmark doesn't take forever) :

    > set.seed(1)
    > a <- matrix(runif(9e+06),ncol=300)
    > microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
    +                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
    +                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3,unit='s')
    Unit: seconds
                                                      expr       min       lq     mean   median       uq      max neval
                            sorted <- t(apply(a, 1, sort)) 1.7699799 1.865829 1.961853 1.961678 2.057790 2.153902     3
     sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 1.6162934 1.619922 1.694914 1.623551 1.734224 1.844898     3
                     sorted3 <- t(apply(a, 1, grr::sort2)) 0.9316073 1.003978 1.050569 1.076348 1.110049 1.143750     3
    

    The difference becomes dramatic when the matrix contains characters:

    > set.seed(1)
    > a <- matrix(sample(letters,size = 9e6,replace = TRUE),ncol=300)
    > microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
    +                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
    +                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3)
    Unit: seconds
                                                      expr       min        lq      mean    median        uq      max neval
                            sorted <- t(apply(a, 1, sort)) 15.436045 15.479742 15.552009 15.523440 15.609991 15.69654     3
     sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 15.099618 15.340577 15.447823 15.581536 15.621925 15.66231     3
                     sorted3 <- t(apply(a, 1, grr::sort2))  1.728663  1.733756  1.780737  1.738848  1.806774  1.87470     3
    

    Results are identical for all three.

    > identical(sorted, sorted2) && identical(sorted2, sorted3)  # identical() compares exactly two objects (a 3rd positional argument is taken as num.eq), so check pairwise
    [1] TRUE
    
    0 讨论(0)
提交回复
热议问题