Fastest way to sort each row of a large matrix in R

后端未结

关注

 3  1160

I have a large matrix:

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

I want to sort each row in the matrix:

> system


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  长情又很酷        
                
              
                            
                2020-12-10 16:10
              
            
            
                                                                       
Another excellent method, which uses no external packages, comes from Martin Morgan in "Fastest way to select i-th highest value from row and assign to new column":

matrix(a[order(row(a), a)], ncol=ncol(a))


There is also an equivalent for sorting by columns in the comments under the same link.

Timing code using same data as Craig:

set.seed(1)
a <- matrix(runif(9e7),ncol=300)

# Row-wise sort of the global matrix `a` using an explicit for-loop.
# Starts from a copy of `a` and overwrites each row with its sorted values,
# so no transpose is needed afterwards. Returns the row-sorted matrix.
use_for <- function() {
    result <- a
    for (row_idx in seq_len(nrow(a))) {
        result[row_idx, ] <- sort.int(a[row_idx, ], method = "quick")
    }
    result
}

microbenchmark::microbenchmark(times=3L,
    t(apply(a,1,sort)),
    t(apply(a,1,sort.int, method='quick')),
    use_for(),
    Rfast::rowSort(a),
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a)], ncol=ncol(a))
)


Timings:

Unit: seconds
                                        expr       min        lq      mean    median        uq       max neval
                        t(apply(a, 1, sort)) 37.875665 40.143190 41.098627 42.410715 42.710108 43.009502     3
  t(apply(a, 1, sort.int, method = "quick")) 26.406063 27.146861 27.714226 27.887659 28.368307 28.848955     3
                                   use_for() 20.038295 20.140692 20.504223 20.243088 20.737187 21.231285     3
                           Rfast::rowSort(a)  6.105679  6.460003  6.836455  6.814326  7.201844  7.589361     3
                  t(apply(a, 1, grr::sort2)) 11.912422 13.035231 13.667377 14.158040 14.544854 14.931669     3
 matrix(a[order(row(a), a)], ncol = ncol(a)) 10.307094 10.789946 11.294119 11.272797 11.787632 12.302466     3




And to present a more complete picture, another test for character class (excluding Rfast::rowSort as it cannot handle character class):

set.seed(1)
a <- matrix(sample(letters, 9e6, TRUE),ncol=300)

microbenchmark::microbenchmark(times=1L,
    t(apply(a,1,sort)),
    t(apply(a,1,sort.int, method='quick')),
    use_for(),
    #Rfast::rowSort(a),
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
)


Timings:

Unit: milliseconds
                                                          expr        min         lq       mean     median         uq        max neval
                                          t(apply(a, 1, sort)) 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951 30392.7951     1
                    t(apply(a, 1, sort.int, method = "quick")) 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711 29359.7711     1
                                                     use_for() 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827 31018.8827     1
                                    t(apply(a, 1, grr::sort2))  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711  2539.1711     1
 matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))   480.7405   480.7405   480.7405   480.7405   480.7405   480.7405     1




Head to head:

set.seed(1)
a <- matrix(sample(letters, 9e7, TRUE),ncol=300)
microbenchmark::microbenchmark(times=1L,
    t(apply(a,1,grr::sort2)),
    matrix(a[order(row(a), a, method="radix")], ncol=ncol(a))
)


Timings:

Unit: seconds
                                                          expr       min        lq      mean    median        uq       max neval
                                    t(apply(a, 1, grr::sort2)) 29.098726 29.098726 29.098726 29.098726 29.098726 29.098726     1
 matrix(a[order(row(a), a, method = "radix")], ncol = ncol(a))  7.067744  7.067744  7.067744  7.067744  7.067744  7.067744     1

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  [愿得一人]        
                
              
                            
                2020-12-10 16:18
              
            
            
                                                                       
Well, I'm not aware of that many ways to sort faster in R, and the problem is that you're only sorting 300 values, but many times. Still, you can eke some extra performance out of sort by directly calling sort.int and using method='quick':

set.seed(1)
a <- matrix(runif(9e+07),ncol=300)

# Your original code
system.time(sorted <- t(apply(a,1,sort))) # 31 secs

# sort.int with method='quick'
system.time(sorted2 <- t(apply(a,1,sort.int, method='quick'))) # 27 secs

# using a for-loop is slightly faster than apply (and avoids transpose):
system.time({sorted3 <- a; for(i in seq_len(nrow(a))) sorted3[i,] <- sort.int(a[i,], method='quick') }) # 26 secs


But a better way should be to use the parallel package to sort parts of the matrix in parallel. However, the overhead of transferring data seems to be too big, and on my machine it starts swapping since I "only" have 8 GB memory:

library(parallel)
cl <- makeCluster(4)
# Run the benchmark inside tryCatch() so the worker processes are always shut
# down, even if parApply() fails or the (very long) run is interrupted.
tryCatch(
    system.time(sorted4 <- t(parApply(cl,a,1,sort.int, method='quick'))), # Forever...
    finally = stopCluster(cl)
)

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  暖寄归人        
                
              
                            
                2020-12-10 16:32
              
            
            
                                                                       
The package grr contains an alternate sort method that can be used to speed up this particular operation (I have reduced the matrix size somewhat so that this benchmark doesn't take forever):

> set.seed(1)
> a <- matrix(runif(9e+06),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3,unit='s')
Unit: seconds
                                                  expr       min       lq     mean   median       uq      max neval
                        sorted <- t(apply(a, 1, sort)) 1.7699799 1.865829 1.961853 1.961678 2.057790 2.153902     3
 sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 1.6162934 1.619922 1.694914 1.623551 1.734224 1.844898     3
                 sorted3 <- t(apply(a, 1, grr::sort2)) 0.9316073 1.003978 1.050569 1.076348 1.110049 1.143750     3


The difference becomes dramatic when the matrix contains characters:

> set.seed(1)
> a <- matrix(sample(letters,size = 9e6,replace = TRUE),ncol=300)
> microbenchmark::microbenchmark(sorted <- t(apply(a,1,sort))
+                                ,sorted2 <- t(apply(a,1,sort.int, method='quick'))
+                                ,sorted3 <- t(apply(a,1,grr::sort2)),times=3)
Unit: seconds
                                                  expr       min        lq      mean    median        uq      max neval
                        sorted <- t(apply(a, 1, sort)) 15.436045 15.479742 15.552009 15.523440 15.609991 15.69654     3
 sorted2 <- t(apply(a, 1, sort.int, method = "quick")) 15.099618 15.340577 15.447823 15.581536 15.621925 15.66231     3
                 sorted3 <- t(apply(a, 1, grr::sort2))  1.728663  1.733756  1.780737  1.738848  1.806774  1.87470     3


Results are identical for all three.

> # identical() compares only its first two arguments (the third is `num.eq`),
> # so all three results must be checked pairwise.
> identical(sorted, sorted2) && identical(sorted2, sorted3)
[1] TRUE

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复