R: Remove duplicates from a dataframe based on categories in a column

前端未结

关注

 7  1244

耶瑟儿～ 2021-02-15 16:14

Here is my example data set:

      Name Course Cateory
 1: Jason     ML      PT
 2: Jason     ML      DI
 3: Jason     ML      GT
 4: Jason     ML      SY
 5: Ja


      
      
        
          7条回答        

        
                    
            
            
                         
                
              
              
                
                   一向
                                             
                
                
                (楼主)
            
              
              
                2021-02-15 16:34
              

            
            
                        
Since you mentioned you have 10 million rows, here is a data.table solution:

library(data.table)

setDT(df)[, .SD[which.min(factor(Category, levels = c("PT","DI","GT","SY")))], by=.(Name, Course)]


Result:

    Name Course Category
1: Jason     ML       PT
2: Jason     DS       DI
3: Nancy     ML       PT
4: Nancy     DS       DI
5: James     ML       SY
6:  John     DS       GT


Benchmarking:

# Random resampling of `df` to generate 10 million rows
set.seed(123)
df_large = data.frame(lapply(df, sample, 1e7, replace = TRUE))

# Data prep Base R  
df1 <- df_large

df1$Category <- factor(df1$Category, levels = c("PT", "DI", "GT", "SY"))

df1 <- df1[order(df1$Category), ]

# Data prep data.table
df2 <- df_large

df2$Category <- factor(df2$Category, levels = c("PT", "DI", "GT", "SY"))

setDT(df2)


Results:

library(microbenchmark)
microbenchmark(df1[!duplicated(df1[,c('Name', 'Course')]), ], 
               df2[, .SD[which.min(df2$Category)], by=.(Name, Course)])

Unit: milliseconds
                                                      expr       min        lq      mean
            df1[!duplicated(df1[, c("Name", "Course")]), ] 1696.7585 1719.4932 1788.5821
 df2[, .SD[which.min(df2$Category)], by = .(Name, Course)]  387.8435  409.9365  436.4381
    median        uq       max neval
 1774.3131 1803.7565 2085.9722   100
  427.6739  451.1776  558.2749   100


Data:

df = structure(list(Name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 4L, 
4L, 4L, 4L, 1L, 3L), .Label = c("James", "Jason", "John", "Nancy"
), class = "factor"), Course = structure(c(2L, 2L, 2L, 2L, 1L, 
1L, 2L, 2L, 1L, 1L, 2L, 1L), .Label = c("DS", "ML"), class = "factor"), 
    Category = structure(c(3L, 1L, 2L, 4L, 4L, 1L, 3L, 4L, 1L, 
    2L, 4L, 2L), .Label = c("DI", "GT", "PT", "SY"), class = "factor")), .Names = c("Name", 
"Course", "Category"), class = "data.frame", row.names = c("1:", 
"2:", "3:", "4:", "5:", "6:", "7:", "8:", "9:", "10:", "11:", 
"12:"))

    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它7个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复