R: Remove duplicates from a dataframe based on categories in a column

前端未结

关注

 7  1217

耶瑟儿～ 2021-02-15 16:14

Here is my example data set:

      Name Course Category
 1: Jason     ML      PT
 2: Jason     ML      DI
 3: Jason     ML      GT
 4: Jason     ML      SY
 5: Ja


      
      
        
          7条回答        

        
                    
            
            
                         
                
              
              
                
                   孤独总比滥情好
                                             
                
                
                (楼主)
            
              
              
                2021-02-15 16:38
              

            
            
                        
Quick benchmark for given solutions:

library(microbenchmark)
library(tidyverse)
library(data.table)

# 1. Data set
# Shared benchmark input: one row per (name, course, category) enrolment.
# Rows are grouped by person below purely for readability.
df_raw <- data.frame(
  name = rep(c("Jason", "Nancy", "James", "John"), times = c(6, 4, 1, 1)),
  course = c(
    "ML", "ML", "ML", "ML", "DS", "DS", # Jason
    "ML", "ML", "DS", "DS",             # Nancy
    "ML",                               # James
    "DS"                                # John
  ),
  category = c(
    "PT", "DI", "GT", "SY", "SY", "DI", # Jason
    "PT", "SY", "DI", "GT",             # Nancy
    "SY",                               # James
    "GT"                                # John
  ),
  stringsAsFactors = FALSE
)

 # 3. Solution 'basic R'
 f1 <- function() {
  # Base-R solution: keep, for every (name, course) pair, the row whose
  # category comes first in the priority order PT < DI < GT < SY.
  # Reads the global `df_raw`; returns a data.frame whose `category`
  # column is a factor with those four levels.
  dat <- df_raw

  # Encode the priority as the factor's level order so order() sorts by it.
  dat$category <- factor(dat$category, levels = c("PT", "DI", "GT", "SY"))
  ranked <- dat[order(dat$category), ]

  # After sorting, the first occurrence of each pair is the best-ranked one;
  # every later occurrence is flagged as a repeat and dropped.
  is_repeat <- duplicated(ranked[, c("name", "course")])
  ranked[!is_repeat, ]
}

# 4. Solution 'dplyr'
f2 <- function() {
  # dplyr solution: rank each category numerically, then keep the rows that
  # hold the minimum rank inside every (name, course) group.
  # Reads the global `df_raw`. The result keeps the helper column
  # `category_factored` and is still grouped by name and course.
  dat <- df_raw

  # Numeric rank 1..4 following the priority order PT, DI, GT, SY.
  ranked <- mutate(
    dat,
    category_factored = as.numeric(
      factor(category, levels = c('PT', 'DI', 'GT', 'SY'), labels = 1:4)
    )
  )

  by_pair <- group_by(ranked, name, course)
  filter(by_pair, category_factored == min(category_factored))
}

# 5. Solution 'data.table'
f3 <- function() {
  # data.table solution: for every (name, course) pair keep the single row
  # whose category ranks first in the priority order PT < DI < GT < SY.
  # Reads the global `df_raw`; returns a data.table with columns
  # name, course, category.

  # Convert a *copy* of the shared data. `df <- df_raw` does not copy, and
  # setDT() converts by reference, so the previous `setDT(df)` also turned
  # the global `df_raw` into a data.table, changing the input seen by the
  # other benchmarked functions. as.data.table() leaves `df_raw` untouched.
  dt <- as.data.table(df_raw)

  # Position in this vector is the rank of a category (lower wins).
  priority <- c("PT", "DI", "GT", "SY")

  dt[
    ,
    .SD[which.min(match(category, priority))],
    by = .(name, course)
  ]
}

# 6. Solution 'dplyr'
f4 <- function() {
  # dplyr join solution: attach a numeric rank to every category via a
  # lookup table, sort by that rank, and keep the first row of each
  # (name, course) pair.
  # Reads the global `df_raw`; the result is still grouped by name and course.

  # Lookup table mapping each category to its priority (1 = highest).
  rank_lookup <- data.frame(
    "category" = c('PT', "DI", "GT", "SY"),
    "index" = c(1, 2, 3, 4)
  )

  ranked <- left_join(df_raw, rank_lookup, by = "category")
  ordered <- arrange(ranked, index)

  # distinct() keeps the first row per pair, i.e. the best-ranked one.
  by_pair <- group_by(ordered, name, course)
  deduped <- distinct(by_pair, name, course, .keep_all = TRUE)

  # The rank was only needed for ordering; drop it from the output.
  select(deduped, -index)
}

# Benchmark the four solutions (100 evaluations each, the package default)
microbenchmark(f1(), f2(), f3(), f4(), times = 100L)

Unit: milliseconds
expr       min        lq      mean    median        uq       max neval  cld
f1()  1.350875  1.468044  1.682641  1.603816  1.687203  5.006231   100 a   
f2() 12.547863 12.864521 13.766343 13.543806 14.227795 18.350335   100   c 
f3()  2.517014  2.634612  2.944483  2.792619  2.873013  9.355626   100  b  
f4() 21.073892 21.608212 23.246332 22.338600 23.934932 41.883938   100    d


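As the benchmark shows, f1() (base R) and f3() (data.table) are the fastest solutions; both dplyr variants, f2() and f4(), are several times slower on this small data set.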
    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它7个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复