R: Remove duplicates from a dataframe based on categories in a column

前端 未结 7 1185
耶瑟儿~
耶瑟儿~ 2021-02-15 16:14

Here is my example data set:

      Name Course Category
 1: Jason     ML      PT
 2: Jason     ML      DI
 3: Jason     ML      GT
 4: Jason     ML      SY
 5: Ja         


        
相关标签:
7条回答
  • 2021-02-15 16:38

    Quick benchmark for given solutions:

    library(microbenchmark)
    library(tidyverse)
    library(data.table)
    
    # 1. Data set
    # Example enrolment data: one row per (name, course, category) record.
    # Values are grouped per person so the three columns line up visually.
    df_raw <- data.frame(
      name = rep(c("Jason", "Nancy", "James", "John"), times = c(6, 4, 1, 1)),
      course = c(
        "ML", "ML", "ML", "ML", "DS", "DS",  # Jason
        "ML", "ML", "DS", "DS",              # Nancy
        "ML",                                # James
        "DS"                                 # John
      ),
      category = c(
        "PT", "DI", "GT", "SY", "SY", "DI",  # Jason
        "PT", "SY", "DI", "GT",              # Nancy
        "SY",                                # James
        "GT"                                 # John
      ),
      stringsAsFactors = FALSE
    )
    
     # 3. Solution 'base R'
     f1 <- function() {
       # Base-R approach: rank rows by category priority, then keep the first
       # row seen for every (name, course) pair.
       ranked <- df_raw

       # Encode 'category' as a factor whose level order is the priority order
       # (PT first, SY last); the column stays a factor in the returned data.
       ranked$category <- factor(
         ranked$category,
         levels = c("PT", "DI", "GT", "SY")
       )

       # Put the highest-priority categories at the top
       by_priority <- order(ranked$category)
       ranked <- ranked[by_priority, ]

       # Flag every later occurrence of an already-seen (name, course) pair
       is_repeat <- duplicated(ranked[, c("name", "course")])

       # Return only the first (highest-priority) row of each pair
       ranked[!is_repeat, ]
     }
    
    # 4. Solution 'dplyr'
    f2 <- function(){
      # dplyr approach: compute a numeric priority for each category and keep,
      # within every (name, course) group, the row(s) with the smallest one.
      # Returns an ungrouped tibble that still carries the helper column
      # 'category_factored'.

      # 1. Create data set
      df <- df_raw

      # 2. Solution
      #    Start from the local 'df' (previously assigned but never used).
      #    as.numeric() on a factor yields the level codes, so the position of
      #    each category in 'levels' is its priority (PT = 1, ..., SY = 4).
      df %>%
        mutate(
          category_factored = as.numeric(
            factor(category, levels = c("PT", "DI", "GT", "SY"))
          )
        ) %>%
        group_by(name, course) %>%
        filter(category_factored == min(category_factored)) %>%
        # Drop the grouping so it does not leak into downstream verbs
        ungroup()
    }
    
    # 5. Solution 'data.table'
    f3 <- function(){
      # data.table approach: for every (name, course) group keep the single row
      # whose category comes first in the priority order PT, DI, GT, SY.
      # Returns a data.table with columns name, course, category.

      # 1. Create data set
      #    as.data.table() makes a copy. The previous `df <- df_raw` followed by
      #    setDT(df) did not: plain assignment does not copy, and setDT()
      #    converts by reference, so the shared global 'df_raw' was turned into
      #    a data.table and every later benchmark run saw the altered object.
      dt <- as.data.table(df_raw)

      # Priority of a category = its position in this vector
      priority <- c("PT", "DI", "GT", "SY")

      # 2. Solution
      #    which.min() picks the first row with the lowest priority index
      dt[, .SD[which.min(match(category, priority))], by = .(name, course)]
    }
    
    # 6. Solution 'dplyr' (join against a priority index, then distinct)
    f4 <- function(){
      # dplyr join approach: attach a numeric priority to every row via a
      # lookup table, sort by it, and keep the first row of each
      # (name, course) pair. Returns the de-duplicated rows without the
      # helper 'index' column.

      # 1. Create data set
      df <- df_raw

      # 2. Create 'index' to sort by
      #    stringsAsFactors = FALSE matches 'df_raw', so the join key is
      #    character on both sides even on R < 4.0 (where the default was TRUE).
      df_index <- data.frame(
        category = c("PT", "DI", "GT", "SY"),
        index = c(1, 2, 3, 4),
        stringsAsFactors = FALSE
      )

      # 3. Join to original dataset
      df <- left_join(df, df_index, by = "category")

      # 4. Sort by 'index', dedup with 'name' and 'course'
      #    distinct() already de-duplicates on the listed keys and keeps the
      #    first row it meets, so the former group_by(name, course) was
      #    redundant and only left the result grouped.
      df %>%
        arrange(index) %>%
        distinct(name, course, .keep_all = TRUE) %>%
        select(-index)
    }
    
    # Test for solutions
    # Time the four candidate solutions; 100 evaluations per expression is
    # microbenchmark's default, spelled out here for clarity.
    microbenchmark(
      f1(),
      f2(),
      f3(),
      f4(),
      times = 100L
    )
    
    Unit: milliseconds
    expr       min        lq      mean    median        uq       max neval  cld
    f1()  1.350875  1.468044  1.682641  1.603816  1.687203  5.006231   100 a   
    f2() 12.547863 12.864521 13.766343 13.543806 14.227795 18.350335   100   c 
    f3()  2.517014  2.634612  2.944483  2.792619  2.873013  9.355626   100  b  
    f4() 21.073892 21.608212 23.246332 22.338600 23.934932 41.883938   100    d
    

    As the timings above show, f1() (base R) and f3() (data.table) are the fastest solutions.

    0 讨论(0)
提交回复
热议问题