R: Remove duplicates from a dataframe based on categories in a column

前端 未结 7 1213
耶瑟儿~
耶瑟儿~ 2021-02-15 16:14

Here is my example data set:

      Name Course Category
 1: Jason     ML      PT
 2: Jason     ML      DI
 3: Jason     ML      GT
 4: Jason     ML      SY
 5: Ja         


        
7条回答
  •  一向
    一向 (楼主)
    2021-02-15 16:34

    Since you mentioned you have 10 million rows, here is a data.table solution:

    library(data.table)

    # Convert `df` to a data.table by reference, then for each (Name, Course)
    # group keep the single row whose Category comes first in the order
    # PT, DI, GT, SY.
    setDT(df)[
      ,
      .SD[which.min(factor(Category, levels = c("PT", "DI", "GT", "SY")))],
      by = .(Name, Course)
    ]
    

    Result:

        Name Course Category
    1: Jason     ML       PT
    2: Jason     DS       DI
    3: Nancy     ML       PT
    4: Nancy     DS       DI
    5: James     ML       SY
    6:  John     DS       GT
    

    Benchmarking:

    # Build a 10-million-row test set by resampling each column of `df`
    # independently, with replacement
    set.seed(123)
    df_large <- data.frame(lapply(df, sample, size = 1e7, replace = TRUE))

    # Base R input: recode Category with levels in the order PT, DI, GT, SY,
    # then sort the rows by that factor
    df1 <- df_large
    df1[["Category"]] <- factor(
      df1[["Category"]],
      levels = c("PT", "DI", "GT", "SY")
    )
    df1 <- df1[order(df1[["Category"]]), ]

    # data.table input: same recoding, then converted by reference
    df2 <- df_large
    df2[["Category"]] <- factor(
      df2[["Category"]],
      levels = c("PT", "DI", "GT", "SY")
    )
    setDT(df2)
    

    Results:

    library(microbenchmark)
    # Time the base R approach (first non-duplicated row of the pre-sorted
    # `df1`) against the data.table approach (per-group which.min on `df2`).
    # Inside `j`, the bare `Category` is the current group's column. The earlier
    # `df2$Category` form looked at the full column for every group and used a
    # whole-table position to index `.SD`, so it neither returned the right
    # rows nor timed the solution shown above. The timings printed below were
    # recorded with that earlier expression and should be re-run.
    microbenchmark(df1[!duplicated(df1[, c("Name", "Course")]), ],
                   df2[, .SD[which.min(Category)], by = .(Name, Course)])
    
    Unit: milliseconds
                                                          expr       min        lq      mean
                df1[!duplicated(df1[, c("Name", "Course")]), ] 1696.7585 1719.4932 1788.5821
     df2[, .SD[which.min(df2$Category)], by = .(Name, Course)]  387.8435  409.9365  436.4381
        median        uq       max neval
     1774.3131 1803.7565 2085.9722   100
      427.6739  451.1776  558.2749   100
    

    Data:

    # Example input: 12 rows and three factor columns. Levels are listed
    # explicitly so they match the original dput() output, and row names are
    # "1:" through "12:".
    df <- data.frame(
      Name = factor(
        rep(c("Jason", "Nancy", "James", "John"), times = c(6L, 4L, 1L, 1L)),
        levels = c("James", "Jason", "John", "Nancy")
      ),
      Course = factor(
        c("ML", "ML", "ML", "ML", "DS", "DS",
          "ML", "ML", "DS", "DS", "ML", "DS"),
        levels = c("DS", "ML")
      ),
      Category = factor(
        c("PT", "DI", "GT", "SY", "SY", "DI",
          "PT", "SY", "DI", "GT", "SY", "GT"),
        levels = c("DI", "GT", "PT", "SY")
      ),
      row.names = paste0(1:12, ":")
    )
    

提交回复
热议问题