Here is my example data set:
Name Course Category
1: Jason ML PT
2: Jason ML DI
3: Jason ML GT
4: Jason ML SY
5: Ja
Here is a quick benchmark of the proposed solutions:
library(microbenchmark)
library(tidyverse)
library(data.table)
# 1. Data set
# Example records, one row per (name, course, category) combination.
# Character columns are kept as plain strings (no implicit factors).
df_raw <- data.frame(
  name = rep(c("Jason", "Nancy", "James", "John"), times = c(6, 4, 1, 1)),
  course = rep(
    c("ML", "DS", "ML", "DS", "ML", "DS"),
    times = c(4, 2, 2, 2, 1, 1)
  ),
  category = c(
    "PT", "DI", "GT", "SY", "SY", "DI", # Jason
    "PT", "SY", "DI", "GT",             # Nancy
    "SY",                               # James
    "GT"                                # John
  ),
  stringsAsFactors = FALSE
)
# 3. Solution 'base R'
# Keeps, for every (name, course) pair, the row whose category comes first in
# the priority order PT > DI > GT > SY. Reads the global `df_raw`; returns a
# data.frame sorted by category, with `category` converted to a factor.
f1 <- function() {
  # Priority of the categories, most preferred first
  priority <- c("PT", "DI", "GT", "SY")
  # Work on a local copy with 'category' encoded as a factor in that order
  ranked <- df_raw
  ranked$category <- factor(ranked$category, levels = priority)
  # Sorting by the factor brings each pair's preferred category to the front
  ranked <- ranked[order(ranked$category), ]
  # Flag every row whose (name, course) pair was already seen further up
  is_repeat <- duplicated(ranked[, c("name", "course")])
  ranked[!is_repeat, ]
}
# 4. Solution 'dplyr'
# Keeps, for every (name, course) pair, the row(s) whose category ranks best
# in the priority order PT > DI > GT > SY. Reads the global `df_raw`.
#
# Returns an ungrouped tibble with the columns of `df_raw` (the helper rank
# column is dropped), rows in their original order. Rows tied for the best
# rank within a pair are all kept, because the filter compares against the
# group minimum.
f2 <- function() {
  # Category priority, most preferred first
  priority <- c("PT", "DI", "GT", "SY")
  # 1. Create data set
  df <- df_raw
  # 2. Solution: rank each category, then keep the best-ranked row per pair
  df %>%
    # The factor's integer codes follow `priority`, so 1 = most preferred
    mutate(
      category_factored = as.numeric(factor(category, levels = priority))
    ) %>%
    group_by(name, course) %>%
    filter(category_factored == min(category_factored)) %>%
    # Do not leak the grouping or the helper column to the caller
    ungroup() %>%
    select(-category_factored)
}
# 5. Solution 'data.table'
# Keeps, for every (name, course) pair, the row whose category comes first in
# the priority order PT > DI > GT > SY. Reads the global `df_raw` and returns
# a data.table with one row per pair.
f3 <- function() {
  # Category priority, most preferred first
  priority <- c("PT", "DI", "GT", "SY")
  # 1. Create data set
  # as.data.table() makes a copy. `df <- df_raw; setDT(df)` would not:
  # setDT() converts by reference, and `df` is only an alias of `df_raw`, so
  # the global data.frame itself would become a data.table and every other
  # benchmarked solution would then run on a different object type.
  dt <- as.data.table(df_raw)
  # 2. Solution: within each pair pick the row with the best priority rank
  # (match() gives each category's position in `priority`)
  dt[, .SD[which.min(match(category, priority))], by = .(name, course)]
}
# 6. Solution 'dplyr' (join on a priority lookup table)
# Keeps, for every (name, course) pair, the row whose category comes first in
# the priority order PT > DI > GT > SY. Reads the global `df_raw` and returns
# an ungrouped data frame with the columns of `df_raw`, sorted by priority.
f4 <- function() {
  # 1. Create data set
  df <- df_raw
  # 2. Create 'index' to sort by
  # stringsAsFactors = FALSE keeps the join key a character column on every
  # R version, matching the type of `category` in `df_raw`.
  df_index <- data.frame(
    category = c("PT", "DI", "GT", "SY"),
    index = c(1, 2, 3, 4),
    stringsAsFactors = FALSE
  )
  # 3. Join to original dataset
  df <- left_join(df, df_index, by = "category")
  # 4. Sort by 'index', dedup with 'name' and 'course'
  # distinct() keeps the first row of each pair, i.e. the best-ranked one
  # after the sort; no group_by() is needed because the pair columns are
  # passed to distinct() directly.
  df %>%
    arrange(index) %>%
    distinct(name, course, .keep_all = TRUE) %>%
    select(-index)
}
# Benchmark the four solutions; each expression is evaluated 100 times
# (microbenchmark's default, spelled out here).
microbenchmark(f1(), f2(), f3(), f4(), times = 100L)
Unit: milliseconds
expr min lq mean median uq max neval cld
f1() 1.350875 1.468044 1.682641 1.603816 1.687203 5.006231 100 a
f2() 12.547863 12.864521 13.766343 13.543806 14.227795 18.350335 100 c
f3() 2.517014 2.634612 2.944483 2.792619 2.873013 9.355626 100 b
f4() 21.073892 21.608212 23.246332 22.338600 23.934932 41.883938 100 d
As the timings show, the fastest solutions are f1() (base R) and f3() (data.table).