Here is my example data set:
Name Course Cateory
1: Jason ML PT
2: Jason ML DI
3: Jason ML GT
4: Jason ML SY
5: Ja
Since you mentioned you have 10 million rows, here is a data.table
solution:
library(data.table)
setDT(df)[, .SD[which.min(factor(Category, levels = c("PT","DI","GT","SY")))], by=.(Name, Course)]
Result:
Name Course Category
1: Jason ML PT
2: Jason DS DI
3: Nancy ML PT
4: Nancy DS DI
5: James ML SY
6: John DS GT
Benchmarking:
# Random resampling of `df` to generate 10 million rows
set.seed(123)
df_large = data.frame(lapply(df, sample, 1e7, replace = TRUE))
# Data prep Base R
df1 <- df_large
df1$Category <- factor(df1$Category, levels = c("PT", "DI", "GT", "SY"))
df1 <- df1[order(df1$Category), ]
# Data prep data.table
df2 <- df_large
df2$Category <- factor(df2$Category, levels = c("PT", "DI", "GT", "SY"))
setDT(df2)
Results:
library(microbenchmark)
microbenchmark(df1[!duplicated(df1[,c('Name', 'Course')]), ],
df2[, .SD[which.min(df2$Category)], by=.(Name, Course)])
Unit: milliseconds
expr min lq mean
df1[!duplicated(df1[, c("Name", "Course")]), ] 1696.7585 1719.4932 1788.5821
df2[, .SD[which.min(df2$Category)], by = .(Name, Course)] 387.8435 409.9365 436.4381
median uq max neval
1774.3131 1803.7565 2085.9722 100
427.6739 451.1776 558.2749 100
Data:
df = structure(list(Name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 4L,
4L, 4L, 4L, 1L, 3L), .Label = c("James", "Jason", "John", "Nancy"
), class = "factor"), Course = structure(c(2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L), .Label = c("DS", "ML"), class = "factor"),
Category = structure(c(3L, 1L, 2L, 4L, 4L, 1L, 3L, 4L, 1L,
2L, 4L, 2L), .Label = c("DI", "GT", "PT", "SY"), class = "factor")), .Names = c("Name",
"Course", "Category"), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:", "7:", "8:", "9:", "10:", "11:",
"12:"))