Here is my example data set:
Name Course Category
1: Jason ML PT
2: Jason ML DI
3: Jason ML GT
4: Jason ML SY
5: Jason DS SY
6: Jason DS DI
7: Nancy ML PT
8: Nancy ML SY
9: Nancy DS DI
10: Nancy DS GT
11: James ML SY
12: John DS GT
Here is a snippet that does what you asked:
df$Category <- factor(df$Category, levels = c("PT", "DI", "GT", "SY"))
df <- df[order(df$Category),]
df[!duplicated(df[,c('Name', 'Course')]),]
Output:
Name Course Category
Jason ML PT
Nancy ML PT
Jason DS DI
Nancy DS DI
John DS GT
James ML SY
The idea is to sort the data by the priority order first, then remove duplicates on Name and Course. Because duplicated() flags every occurrence after the first, the row that survives for each Name/Course pair is the first one encountered, i.e. the highest-priority category.
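To see why the first match is the one that survives, here is a minimal standalone sketch of duplicated() on a toy vector (not the question's data):

x <- c("a", "a", "b")
duplicated(x)
# [1] FALSE  TRUE FALSE
# Only the second "a" is flagged, so x[!duplicated(x)] keeps the first
# occurrence of each value -- the same logic applied to Name/Course above.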
I may be late, but I believe this is the simplest solution. Since you mentioned 10 million rows, I propose a data.table implementation using the very readable unique() function:
require("data.table")
df <- data.table("Name" = c("Jason", "Jason", "Jason", "Jason", "Jason", "Jason", "Nancy", "Nancy", "Nancy", "Nancy", "James", "John"), "Course" = c("ML", "ML", "ML", "ML", "DS", "DS", "ML", "ML", "DS", "DS", "ML", "DS"), "category" = c("PT", "DI", "GT", "SY", "SY", "DI", "PT", "SY", "DI", "GT", "SY", "GT"))
unique(df[, category := factor(category, levels = c("PT", "DI", "GT", "SY"))][order(category)], by = c("Name", "Course"))
Name Course category
1: Jason ML PT
2: Nancy ML PT
3: Jason DS DI
4: Nancy DS DI
5: John DS GT
6: James ML SY
You're not removing based on category; you're really trying to remove full duplicate rows from the data frame.
You can remove full duplicate rows by subsetting the dataframe:
base R:
df_without_dupes <- df[!duplicated(df),]
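If you prefer dplyr, distinct() with no column arguments performs the same full-row de-duplication (a sketch, assuming dplyr is installed):

library(dplyr)
# keeps the first occurrence of each fully identical row
df_without_dupes <- distinct(df)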
I would suggest using the dplyr package for this. See below:
require(dplyr)
data %>%
  mutate(
    Category_factored = as.numeric(factor(Category, levels = c('PT', 'DI', 'GT', 'SY'), labels = 1:4))
  ) %>%
  group_by(Name, Course) %>%
  filter(
    Category_factored == min(Category_factored)
  )
In case you are new to R, install dplyr using install.packages('dplyr')
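A variation on the same idea, if your dplyr version is 1.0.0 or newer, is slice_min(), which picks the highest-priority row per group directly and avoids carrying the Category_factored helper column (a sketch under that version assumption):

library(dplyr)

data %>%
  group_by(Name, Course) %>%
  # smallest factor code = highest-priority category; with_ties = FALSE keeps one row per group
  slice_min(factor(Category, levels = c('PT', 'DI', 'GT', 'SY')), with_ties = FALSE) %>%
  ungroup()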
Since you mentioned you have 10 million rows, here is a data.table solution:
library(data.table)
setDT(df)[, .SD[which.min(factor(Category, levels = c("PT","DI","GT","SY")))], by=.(Name, Course)]
Result:
Name Course Category
1: Jason ML PT
2: Jason DS DI
3: Nancy ML PT
4: Nancy DS DI
5: James ML SY
6: John DS GT
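The which.min() call works because a factor stores its values as integer codes that follow the level order, so the smallest code marks the highest-priority Category within each group. A quick standalone check with the same level order (toy vector, not part of the answer):

f <- factor(c("GT", "PT", "SY"), levels = c("PT", "DI", "GT", "SY"))
as.integer(f)   # 3 1 4 -- codes follow the priority order
which.min(f)    # 2 -- position of "PT", the highest-priority entry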
Benchmarking:
# Random resampling of `df` to generate 10 million rows
set.seed(123)
df_large = data.frame(lapply(df, sample, 1e7, replace = TRUE))
# Data prep Base R
df1 <- df_large
df1$Category <- factor(df1$Category, levels = c("PT", "DI", "GT", "SY"))
df1 <- df1[order(df1$Category), ]
# Data prep data.table
df2 <- df_large
df2$Category <- factor(df2$Category, levels = c("PT", "DI", "GT", "SY"))
setDT(df2)
Results:
library(microbenchmark)
microbenchmark(
  df1[!duplicated(df1[, c('Name', 'Course')]), ],
  df2[, .SD[which.min(df2$Category)], by = .(Name, Course)]
)
Unit: milliseconds
                                                       expr       min        lq      mean    median        uq       max neval
             df1[!duplicated(df1[, c("Name", "Course")]), ] 1696.7585 1719.4932 1788.5821 1774.3131 1803.7565 2085.9722   100
 df2[, .SD[which.min(df2$Category)], by = .(Name, Course)]  387.8435  409.9365  436.4381  427.6739  451.1776  558.2749   100
Data:
df = structure(list(Name = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 4L,
4L, 4L, 4L, 1L, 3L), .Label = c("James", "Jason", "John", "Nancy"
), class = "factor"), Course = structure(c(2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L), .Label = c("DS", "ML"), class = "factor"),
Category = structure(c(3L, 1L, 2L, 4L, 4L, 1L, 3L, 4L, 1L,
2L, 4L, 2L), .Label = c("DI", "GT", "PT", "SY"), class = "factor")), .Names = c("Name",
"Course", "Category"), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:", "7:", "8:", "9:", "10:", "11:",
"12:"))
You'll need to create an index to represent the order of category. Then sort based on the priority of your categories and dedup by Name and Course.
library(tidyverse)
#create index to sort by
index.df <- data.frame("Cateory" = c('PT',"DI","GT","SY"), "Index" = c(1,2,3,4))
#join to orig dataset
data <- left_join(data, index.df, by = "Cateory")
#sort by index, dedup with Name and Course
data %>% arrange(Index) %>% group_by(Name,Course) %>%
distinct(Name,Course, .keep_all = TRUE) %>% select(-Index)
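If you would rather not build the lookup table and join, the same priority ordering can be expressed with match(), which ranks each Category by its position in the priority vector (a sketch using the same column and level order as above):

data %>%
  # match() returns 1 for 'PT', 2 for 'DI', ..., so arranging by it sorts by priority
  arrange(match(Category, c('PT', 'DI', 'GT', 'SY'))) %>%
  distinct(Name, Course, .keep_all = TRUE)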