问题
I'm working on a project and am looking for some help to make my code run more efficiently. I've searched for similar problems but can't seem to find anything quite as granular as this one. The solution I've come up with is extremely clunky. I'm confident that there must be a more efficient way to do this with a package like dplyr
, data.table
, etc.
Problem: I have 3 columns of data, 'ids'
, 'x.group'
, and 'time'
. I need to extract the first 3 unique 'ids'
that appear in each 'time'
block for each 'x.group'
.
However, I do not want to include any 'ids'
or 'x.group'
equal to "0". The output at the bottom of my code yields the correct values, but it's a rather embarrassing way of getting there in my opinion.
Note: In the code example below, I am using x.groups = ['A','B','0']
, but in my actual project, these can take on many values, so they won't always be 'A' or 'B', but '0's will always be present (e.g., I could have ['A','K','0']
or ['M','W','0']
, etc.). You can find the example dataset at the bottom of this post.
# Distinct x.group labels, leaving out the "0" placeholder
xs <- unique(myDF$x.group)
xs <- xs[xs != "0"]
# Distinct (ids, x.group, time) rows whose x.group is one of the kept labels
ps <- myDF[myDF$x.group %in% xs, c("ids", "x.group", "time")]
ps <- unique(ps)
# First three non-"0" ids for each kept x.group (xs[1], xs[2]) within each
# time block; [1:3] pads with NA when fewer than three ids qualify
first3.x1.t1 <- with(ps, ids[x.group == xs[1] & ids != "0" & time == "1"])[1:3]
first3.x2.t1 <- with(ps, ids[x.group == xs[2] & ids != "0" & time == "1"])[1:3]
first3.x1.t2 <- with(ps, ids[x.group == xs[1] & ids != "0" & time == "2"])[1:3]
first3.x2.t2 <- with(ps, ids[x.group == xs[2] & ids != "0" & time == "2"])[1:3]
first3.x1.t3 <- with(ps, ids[x.group == xs[1] & ids != "0" & time == "3"])[1:3]
first3.x2.t3 <- with(ps, ids[x.group == xs[2] & ids != "0" & time == "3"])[1:3]
# First 3 unique ids from time block 1 for each x.group
> first3.x1.t1; first3.x2.t1;
[1] "2" "17" "11"
[1] "5" "10" "4"
# First 3 unique ids from time block 2 for each x.group
> first3.x1.t2; first3.x2.t2;
[1] "9" "6" "16"
[1] "8" "13" "7"
# First 3 unique ids from time block 3 for each x.group
> first3.x1.t3; first3.x2.t3;
[1] "11" "2" "10"
[1] "1" "3" "13"
Data:
# Build the example data frame: 40 rows, every column stored as character
ids <- c(
  "2", "0", "15", "5", "17", "10", "4", "2", "3", "11",
  "11", "18", "10", "8", "13", "9", "6", "16", "7", "14",
  "16", "7", "11", "12", "14", "5", "1", "11", "3", "2",
  "10", "17", "3", "13", "10", "17", "2", "10", "16", "10"
)
x.group <- c(
  "A", "A", "0", "B", "A", "B", "B", "A", "B", "A",
  "A", "0", "B", "B", "B", "A", "A", "A", "B", "B",
  "A", "A", "0", "B", "A", "B", "B", "A", "B", "A",
  "A", "0", "B", "B", "B", "A", "A", "A", "B", "B"
)
# Time blocks "1", "2" and "3" hold 13, 13 and 14 rows respectively
time <- rep(c("1", "2", "3"), times = c(13, 13, 14))
myDF <- data.frame(
  ids = ids,
  x.group = x.group,
  time = time,
  stringsAsFactors = FALSE
)
> myDF
ids x.group time
1 2 A 1
2 0 A 1
3 15 0 1
4 5 B 1
5 17 A 1
6 10 B 1
7 4 B 1
8 2 A 1
9 3 B 1
10 11 A 1
11 11 A 1
12 18 0 1
13 10 B 1
14 8 B 2
15 13 B 2
16 9 A 2
17 6 A 2
18 16 A 2
19 7 B 2
20 14 B 2
21 16 A 2
22 7 A 2
23 11 0 2
24 12 B 2
25 14 A 2
26 5 B 2
27 1 B 3
28 11 A 3
29 3 B 3
30 2 A 3
31 10 A 3
32 17 0 3
33 3 B 3
34 13 B 3
35 10 B 3
36 17 A 3
37 2 A 3
38 10 A 3
39 16 B 3
40 10 B 3
回答1:
# For every (x.group, time) pair, keep the first three distinct ids,
# skipping rows where x.group or ids is "0"
aggregate(
  ids ~ .,
  data = myDF,
  FUN = function(x) unique(x)[1:3],
  subset = x.group != "0" & ids != "0"
)
x.group time ids.1 ids.2 ids.3
1 A 1 2 17 11
2 B 1 5 10 4
3 A 2 9 6 16
4 B 2 8 13 7
5 A 3 11 2 10
6 B 3 1 3 13
This returns a nested data frame (the ids column is a matrix). You can unnest it as follows:
# Same aggregation as above, stored so it can be flattened
a <- aggregate(
  ids ~ .,
  data = myDF,
  FUN = function(x) unique(x)[1:3],
  subset = x.group != "0" & ids != "0"
)
# Rebuild the data frame from its columns to get the unnested data frame
b <- do.call(data.frame, a)
b
x.group time ids.1 ids.2 ids.3
1 A 1 2 17 11
2 B 1 5 10 4
3 A 2 9 6 16
4 B 2 8 13 7
5 A 3 11 2 10
6 B 3 1 3 13
回答2:
library(dplyr)
# Drop duplicate rows, discard the "0" placeholders, then keep the first three
# remaining rows of every (x.group, time) group; the result stays grouped
myDF %>%
  distinct() %>%
  filter(x.group != "0", ids != "0") %>%
  group_by(x.group, time) %>%
  slice(seq_len(3))
# # A tibble: 18 x 3
# # Groups: x.group, time [6]
# ids x.group time
# <chr> <chr> <chr>
# 1 2 A 1
# 2 17 A 1
# 3 11 A 1
# 4 9 A 2
# 5 6 A 2
# 6 16 A 2
# 7 11 A 3
# 8 2 A 3
# 9 10 A 3
# 10 5 B 1
# 11 10 B 1
# 12 4 B 1
# 13 8 B 2
# 14 13 B 2
# 15 7 B 2
# 16 1 B 3
# 17 3 B 3
# 18 13 B 3
回答3:
This is the data.table
solution, which I assume should be the fastest; it could be made even faster by avoiding the call to .SD
for each group.
library(data.table)
# as.data.table() works on a copy; setDT() would convert myDF itself to a
# data.table by reference and so change it for any code that runs afterwards.
# Duplicate rows are dropped first, "0" placeholders are filtered out, and
# head(.SD, 3) keeps the first three rows of each (time, x.group) group.
unique(as.data.table(myDF))[
  ids != "0" & x.group != "0",
  head(.SD, 3),
  by = list(time, x.group)
]
# time x.group ids
# 1: 1 A 2
# 2: 1 A 17
# 3: 1 A 11
# 4: 1 B 5
# 5: 1 B 10
# 6: 1 B 4
# 7: 2 B 8
# 8: 2 B 13
# 9: 2 B 7
# 10: 2 A 9
# 11: 2 A 6
# 12: 2 A 16
# 13: 3 B 1
# 14: 3 B 3
# 15: 3 B 13
# 16: 3 A 11
# 17: 3 A 2
# 18: 3 A 10
library(microbenchmark)
# Convert once, outside the timed expressions: calling setDT(myDF) inside the
# benchmark would turn myDF into a data.table by reference, so the dplyr and
# aggregate expressions would no longer be timed on the original data frame.
myDT <- as.data.table(myDF)
# Expression labels are kept exactly as they appear in the results table.
microbenchmark(
  dplyr = {
    myDF %>%
      distinct() %>%
      filter(x.group != "0" & ids != "0") %>%
      group_by(x.group, time) %>%
      slice(1:3)
  },
  aggreagte = {
    aggregate(
      ids ~ .,
      data = myDF,
      FUN = function(x) unique(x)[1:3],
      subset = x.group != "0" & ids != "0"
    )
  },
  data.table = {
    unique(myDT)[
      ids != "0" & x.group != "0",
      head(.SD, 3),
      by = list(time, x.group)
    ]
  }
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 6696.740 7025.1780 7911.2968 7229.2430 7500.627 35545.183 100
# aggreagte 920.410 981.9920 1090.5363 1041.1590 1132.627 2801.076 100
# data.table 825.925 894.6005 979.3326 961.3135 1052.329 1267.865 100
来源:https://stackoverflow.com/questions/50769008/how-to-subset-data-by-filtering-and-grouping-efficiently-in-r