How to subset data by filtering and grouping efficiently in R


Question


I'm working on a project and am looking for some help to make my code run more efficiently. I've searched for similar problems but can't seem to find anything quite as granular as this one. The solution I've come up with is extremely clunky; I'm confident there must be a more efficient way to do this with a package like dplyr or data.table.

Problem: I have 3 columns of data, 'ids', 'x.group', and 'time'. I need to extract the first 3 unique 'ids' that appear in each 'time' block for each 'x.group'.

However, I do not want to include any 'ids' or 'x.group' equal to "0". The output at the bottom of my code yields the correct values, but it's a rather embarrassing way of getting there in my opinion.

Note: In the code example below, the x.group values are c("A", "B", "0"), but in my actual project they can take on many values, so they won't always be 'A' or 'B'; '0's, however, will always be present (e.g., I could have c("A", "K", "0") or c("M", "W", "0"), etc.). You can find the example dataset at the bottom of this post.

# find x.groups
xs <- unique(myDF$x.group)[unique(myDF$x.group) != "0"]

# DF without '0's as x.group entries
ps <- unique(myDF[which(myDF$x.group %in% xs) , c("ids","x.group","time")])

first3.x1.t1 <- ps[ps$x.group == xs[1] & ps$ids != "0" & ps$time == "1", ]$ids[1:3]
first3.x2.t1 <- ps[ps$x.group == xs[2] & ps$ids != "0" & ps$time == "1", ]$ids[1:3]
first3.x1.t2 <- ps[ps$x.group == xs[1] & ps$ids != "0" & ps$time == "2", ]$ids[1:3]
first3.x2.t2 <- ps[ps$x.group == xs[2] & ps$ids != "0" & ps$time == "2", ]$ids[1:3]
first3.x1.t3 <- ps[ps$x.group == xs[1] & ps$ids != "0" & ps$time == "3", ]$ids[1:3]
first3.x2.t3 <- ps[ps$x.group == xs[2] & ps$ids != "0" & ps$time == "3", ]$ids[1:3]

# First 3 unique ids from time block 1 for each x.group
> first3.x1.t1; first3.x2.t1;
[1] "2"  "17" "11"
[1] "5"  "10" "4"

# First 3 unique ids from time block 2 for each x.group
> first3.x1.t2; first3.x2.t2;
[1] "9"  "6"  "16"
[1] "8"  "13" "7" 

# First 3 unique ids from time block 3 for each x.group
> first3.x1.t3; first3.x2.t3;
[1] "11" "2"  "10"
[1] "1"  "3"  "13"

Data:

# create data frame
ids <- c("2","0","15","5","17","10","4","2","3","11","11","18","10","8","13","9","6","16","7","14",
     "16","7","11","12","14","5","1","11","3","2","10","17","3","13","10","17","2","10","16","10")
x.group <- c("A","A","0","B","A","B","B","A","B","A","A","0","B","B","B","A","A","A","B","B",
         "A","A","0","B","A","B","B","A","B","A","A","0","B","B","B","A","A","A","B","B")
time <- c(rep("1",13), rep("2",13), rep("3",14))

myDF <- data.frame(ids, x.group, time, stringsAsFactors = FALSE)
> myDF
   ids x.group time
1    2       A    1
2    0       A    1
3   15       0    1
4    5       B    1
5   17       A    1
6   10       B    1
7    4       B    1
8    2       A    1
9    3       B    1
10  11       A    1
11  11       A    1
12  18       0    1
13  10       B    1
14   8       B    2
15  13       B    2
16   9       A    2
17   6       A    2
18  16       A    2
19   7       B    2
20  14       B    2
21  16       A    2
22   7       A    2
23  11       0    2
24  12       B    2
25  14       A    2
26   5       B    2
27   1       B    3
28  11       A    3
29   3       B    3
30   2       A    3
31  10       A    3
32  17       0    3
33   3       B    3
34  13       B    3
35  10       B    3
36  17       A    3
37   2       A    3
38  10       A    3
39  16       B    3
40  10       B    3

Answer 1:


aggregate(ids ~ ., myDF, function(x) unique(x)[1:3], subset = x.group != "0" & ids != 0)
  x.group time ids.1 ids.2 ids.3
1       A    1     2    17    11
2       B    1     5    10     4
3       A    2     9     6    16
4       B    2     8    13     7
5       A    3    11     2    10
6       B    3     1     3    13

This returns a nested data frame (the ids column is a matrix). You can unnest it as follows:

a <- aggregate(ids ~ ., myDF, function(x) unique(x)[1:3], subset = x.group != "0" & ids != 0)
b <- do.call(data.frame, a)  # the unnested data frame:
b
  x.group time ids.1 ids.2 ids.3
1       A    1     2    17    11
2       B    1     5    10     4
3       A    2     9     6    16
4       B    2     8    13     7
5       A    3    11     2    10
6       B    3     1     3    13
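
A quick way to see the nesting (a small sketch, assuming the a object created above) is to inspect the ids column directly:

# the nested column produced by aggregate() is a matrix, not a plain vector
class(a$ids)   # "matrix" (also "array" on R >= 4.0)
dim(a$ids)     # 6 x 3: one row per (x.group, time), three ids per row
a$ids[, 1]     # the first unique id in each group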



Answer 2:


library(dplyr)

myDF %>% 
  distinct() %>% 
  filter(x.group != "0" & ids != 0) %>% 
  group_by(x.group, time) %>% 
  slice(1:3)

# # A tibble: 18 x 3
# # Groups: x.group, time [6]
#    ids   x.group time 
#    <chr> <chr>   <chr>
#  1 2     A       1    
#  2 17    A       1    
#  3 11    A       1    
#  4 9     A       2    
#  5 6     A       2    
#  6 16    A       2    
#  7 11    A       3    
#  8 2     A       3    
#  9 10    A       3    
# 10 5     B       1    
# 11 10    B       1    
# 12 4     B       1    
# 13 8     B       2    
# 14 13    B       2    
# 15 7     B       2    
# 16 1     B       3    
# 17 3     B       3    
# 18 13    B       3 
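
As a small aside, on dplyr 1.0.0 or later the slice(1:3) step can also be written with slice_head(n = 3); a sketch on the same myDF, assuming that dplyr version is available:

myDF %>% 
  distinct() %>% 
  filter(x.group != "0", ids != "0") %>% 
  group_by(x.group, time) %>% 
  slice_head(n = 3) %>% 
  ungroup()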



Answer 3:


Here is a data.table solution, which I assume should be the fastest; it can be made faster still by avoiding the call to .SD for each group (see the sketch after the output below).

library(data.table)
unique(setDT(myDF))[ids != 0 & x.group != 0, head(.SD, 3), by = list(time, x.group)]


# time x.group ids
# 1:    1       A   2
# 2:    1       A  17
# 3:    1       A  11
# 4:    1       B   5
# 5:    1       B  10
# 6:    1       B   4
# 7:    2       B   8
# 8:    2       B  13
# 9:    2       B   7
# 10:    2       A   9
# 11:    2       A   6
# 12:    2       A  16
# 13:    3       B   1
# 14:    3       B   3
# 15:    3       B  13
# 16:    3       A  11
# 17:    3       A   2
# 18:    3       A  10
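
One way to avoid .SD, as mentioned above, is to collect the first three row numbers per group with .I and subset once at the end; this is a sketch of that idiom using the same column names, not a benchmarked claim:

library(data.table)

# deduplicate and filter once, then take the first 3 row indices per (time, x.group)
DT  <- unique(setDT(myDF))[ids != "0" & x.group != "0"]
idx <- DT[, .I[seq_len(min(.N, 3L))], by = list(time, x.group)]$V1
DT[idx]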

library(microbenchmark)

microbenchmark(
    dplyr = {myDF %>% 
        distinct() %>% 
        filter(x.group != "0" & ids != 0) %>% 
        group_by(x.group, time) %>% 
        slice(1:3)},
    aggregate = {aggregate(ids ~ ., myDF, function(x) unique(x)[1:3], subset = x.group != "0" & ids != 0)},
    data.table = {unique(setDT(myDF))[ids != 0 & x.group != 0, head(.SD, 3), by = list(time, x.group)]})

# Unit: microseconds
#        expr      min        lq      mean    median       uq       max neval
#       dplyr 6696.740 7025.1780 7911.2968 7229.2430 7500.627 35545.183   100
#   aggregate  920.410  981.9920 1090.5363 1041.1590 1132.627  2801.076   100
#  data.table  825.925  894.6005  979.3326  961.3135 1052.329  1267.865   100


Source: https://stackoverflow.com/questions/50769008/how-to-subset-data-by-filtering-and-grouping-efficiently-in-r
