问题
I have two data frames:
# Observations: one row per value, with a group (`type`) and a weight (`weigh`).
das <- data.frame(
  val = 1:20,
  type = rep(c("A", "B", "C"), times = c(6, 10, 4)),
  weigh = c(
    20, 22, 23, 32, 34, 54,
    19, 22, 24, 26, 31, 34, 36, 37, 51, 54,
    31, 35, 43, 45
  )
)

# Lookup table: four [start, end) weight intervals for each type.
mapper <- data.frame(
  type = rep(c("A", "B", "C"), each = 4),
  start = c(
    19, 23, 27, 37,
    17, 25, 39, 50,
    17, 23, 33, 39
  ),
  end = c(
    23, 27, 37, 55,
    25, 39, 50, 60,
    23, 33, 39, 48
  )
)
The expected output is
val type weigh labelweight
1 1 A 20 A_19
2 2 A 22 A_19
3 3 A 23 A_23
4 4 A 32 A_27
5 5 A 34 A_27
6 6 A 54 A_37
7 7 B 19 B_17
8 8 B 22 B_17
9 9 B 24 B_17
10 10 B 26 B_25
11 11 B 31 B_25
12 12 B 34 B_25
13 13 B 36 B_25
14 14 B 37 B_25
15 15 B 51 B_50
16 16 B 54 B_50
17 17 C 31 C_23
18 18 C 35 C_33
19 19 C 43 C_39
20 20 C 45 C_39
I am able to get the expected output with the following code:
# Pair every row of das with every mapper row that shares its `type`
# (no `by` is given, so the join uses the only common column, `type`).
# This intermediate table is what grows large on big inputs.
p <- left_join(das, mapper)

# Keep only the pairing whose [start, end) interval contains the weight,
# then build the label from the type and the interval start.
q <- mutate(
  filter(p, weigh >= start, weigh < end),
  labelweight = paste0(type, "_", start)
)
However, the code I came up with throws "Error: vector memory exhausted (limit reached?)" when dealing with large datasets.
I am wondering whether there is a more efficient way of getting the desired output without doing a full join.
回答1:
The intervals appear to be contiguous. Here is a fast option using a rolling join in data.table:
library(data.table)

# Convert both tables to data.table by reference.
setDT(das)
setDT(mapper)

# Rolling join: for each das row, find the mapper row of the same type whose
# `start` is the closest value at or below `weigh` (roll = Inf carries the
# previous start forward), and label the row with that start.
# `x.start` is mapper's own `start` column rather than the joined value.
das[
  ,
  weight := mapper[
    .SD,
    on = .(type, start = weigh),
    roll = Inf,
    paste0(type, "_", x.start)
  ]
]
If the intervals are not contiguous, you can use a non-equi join:
# Convert both tables to data.table by reference.
setDT(das)
setDT(mapper)

# Non-equi join: match each das row to the mapper row of the same type whose
# half-open interval [start, end) contains `weigh`, and label it with that
# interval's start (`x.start` is mapper's own `start` column).
# NOTE(review): assumes each row falls into at most one interval; verify
# against the real mapper before using this on other data.
das[
  ,
  weight := mapper[
    das,
    on = .(type, start <= weigh, end > weigh),
    paste0(type, "_", x.start)
  ]
]
output:
val type weigh weight
1: 1 A 20 A_19
2: 2 A 22 A_19
3: 3 A 23 A_23
4: 4 A 32 A_27
5: 5 A 34 A_27
6: 6 A 54 A_37
7: 7 B 19 B_17
8: 8 B 22 B_17
9: 9 B 24 B_17
10: 10 B 26 B_25
11: 11 B 31 B_25
12: 12 B 34 B_25
13: 13 B 36 B_25
14: 14 B 37 B_25
15: 15 B 51 B_50
16: 16 B 54 B_50
17: 17 C 31 C_23
18: 18 C 35 C_33
19: 19 C 43 C_39
20: 20 C 45 C_39
回答2:
Perhaps you can use fuzzyjoin's fuzzy_left_join here:
# Match each das row to the mapper interval of the same type that contains
# its weight. The three `by` pairs are compared with the three functions in
# `match_fun`, in order: type == type, weigh >= start, weigh < end.
# The end bound is exclusive (`<`, not `<=`): the intervals are half-open, so
# a weight equal to a boundary (e.g. 23 for type A) must match only the
# interval that starts there, not also the one that ends there.
joined <- fuzzyjoin::fuzzy_left_join(
  das, mapper,
  by = c("type" = "type", "weigh" = "start", "weigh" = "end"),
  match_fun = list(`==`, `>=`, `<`)
)

# Both inputs have a `type` column, so the join suffixes them as type.x
# (from das) and type.y (from mapper); keep one and build the label.
dplyr::transmute(
  joined,
  type = type.x, val, weigh,
  labelweight = paste(type.y, start, sep = "_")
)
# type val weigh labelweight
#1 A 1 20 A_19
#2 A 2 22 A_19
#3 A 3 23 A_23
#4 A 4 32 A_27
#5 A 5 34 A_27
#6 A 6 54 A_37
#7 B 7 19 B_17
#8 B 8 22 B_17
#9 B 9 24 B_17
#10 B 10 26 B_25
#11 B 11 31 B_25
#12 B 12 34 B_25
#13 B 13 36 B_25
#14 B 14 37 B_25
#15 B 15 51 B_50
#16 B 16 54 B_50
#17 C 17 31 C_23
#18 C 18 35 C_33
#19 C 19 43 C_39
#20 C 20 45 C_39
回答3:
Using base R:
# Label one row of `das` with the start of the mapper interval that contains
# its weight.
#
# x: a single row of `das`; the type is read from position 2 and the weight
#    from position 3.
# y: the mapper table; type, start and end are read from positions 1, 2, 3.
#
# Returns "<type>_<start>" for the half-open interval [start, end) of the
# same type that holds the weight, or NA_character_ when no interval matches
# (including a missing weight).
encon <- function(x, y) {
  type <- as.character(x[[2]])
  weight <- x[[3]]

  same_type <- as.character(y[[1]]) == type
  lower <- y[[2]][same_type]
  upper <- y[[3]][same_type]

  # `end` is exclusive: a weight sitting exactly on a boundary belongs to the
  # interval that starts there, not to the one that ends there.
  # which() drops NA comparisons so a missing weight yields no match.
  hits <- lower[which(weight >= lower & weight < upper)]
  if (length(hits) == 0L) {
    return(NA_character_)
  }

  paste(type, min(hits), sep = "_")
}

# Apply encon() to every row; seq_len() keeps this safe for a 0-row table.
das$label <- vapply(
  seq_len(nrow(das)),
  function(i) encon(das[i, ], mapper),
  character(1)
)

das
# val type weigh label
# 1 1 A 20 A_19
# 2 2 A 22 A_19
# 3 3 A 23 A_23
# 4 4 A 32 A_27
# 5 5 A 34 A_27
# 6 6 A 54 A_37
# 7 7 B 19 B_17
# 8 8 B 22 B_17
# 9 9 B 24 B_17
# 10 10 B 26 B_25
# 11 11 B 31 B_25
# 12 12 B 34 B_25
# 13 13 B 36 B_25
# 14 14 B 37 B_25
# 15 15 B 51 B_50
# 16 16 B 54 B_50
# 17 17 C 31 C_23
# 18 18 C 35 C_33
# 19 19 C 43 C_39
# 20 20 C 45 C_39
来源:https://stackoverflow.com/questions/62123358/efficient-way-of-labelling-based-on-start-and-end-position