Efficient way of labelling based on start and end position

元气小坏坏 提交于 2021-02-05 08:26:33

问题


I have 2 dataframes

# Observations: one row per measurement, with its group (`type`) and weight.
das <- data.frame(
  val = seq_len(20),
  type = rep(c("A", "B", "C"), times = c(6, 10, 4)),
  weigh = c(
    20, 22, 23, 32, 34, 54,
    19, 22, 24, 26, 31, 34, 36, 37, 51, 54,
    31, 35, 43, 45
  )
)

# Lookup table: four [start, end) weight intervals for each type.
mapper <- data.frame(
  type = rep(c("A", "B", "C"), each = 4),
  start = c(
    19, 23, 27, 37,
    17, 25, 39, 50,
    17, 23, 33, 39
  ),
  end = c(
    23, 27, 37, 55,
    25, 39, 50, 60,
    23, 33, 39, 48
  )
)

The expected output is

val type weigh labelweight
1    1    A    20    A_19
2    2    A    22    A_19
3    3    A    23    A_23
4    4    A    32    A_27
5    5    A    34    A_27
6    6    A    54    A_37
7    7    B    19    B_17
8    8    B    22    B_17
9    9    B    24    B_17
10  10    B    26    B_25
11  11    B    31    B_25
12  12    B    34    B_25
13  13    B    36    B_25
14  14    B    37    B_25
15  15    B    51    B_50
16  16    B    54    B_50
17  17    C    31    C_23
18  18    C    35    C_33
19  19    C    43    C_39
20  20    C    45    C_39

I am able to get the expected output with following code

# Pair every observation with every interval the join matches it to; no `by`
# is given, so dplyr joins on the columns the two tables have in common.
p <- left_join(das, mapper)

# Keep only the pairing whose half-open range [start, end) contains the
# weight, then build the label from the type and the interval start.
q <- p %>%
  filter(weigh >= start, weigh < end) %>%
  mutate(labelweight = paste0(type, "_", start))

The code I came up with throws "Error: vector memory exhausted (limit reached?)" when dealing with large datasets.

I am wondering whether there is a more efficient way of getting the desired output without doing a full join.


回答1:


The intervals appear to be contiguous. Here is a fast option using a rolling join in data.table:

library(data.table)

# Convert both tables to data.table by reference.
setDT(das)
setDT(mapper)

# Rolling join: for each observation, match on `type` and roll forward from
# the last interval `start` that is <= `weigh`. `x.start` is the start column
# of `mapper` (the table being joined into), which becomes the label suffix.
das[, weight := mapper[
  .SD,
  on = .(type, start = weigh),
  roll = Inf,
  paste(type, x.start, sep = "_")
]]

If the intervals are not contiguous, you can use a non-equi join:

# Convert both tables to data.table by reference.
setDT(das)
setDT(mapper)

# Non-equi join: match on `type` and require start <= weigh < end, so only the
# interval that actually contains the weight is used. `x.start` is the start
# column of `mapper`, which becomes the label suffix.
das[, weight := mapper[
  das,
  on = .(type, start <= weigh, end > weigh),
  paste(type, x.start, sep = "_")
]]

output:

    val type weigh weight
 1:   1    A    20   A_19
 2:   2    A    22   A_19
 3:   3    A    23   A_23
 4:   4    A    32   A_27
 5:   5    A    34   A_27
 6:   6    A    54   A_37
 7:   7    B    19   B_17
 8:   8    B    22   B_17
 9:   9    B    24   B_17
10:  10    B    26   B_25
11:  11    B    31   B_25
12:  12    B    34   B_25
13:  13    B    36   B_25
14:  14    B    37   B_25
15:  15    B    51   B_50
16:  16    B    54   B_50
17:  17    C    31   C_23
18:  18    C    35   C_33
19:  19    C    43   C_39
20:  20    C    45   C_39



回答2:


Perhaps you can use the fuzzyjoin package here:

# Match each observation to the interval of the same type that contains its
# weight. The interval is half-open, [start, end): the upper bound is compared
# with `<` rather than `<=`, so a weight that sits exactly on a boundary shared
# by two adjacent intervals (weigh = 23 for type A) matches only the interval
# starting there instead of producing a duplicated row.
fuzzyjoin::fuzzy_left_join(
  das, mapper,
  by = c("type" = "type", "weigh" = "start", "weigh" = "end"),
  match_fun = list(`==`, `>=`, `<`)
) %>%
  dplyr::transmute(
    type = type.x, val, weigh,
    labelweight = paste(type.y, start, sep = "_")
  )


#   type val weigh labelweight
#1     A   1    20        A_19
#2     A   2    22        A_19
#3     A   3    23        A_23
#4     A   4    32        A_27
#5     A   5    34        A_27
#6     A   6    54        A_37
#7     B   7    19        B_17
#8     B   8    22        B_17
#9     B   9    24        B_17
#10    B  10    26        B_25
#11    B  11    31        B_25
#12    B  12    34        B_25
#13    B  13    36        B_25
#14    B  14    37        B_25
#15    B  15    51        B_50
#16    B  16    54        B_50
#17    C  17    31        C_23
#18    C  18    35        C_33
#19    C  19    43        C_39
#20    C  20    45        C_39



回答3:


Using base R:

# Label one observation with the start of the interval that contains it.
#
# x: a one-row data frame; column 2 holds the type, column 3 the weight.
# y: the interval table; column 1 holds the type, column 2 the interval
#    start and column 3 the interval end.
#
# Returns "<type>_<start>", or NA_character_ when no interval of that type
# contains the weight.
encon <- function(x, y) {
  same_type <- y[, 1] == x[[2]]
  lower <- y[same_type, 2]
  upper <- y[same_type, 3]

  # Intervals are half-open, [start, end): using `<` on the upper bound keeps
  # a weight on a shared boundary (e.g. 23 for type A) out of the interval
  # that ends there, so it is labelled with the interval that starts there.
  hits <- lower[x[[3]] >= lower & x[[3]] < upper]

  # min() on an empty vector would return Inf with a warning and yield a
  # bogus "<type>_Inf" label; report a missing label instead.
  if (length(hits) == 0) {
    return(NA_character_)
  }

  paste(as.character(x[[2]]), min(hits), sep = "_")
}

# Build the whole label column in one assignment; seq_len() is safe when
# `das` has zero rows, and vapply() guarantees a character result.
das$label <- vapply(
  seq_len(nrow(das)),
  function(i) encon(das[i, ], mapper),
  character(1)
)
# > das
#    val type weigh label
# 1    1    A    20  A_19
# 2    2    A    22  A_19
# 3    3    A    23  A_23
# 4    4    A    32  A_27
# 5    5    A    34  A_27
# 6    6    A    54  A_37
# 7    7    B    19  B_17
# 8    8    B    22  B_17
# 9    9    B    24  B_17
# 10  10    B    26  B_25
# 11  11    B    31  B_25
# 12  12    B    34  B_25
# 13  13    B    36  B_25
# 14  14    B    37  B_25
# 15  15    B    51  B_50
# 16  16    B    54  B_50
# 17  17    C    31  C_23
# 18  18    C    35  C_33
# 19  19    C    43  C_39
# 20  20    C    45  C_39



来源:https://stackoverflow.com/questions/62123358/efficient-way-of-labelling-based-on-start-and-end-position

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!