问题
Given a data table with start and end coordinates for sequences of integers:
# Reproducible example data: 3,000,000 intervals of width 10. The same
# 1,000,000 [START, END] pairs (1-10, 11-20, ...) are repeated three times.
library(data.table)
set.seed(1)
df1 <- data.table(
  START = rep(seq(1, 10000000, 10), times = 3),
  END = rep(seq(10, 10000000, 10), times = 3)
)
And a vector of integers:
# Draw 10000 distinct integers from 1..100000 (sampling without replacement).
vec1 <- sample.int(n = 100000, size = 10000)
How can I count the number of integers in vec1 that are within the start and end coordinates of each sequence in df1? I am currently using a for loop:
# Baseline approach: for every interval, count how many integers of the
# closed range START..END also occur in vec1.
# sum() over the logical membership vector is used rather than
# table(...)[2]: the second cell of that table is NA when nothing matches,
# and also when everything matches (the table then holds a single "TRUE"
# cell), so it cannot be relied on as the count.
COUNT <- integer(nrow(df1))
for (i in seq_len(nrow(df1))) {
  vec2 <- seq(from = df1$START[i], to = df1$END[i])
  COUNT[i] <- sum(vec2 %in% vec1)
  print(i) # progress indicator
}
df1$COUNT <- COUNT
However, the data table and vector I am applying this to are very large. Is anyone able to suggest a way to improve performance?
Any help will be greatly appreciated!
回答1:
### Example data (uncomment to reproduce the output shown at the end):
# df1 <- data.table(START = c(1, 8, 11), END = c(4, 9, 30))
# vec1 <- c(3, 2, 8)
#
# Remember each interval's original row position, because keying df1 below
# sorts its rows.
df1[, ind := .I]
# Turn every value of vec1 into a zero-width interval [vec1, vec2].
dt2 <- as.data.table(vec1, key = "vec1")
dt2[, vec2 := vec1]
# Key df1 on all of its columns (this orders the table by them).
setkey(df1)
# Overlap join: keep each point interval that lies within a df1 interval.
ans1 <- foverlaps(
  dt2, df1,
  by.x = c("vec1", "vec2"),
  by.y = c("START", "END"),
  type = "within",
  nomatch = 0L
)
# Number of matched points per interval id.
counts <- ans1[, .N, keyby = ind]
# Look the counts up by id and attach them to df1; ids without a match get NA.
df1[, COUNT := counts[df1, on = "ind", x.N]]
df1
# Restore the original row order, then drop the helper id column.
setorder(df1, ind)
df1[, ind := NULL]
# An interval without any match holds NA at this point; its count is zero.
df1[is.na(COUNT), COUNT := 0L]
df1
# START END COUNT
# 1: 1 4 2
# 2: 8 9 1
# 3: 11 30 0
回答2:
One option is to use `between()` from data.table, evaluated once per row:
library(data.table)
# Treat every row as its own group so that START and END are scalars, then
# count the vec1 values inside the closed range [START, END].
df1[,
  count := sum(between(vec1, lower = START, upper = END, incbounds = TRUE)),
  by = seq_len(nrow(df1))
]
回答3:
We can do this with a non-equi join
# Non-equi join evaluated once per row of df1 (by = .EACHI): .N is the
# number of vec1 values with START <= val <= END, and 0 when none match.
# The table of values is the one being subset and df1 is passed as `i`, so
# the counts come back as one row per interval, in df1's row order. Both
# bounds are inclusive so values equal to START or END are counted.
df1[, count := data.table(val = vec1)[
  df1, .N,
  on = .(val >= START, val <= END), by = .EACHI
]$N]
head(df1)
If we instead want the counts returned as a separate table with one row per interval, using @minem's example data:
# Each vec1 value becomes a zero-width interval; joining df1 against it with
# by = .EACHI returns one row per df1 interval, where N is the number of
# values satisfying START <= value <= END (0 when there is no match).
# The upper bound uses <= so that a value equal to END is counted too.
data.table(START = vec1, END = vec1)[
  df1, .N,
  on = .(START >= START, END <= END), by = .EACHI
]
# START END N
#1: 1 4 2
#2: 8 9 1
#3: 11 30 0
来源:https://stackoverflow.com/questions/56294563/how-to-count-matches-between-a-vector-and-dataframe-of-sequence-coordinates