问题
Do you know any ready-to-use method to obtain the overlap of two strings and also its length? However, only with R, maybe something from stringr? I was looking here, unfortunately without success.
str1 <- 'ABCDE'
str2 <- 'CDEFG'
str_overlap(str1, str2)
'CDE'
str_overlap_len(str1, str2)
3
Other examples:
str1 <- 'ATTAGACCTG'
str2 <- 'CCTGCCGGAA'
str_overlap(str1, str2)
'CCTG'
str_overlap_len(str1, str2)
4
///
str1 <- 'foobarandfoo'
str2 <- 'barand'
str_overlap(str1, str2)
'barand'
str_overlap_len(str1, str2)
6
/// Yes, there are two possible solutions here; always pick the longest overlap
str1 <- 'EFGABCDE'
str2 <- 'ABCDECDE'
str_overlap(str1, str2)
'ABCDE'
str_overlap_len(str1, str2)
5
I was wondering about a small homemade function for this, such as this one?
回答1:
It seems to me that you (OP) are not very concerned with the performance of the code but more interested in a potential approach to solving it without ready-made functions. So here is an example I came up with to compute the longest common substring. I have to note that this only returns the first longest common substring found, even when there are several of the same length. This is something you could modify to fit your needs. And please don't expect this to be super fast - it won't be.
#' Find the first longest common (contiguous) substring of two strings
#'
#' Brute-force search: every substring of the shorter input is tried, longest
#' first and, within one length, leftmost first. The first candidate that
#' occurs literally in the longer input is reported. For a shorter string of
#' `n` characters this tests up to `n * (n + 1) / 2` candidates, so it is not
#' meant for long inputs.
#'
#' @param str1,str2 Single character strings to compare.
#' @param ignore.case If `TRUE`, both strings are lower-cased before comparing
#'   (the reported substring is then lower-case as well).
#' @param verbose If `TRUE`, print every candidate as it is checked.
#' @return Invisibly, the first longest common substring, or `""` when the
#'   strings share no character. The match is also printed with `cat()`.
foo <- function(str1, str2, ignore.case = FALSE, verbose = FALSE) {
  if (ignore.case) {
    str1 <- tolower(str1)
    str2 <- tolower(str2)
  }
  # Candidates are cut from the shorter string; when the lengths are equal
  # no swap happens and `str2` supplies them.
  if (nchar(str1) < nchar(str2)) {
    tmp <- str2
    str2 <- str1
    str1 <- tmp
  }
  n <- nchar(str2)
  # Longest candidates first, so the first hit is a longest common substring.
  # seq_len() keeps both loops empty-safe (n == 0 runs nothing).
  for (len in rev(seq_len(n))) {
    # Every start position is visited, including the last one, so the final
    # single-character candidate is no longer skipped.
    for (start in seq_len(n - len + 1L)) {
      lcs <- substr(str2, start, start + len - 1L)
      if (verbose) cat("now checking:", lcs, "\n")
      # fixed = TRUE: compare literally, never as a regular expression.
      if (grepl(lcs, str1, fixed = TRUE)) {
        cat(
          "the (first) longest common substring is:", lcs,
          "of length", nchar(lcs), "\n"
        )
        return(invisible(lcs))
      }
    }
  }
  # Nothing in common: stay silent, but give callers a usable value.
  invisible("")
}
# Usage examples for foo(). The comment after each call is the output
# reported by the answer's author for that pair of strings.

# Overlap at the end of str1 / start of str2.
str1 <- 'ABCDE'
str2 <- 'CDEFG'
foo(str1, str2)
# the (first) longest common substring is: CDE of length 3
str1 <- 'ATTAGACCTG'
str2 <- 'CCTGCCGGAA'
foo(str1, str2)
# the (first) longest common substring is: CCTG of length 4
# str2 is fully contained in str1.
str1 <- 'foobarandfoo'
str2 <- 'barand'
foo(str1, str2)
# the (first) longest common substring is: barand of length 6
# Two common substrings exist ('ABCDE' and 'CDE'); the longer one is reported.
str1 <- 'EFGABCDE'
str2 <- 'ABCDECDE'
foo(str1, str2)
# the (first) longest common substring is: ABCDE of length 5
# Larger random inputs (500 and 250 characters drawn from upper- and
# lower-case letters), compared with and without case folding.
# NOTE(review): the two outputs below depend on the RNG stream; the default
# sample() algorithm changed in R 3.6.0, so a current R may print different
# substrings for this seed - confirm before relying on them.
set.seed(2018)
str1 <- paste(sample(c(LETTERS, letters), 500, TRUE), collapse = "")
str2 <- paste(sample(c(LETTERS, letters), 250, TRUE), collapse = "")
foo(str1, str2, ignore.case = TRUE)
# the (first) longest common substring is: oba of length 3
foo(str1, str2, ignore.case = FALSE)
# the (first) longest common substring is: Vh of length 2
回答2:
Hope this helps:
library(stringr)
#' Longest pieces of `x[1]` that occur contiguously in `x[2]`
#'
#' Candidates are built from the characters of `x[1]` with `combn()`, i.e.
#' they keep the characters' order but need not be adjacent in `x[1]`
#' (for `c("CDEWQ", "DEQ")` the result is `"DEQ"`). Each candidate is then
#' searched for literally in `x[2]`. The candidate length grows until no
#' candidate matches; the matches of the last successful length are returned.
#' `combn()` enumerates every combination, so the cost grows very quickly
#' with `nchar(x[1])`.
#'
#' @param x Character vector of length 2: `x[1]` supplies the candidates,
#'   `x[2]` is the string that is searched.
#' @return Character vector of the longest matches found in `x[2]` (one entry
#'   per occurrence), or `character(0)` when nothing matches.
larsub <- function(x) {
  a <- x[1]
  b <- x[2]
  chars <- strsplit(a, "")[[1]]

  # All literal, non-overlapping occurrences in `b` of each candidate.
  # fixed = TRUE keeps characters such as "." or "(" from acting as regex.
  hits_in_b <- function(candidates) {
    found <- lapply(candidates, function(p) {
      regmatches(b, gregexpr(p, b, fixed = TRUE))[[1]]
    })
    as.character(unlist(found, use.names = FALSE))
  }

  # Start with "no match" so a failure at length 1 returns an empty vector
  # instead of referring to an undefined variable.
  prior <- character(0)
  # seq_along() is empty-safe: an empty `a` skips the loop entirely.
  for (n in seq_along(chars)) {
    sb <- unique(combn(chars, n, FUN = paste, collapse = ""))
    hits <- hits_in_b(sb)
    if (length(hits) == 0) {
      return(prior)
    }
    prior <- hits
  }
  # Every length matched (all of `a` occurs in `b`): return the last matches
  # rather than falling off the end of the loop.
  prior
}
# Example runs of larsub(); each call passes both strings as one length-2
# character vector (first string supplies candidates, second is searched).
c1<-larsub(c('ABCD','BCDE'))
c2<-larsub(c('ABDFD','BCDE'))
c3<-larsub(c('CDEWQ','DEQ'))
c4<-larsub(c('BNEOYJBELMGY','BELM'))
# Print the four results in the order they were computed.
print(c1)
print(c2)
print(c3)
print(c4)
Output:
> print(c1)
[1] "BCD"
> print(c2)
[1] "B" "D"
> print(c3)
[1] "DEQ"
> print(c4)
[1] "BELM"
`
Disclaimer: the logic was borrowed from the LCS answer posted by @Rick Scriven to the question "longest common substring in R finding non-contiguous matches between the two strings".
来源:https://stackoverflow.com/questions/48701107/find-length-of-overlap-in-strings