I\'m trying to figure out an efficient way to go about splitting a string like
\"111110000011110000111000\"
into a vector
[1] \
How about this:
s <- "111110000011110000111000"
spl <- strsplit(s,"10|01")[[1]]
l <- length(spl)
sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))
# [1] "11111" "00000" "1111" "0000" "111" "000"
It's not really what the OP was looking for (concise R code), but thought I'd give it a try in Rcpp
, and turned out relatively simple and about 5x faster than the fastest R-based answers.
library(Rcpp)
cppFunction(
'std::vector<std::string> split_str_cpp(std::string x) {
std::vector<std::string> parts;
int start = 0;
for(int i = 1; i <= x.length(); i++) {
if(x[i] != x[i-1]) {
parts.push_back(x.substr(start, i-start));
start = i;
}
}
return parts;
}')
And testing on these
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
Gives the following output
> split_str_cpp(str1)
[1] "11111" "00000" "1111" "0000" "111" "000"
> split_str_cpp(x1)
[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000"
> split_str_cpp(x2)
[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11"
[15] "D" "aa" "BB"
And a benchmark shows it's about 5-10x faster than R solutions.
akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
richard1 <- function(x3){
cs <- cumsum(
rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
)
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
richard2 <- function(x3) {
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
stri_sub(x3, c(1, head(cs + 1, -1)), cs)
}
library(microbenchmark)
library(stringi)
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)
microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
Comparison:
Unit: relative
expr min lq mean median uq max neval
split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134 20
richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446 20
richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680 20
Simple for
loop solution
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=substr(x,1,1)
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==substr(x,i-1,i-1)) {
res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
} else {
res_vector[length(res_vector)+1]=tmp
}
}
res_vector
#[1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d" "11111" "00000" "222" "aaa" "bb" "cc" "d" "11" "D" "aa" "BB"
Or a maybe a little bit faster with a pre-allocated results vector
x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
res_vector=rep(NA_character_,nchar(x))
res_vector[1]=substr(x,1,1)
counter=1
old_tmp=''
for (i in 2:nchar(x)) {
tmp=substr(x,i,i)
if (tmp==old_tmp) {
res_vector[counter]=paste0(res_vector[counter],tmp)
} else {
res_vector[counter+1]=tmp
counter=counter+1
}
old_tmp=tmp
}
res_vector[!is.na(res_vector)]
Another way would be to add whitespace between the alternating digits. This would work for any two, not just 1s and 0s. Then use strsplit
on the whitespace:
x <- "111110000011110000111000"
(y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "
strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or more succinctly as @akrun points out:
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
also changing \\d
to \\w
works also
x <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa" "bb" "ccccccc" "bbb" "a" "d"
x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111" "0000" "111" "000"
You could also use \K
(rather than explicitly using the capture groups, \\1
and \\2
) which I don't see used a lot nor do I know how to explain it :}
AFAIK \\K
resets the starting point of the reported match and any previously consumed characters are no longer included, basically throwing away everything matched up to that point.
x <- "1111100000222000333300011110000111000"
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
Original Approach: Here is a stringi approach that incorporates rle()
.
x <- "111110000011110000111000"
library(stringi)
cs <- cumsum(
rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111" "0000" "111" "000"
Or, you can use the length
argument in stri_sub()
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111" "0000" "111" "000"
Updated for Efficiency: After realizing that base::strsplit()
is faster than stringi::stri_split_boundaries()
, here is a more efficient version of my previous answer using only base functions.
set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)
system.time({
cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
# user system elapsed
# 0.686 0.012 0.697
Another approach in case, using mapply
:
x="111110000011110000111000"
with(rle(strsplit(x,'')[[1]]),
mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111" "0000" "111" "000"