Split string based on alternating character in R

后端 未结 9 453
醉话见心
醉话见心 2021-01-30 10:02

I\'m trying to figure out an efficient way to go about splitting a string like

\"111110000011110000111000\"

into a vector

[1] \         


        
相关标签:
9条回答
  • 2021-01-30 10:29

    How about this:

    s <- "111110000011110000111000"
    
    spl <- strsplit(s,"10|01")[[1]]
    l <- length(spl)
    sapply(1:l, function(i) paste0(spl[i],i%%2,ifelse(i==1 | i==l, "",i%%2)))
    
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    
    0 讨论(0)
  • 2021-01-30 10:30

    It's not really what the OP was looking for (concise R code), but thought I'd give it a try in Rcpp, and turned out relatively simple and about 5x faster than the fastest R-based answers.

    library(Rcpp)
    
    cppFunction(
      'std::vector<std::string> split_str_cpp(std::string x) {
    
      std::vector<std::string> parts;
    
      int start = 0;
    
      for(int i = 1; i <= x.length(); i++) {
          if(x[i] != x[i-1]) {
            parts.push_back(x.substr(start, i-start));
            start = i;
          } 
      }
    
      return parts;
    
      }')
    

    And testing on these

    str1 <- "111110000011110000111000"
    x1 <- "1111100000222000333300011110000111000"
    x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
    

    Gives the following output

    > split_str_cpp(str1)
    [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    > split_str_cpp(x1)
     [1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"  
    > split_str_cpp(x2)
     [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
    [15] "D"       "aa"      "BB"   
    

    And a benchmark shows it's about 5-10x faster than R solutions.

    akrun <- function(str1) strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
    
    richard1 <- function(x3){
      cs <- cumsum(
        rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
      )
      stri_sub(x3, c(1, head(cs + 1, -1)), cs)
    }
    
    richard2 <- function(x3) {
      cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
      stri_sub(x3, c(1, head(cs + 1, -1)), cs)
    }
    
    library(microbenchmark)
    library(stringi)
    
    set.seed(24)
    x3 <- stri_rand_strings(1, 1e6)
    
    microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)
    

    Comparison:

    Unit: relative
                  expr      min       lq     mean   median       uq      max neval
     split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
             akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
          richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
          richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20
    
    0 讨论(0)
  • 2021-01-30 10:35

    Simple for loop solution

    x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
    res_vector=substr(x,1,1)
    
    for (i in 2:nchar(x)) {
      tmp=substr(x,i,i)
      if (tmp==substr(x,i-1,i-1)) {
        res_vector[length(res_vector)]=paste0(res_vector[length(res_vector)],tmp)
      } else {
        res_vector[length(res_vector)+1]=tmp
      }
    }
    
    res_vector
    
    #[1] "aaaaa"  "bb"  "ccccccc"  "bbb"  "a"  "d"  "11111"  "00000"  "222"  "aaa"  "bb"  "cc"  "d"  "11"  "D"  "aa"  "BB"
    

    Or a maybe a little bit faster with a pre-allocated results vector

    x="aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
    res_vector=rep(NA_character_,nchar(x))
    res_vector[1]=substr(x,1,1)
    counter=1
    old_tmp=''
    
    for (i in 2:nchar(x)) {
      tmp=substr(x,i,i)
      if (tmp==old_tmp) {
        res_vector[counter]=paste0(res_vector[counter],tmp)
      } else {
        res_vector[counter+1]=tmp
        counter=counter+1
      }
      old_tmp=tmp
    }
    
    res_vector[!is.na(res_vector)]
    
    0 讨论(0)
  • 2021-01-30 10:38

    Another way would be to add whitespace between the alternating digits. This would work for any two, not just 1s and 0s. Then use strsplit on the whitespace:

    x <- "111110000011110000111000"
    
    (y <- gsub('(\\d)(?!\\1)', '\\1 \\2', x, perl = TRUE))
    # [1] "11111 00000 1111 0000 111 000 "
    
    
    strsplit(y, ' ')[[1]]
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    

    Or more succinctly as @akrun points out:

    strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    

    also changing \\d to \\w works also

    x  <- "aaaaabbcccccccbbbad"
    strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
    # [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"      
    
    x <- "111110000011110000111000"
    strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000" 
    

    You could also use \K (rather than explicitly using the capture groups, \\1 and \\2) which I don't see used a lot nor do I know how to explain it :}

    AFAIK \\K resets the starting point of the reported match and any previously consumed characters are no longer included, basically throwing away everything matched up to that point.

    x <- "1111100000222000333300011110000111000"
    (z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
    # [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "
    
    0 讨论(0)
  • 2021-01-30 10:40

    Original Approach: Here is a stringi approach that incorporates rle().

    x <- "111110000011110000111000"
    library(stringi)
    
    cs <- cumsum(
        rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
    )
    stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    

    Or, you can use the length argument in stri_sub()

    rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
    with(rl, {
        stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
    })
    # [1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    

    Updated for Efficiency: After realizing that base::strsplit() is faster than stringi::stri_split_boundaries(), here is a more efficient version of my previous answer using only base functions.

    set.seed(24)
    x3 <- stri_rand_strings(1L, 1e6L)
    
    system.time({
        cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
        substring(x3, c(1L, head(cs + 1L, -1L)), cs)
    })
    #   user  system elapsed 
    #  0.686   0.012   0.697 
    
    0 讨论(0)
  • 2021-01-30 10:41

    Another approach in case, using mapply:

    x="111110000011110000111000"
    
    with(rle(strsplit(x,'')[[1]]), 
         mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
    #[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    
    0 讨论(0)
提交回复
热议问题