Split string based on alternating character in R

后端 未结 9 454
醉话见心
醉话见心 2021-01-30 10:02

I\'m trying to figure out an efficient way to go about splitting a string like

\"111110000011110000111000\"

into a vector

[1] \         


        
相关标签:
9条回答
  • 2021-01-30 10:42

    Try

    strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
    #[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    

    Update

    A modification of @rawr's solution with stri_extract_all_regex

    library(stringi)
    stri_extract_all_regex(str1, '(?:(\\w))\\1*')[[1]]
    #[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
    
    
    stri_extract_all_regex(x1, '(?:(\\w))\\1*')[[1]]
    #[1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"  
    #[10] "000"  
    
    stri_extract_all_regex(x2, '(?:(\\w))\\1*')[[1]]
    #[1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"  
    #[8] "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
    #[15] "D"       "aa"      "BB"     
    

    Benchmarks

    library(stringi) 
    set.seed(24)
    x3 <- stri_rand_strings(1, 1e4)
    
    akrun <- function() stri_extract_all_regex(x3, '(?:(\\w))\\1*')[[1]]
    #modified @thelatemail's function to make it bit more general
    thelate <- function() regmatches(x3,gregexpr("(?:(\\w))\\1*", x3, 
                perl=TRUE))[[1]]
    rawr <- function() strsplit(x3, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
    ananda <- function() unlist(read.fwf(textConnection(x3), 
                    rle(strsplit(x3, "")[[1]])$lengths, 
                    colClasses = "character"))
    Colonel <- function() with(rle(strsplit(x3,'')[[1]]), 
       mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
    
    Cryo <- function(){
       res_vector=rep(NA_character_,nchar(x3))
      res_vector[1]=substr(x3,1,1)
      counter=1
      old_tmp=''
    
       for (i in 2:nchar(x3)) {
        tmp=substr(x3,i,i)
        if (tmp==old_tmp) {
        res_vector[counter]=paste0(res_vector[counter],tmp)
        } else {
        res_vector[counter+1]=tmp
        counter=counter+1
        }
      old_tmp=tmp
       }
    
     res_vector[!is.na(res_vector)]
      }
    
    
     richard <- function(){
         cs <- cumsum(
         rle(stri_split_boundaries(x3, type = "character")[[1L]])$lengths
       )
       stri_sub(x3, c(1, head(cs + 1, -1)), cs)
      }
    
     nicola<-function(x) {
       indices<-c(0,which(diff(as.integer(charToRaw(x)))!=0),nchar(x))
       substring(x,indices[-length(indices)]+1,indices[-1])
     }
    
     richard2 <- function() {
      cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
      stri_sub(x3, c(1, head(cs + 1, -1)), cs)
     }
    
    system.time(akrun())
    # user  system elapsed 
    # 0.003   0.000   0.003 
    
    system.time(thelate())
    #   user  system elapsed 
    #  0.272   0.001   0.274 
    
    system.time(rawr())
    # user  system elapsed 
    #  0.397   0.001   0.398 
    
    system.time(ananda())
    #  user  system elapsed 
    # 3.744   0.204   3.949 
    
    system.time(Colonel())
    #   user  system elapsed 
    #  0.154   0.001   0.154 
    
    system.time(Cryo())
    #  user  system elapsed 
    # 0.220   0.005   0.226 
    
    system.time(richard())
    #  user  system elapsed 
    # 0.007   0.000   0.006 
    
    system.time(nicola(x3))
    # user  system elapsed 
    # 0.190   0.001   0.191 
    

    On a slightly bigger string,

    set.seed(24)
    x3 <- stri_rand_strings(1, 1e6)
    
    system.time(akrun())
    #user  system elapsed 
    #0.166   0.000   0.155 
    system.time(richard())
    #  user  system elapsed 
    # 0.606   0.000   0.569 
    system.time(richard2())
    #  user  system elapsed 
    # 0.518   0.000   0.487 
    
    system.time(Colonel())
    #  user  system elapsed 
    # 9.631   0.000   9.358 
    
    
    library(microbenchmark)
     microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
     #Unit: relative
     #     expr      min       lq     mean   median       uq      max neval cld
     # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
     #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
     # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a 
    

    NOTE: Tried to run the other methods, but it takes a long time

    data

    str1 <- "111110000011110000111000"
    x1 <- "1111100000222000333300011110000111000"
    x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
    
    0 讨论(0)
  • 2021-01-30 10:47

    Variation on a theme:

    x <- "111110000011110000111000"
    regmatches(x,gregexpr("1+|0+",x))[[1]]
    #[1] "11111" "00000" "1111"  "0000"  "111"   "000"
    
    0 讨论(0)
  • 2021-01-30 10:54

    You could probably make use of substr or read.fwf along with rle (though it is unlikely to be as efficient as any regex-based solution):

    x <- "111110000011110000111000"
    unlist(read.fwf(textConnection(x), 
                    rle(strsplit(x, "")[[1]])$lengths, 
                    colClasses = "character"))
    #      V1      V2      V3      V4      V5      V6 
    # "11111" "00000"  "1111"  "0000"   "111"   "000"
    

    One advantage of this approach is that it would work even with, say:

    x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
                 rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
    x
    # [1] "aaaaabbcccccccbbbad"
    
    unlist(read.fwf(textConnection(x), 
                    rle(strsplit(x, "")[[1]])$lengths, 
                    colClasses = "character"))
    #        V1        V2        V3        V4        V5        V6 
    #   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d" 
    
    0 讨论(0)
提交回复
热议问题