So, In a string containing multiple 1\'s,
Now, it is possible that, the number
\'1\'
appears at several positions, let\'s say, at m
The following code does exactly what you ask for. Try it with str_groups('1101101101')
. It returns a list of 3-vectors. Note that the first triple is (1, 3, 4) because the character at the 10th position is also a 1.
str_groups <- function (s) {
digits <- as.numeric(strsplit(s, '')[[1]])
index1 <- which(digits == 1)
len <- length(digits)
back <- length(index1)
if (back == 0) return(list())
maxpitch <- (len - 1) %/% 2
patterns <- matrix(0, len, maxpitch)
result <- list()
for (pitch in 1:maxpitch) {
divisors <- which(pitch %% 1:(pitch %/% 2) == 0)
while (index1[back] > len - 2 * pitch) {
back <- back - 1
if (back == 0) return(result)
}
for (startpos in index1[1:back]) {
if (patterns[startpos, pitch] != 0) next
pos <- seq(startpos, len, pitch)
if (digits[pos[2]] != 1 || digits[pos[3]] != 1) next
repeats <- length(pos)
if (repeats > 3) for (i in 4:repeats) {
if (digits[pos[i]] != 1) {
repeats <- i - 1
break
}
}
continue <- F
for (subpitch in divisors) {
sublen <- patterns[startpos, subpitch]
if (sublen > pitch / subpitch * (repeats - 1)) {
continue <- T
break
}
}
if (continue) next
for (i in 1:repeats) patterns[pos[i], pitch] <- repeats - i + 1
result <- append(result, list(c(startpos, pitch, repeats)))
}
}
return(result)
}
Note: this algorithm has roughly quadratic runtime complexity, so if you make your strings twice as long, it will take four times as much time to find all patterns on average.
To aid understanding of the code. For particulars of R functions such as which
, consult the R online documentation, for example by running ?which
on the R command line.
PROCEDURE str_groups WITH INPUT $s (a string of the form /(0|1)*/):
digits := array containing the digits in $s
index1 := positions of the digits in $s that are equal to 1
len := pointer to last item in $digits
back := pointer to last item in $index1
IF there are no items in $index1, EXIT WITH empty list
maxpitch := the greatest possible interval between 1-digits, given $len
patterns := array with $len rows and $maxpitch columns, initially all zero
result := array of triplets, initially empty
FOR EACH possible $pitch FROM 1 TO $maxpitch:
divisors := array of divisors of $pitch (including 1, excluding $pitch)
UPDATE $back TO the last position at which a pattern could start;
IF no such position remains, EXIT WITH result
FOR EACH possible $startpos IN $index1 up to $back:
IF $startpos is marked as part of a pattern, SKIP TO NEXT $startpos
pos := possible positions of pattern members given $startpos, $pitch
IF either the 2nd or 3rd $pos is not 1, SKIP TO NEXT $startpos
repeats := the number of positions in $pos
IF there are more than 3 positions in $pos THEN
count how long the pattern continues
UPDATE $repeats TO the length of the pattern
END IF (more than 3 positions)
FOR EACH possible $subpitch IN $divisors:
check $patterns for pattern with interval $subpitch at $startpos
IF such a pattern is found AND it envelopes the current pattern,
SKIP TO NEXT $startpos
(using helper variable $continue to cross two loop levels)
END IF (pattern found)
END FOR (subpitch)
FOR EACH consecutive position IN the pattern:
UPDATE $patterns at row of position and column of $pitch TO ...
... the remaining length of the pattern at that position
END FOR (position)
APPEND the triplet ($startpos, $pitch, $repeats) TO $result
END FOR (startpos)
END FOR (pitch)
EXIT WITH $result
END PROCEDURE (str_groups)
Perhaps the following route will help:
Convert string to a vector of integers characters
v <- as.integer(strsplit(s, "")[[1]])
Repeatedly convert this vector to matrices of varying number of rows...
m <- matrix(v, nrow=...)
...and use rle
to find relevant patterns in the rows of the matrix m
:
rle(m[1, ]); rle(m[2, ]); ...
This is not a complete answer, but some ideas (partly based on comments):
z <- "1101101101"
zz <- as.numeric(strsplit(z,"")[[1]])
Compute autocorrelation function and draw plot: in this case I'm getting the periodicity=3 pretty crudely as the first point at which there is an increase followed by a decrease ...
a1 <- acf(zz)
first.peak <- which(diff(sign(diff(a1$acf[,,1])))==-2)[1]
Now we know the periodicity is 3; create runs of 3 with embed()
and analyze their similarities:
ee <- embed(zz,first.peak)
pp <- apply(ee,1,paste,collapse="")
mm <- outer(pp,pp,"==")
aa <- apply(mm[!duplicated(mm),],1,which)
sapply(aa,length) ## 3 3 2 ## number of repeats
sapply(aa,function(x) unique(diff(x))) ## 3 3 3