Finding duplicate values in r

前端未结

关注

 3  679

So, In a string containing multiple 1\'s,

Now, it is possible that, the number

\'1\'

appears at several positions, let\'s say, at m

Final version, optimized and without errors

str_groups <- function (s) {
    digits <- as.numeric(strsplit(s, '')[[1]])
    index1 <- which(digits == 1)
    len <- length(digits)
    back <- length(index1)
    if (back == 0) return(list())
    maxpitch <- (len - 1) %/% 2
    patterns <- matrix(0, len, maxpitch)
    result <- list()

    for (pitch in 1:maxpitch) {
        divisors <- which(pitch %% 1:(pitch %/% 2) == 0)
        while (index1[back] > len - 2 * pitch) {
            back <- back - 1
            if (back == 0) return(result)
        }
        for (startpos in index1[1:back]) {
            if (patterns[startpos, pitch] != 0) next
            pos <- seq(startpos, len, pitch)
            if (digits[pos[2]] != 1 || digits[pos[3]] != 1) next
            repeats <- length(pos)
            if (repeats > 3) for (i in 4:repeats) {
                if (digits[pos[i]] != 1) {
                    repeats <- i - 1
                    break
                }
            }
            continue <- F
            for (subpitch in divisors) {
                sublen <- patterns[startpos, subpitch]
                if (sublen > pitch / subpitch * (repeats - 1)) {
                    continue <- T
                    break
                }
            }
            if (continue) next
            for (i in 1:repeats) patterns[pos[i], pitch] <- repeats - i + 1
            result <- append(result, list(c(startpos, pitch, repeats)))
        }
    }

    return(result)
}

Note: this algorithm has roughly quadratic runtime complexity, so if you make your strings twice as long, it will take four times as much time to find all patterns on average.

Pseudocode version

To aid understanding of the code. For particulars of R functions such as which, consult the R online documentation, for example by running ?which on the R command line.

PROCEDURE str_groups WITH INPUT $s (a string of the form /(0|1)*/):
    digits := array containing the digits in $s
    index1 := positions of the digits in $s that are equal to 1
    len := pointer to last item in $digits
    back := pointer to last item in $index1
    IF there are no items in $index1, EXIT WITH empty list
    maxpitch := the greatest possible interval between 1-digits, given $len
    patterns := array with $len rows and $maxpitch columns, initially all zero
    result := array of triplets, initially empty

    FOR EACH possible $pitch FROM 1 TO $maxpitch:
        divisors := array of divisors of $pitch (including 1, excluding $pitch)
        UPDATE $back TO the last position at which a pattern could start;
            IF no such position remains, EXIT WITH result
        FOR EACH possible $startpos IN $index1 up to $back:
            IF $startpos is marked as part of a pattern, SKIP TO NEXT $startpos
            pos := possible positions of pattern members given $startpos, $pitch
            IF either the 2nd or 3rd $pos is not 1, SKIP TO NEXT $startpos
            repeats := the number of positions in $pos
            IF there are more than 3 positions in $pos THEN
                count how long the pattern continues
                UPDATE $repeats TO the length of the pattern
            END IF (more than 3 positions)
            FOR EACH possible $subpitch IN $divisors:
                check $patterns for pattern with interval $subpitch at $startpos
                IF such a pattern is found AND it envelopes the current pattern,
                    SKIP TO NEXT $startpos
                    (using helper variable $continue to cross two loop levels)
                END IF (pattern found)
            END FOR (subpitch)
            FOR EACH consecutive position IN the pattern:
                UPDATE $patterns at row of position and column of $pitch TO ...
                    ... the remaining length of the pattern at that position
            END FOR (position)
            APPEND the triplet ($startpos, $pitch, $repeats) TO $result
        END FOR (startpos)
    END FOR (pitch)

    EXIT WITH $result
END PROCEDURE (str_groups)

0 讨论(0)

挽巷

2021-01-23 06:42
Perhaps the following route will help:
1. Convert string to a vector of integers characters
```
v <- as.integer(strsplit(s, "")[[1]])
```
2. Repeatedly convert this vector to matrices of varying number of rows...
```
m <- matrix(v, nrow=...)
```
3. ...and use rle to find relevant patterns in the rows of the matrix m:
```
rle(m[1, ]); rle(m[2, ]); ...
```
0 讨论(0)
发布评论:

提交评论
- 加载中...
星月不相逢

2021-01-23 06:49
This is not a complete answer, but some ideas (partly based on comments):
```
z <- "1101101101"
zz <- as.numeric(strsplit(z,"")[[1]])
```
Compute autocorrelation function and draw plot: in this case I'm getting the periodicity=3 pretty crudely as the first point at which there is an increase followed by a decrease ...
```
a1 <- acf(zz)
first.peak <- which(diff(sign(diff(a1$acf[,,1])))==-2)[1]
```
Now we know the periodicity is 3; create runs of 3 with embed() and analyze their similarities:
```
ee <- embed(zz,first.peak)
pp <- apply(ee,1,paste,collapse="")
mm <- outer(pp,pp,"==")
aa <- apply(mm[!duplicated(mm),],1,which)
sapply(aa,length)  ## 3 3 2   ## number of repeats
sapply(aa,function(x) unique(diff(x)))  ## 3 3 3
```
0 讨论(0)
发布评论:

提交评论
- 加载中...