R - Identify a sequence of row elements by groups in a dataframe

后端 未结 3 1242
执念已碎
执念已碎 2021-01-06 00:20

Consider the following sample dataframe:

> df
   id name time
1   1    b   10
2   1    b   12
3   1    a    0
4   2    a    5
5   2    b   11
6   2    a           


        
相关标签:
3条回答
  • 2021-01-06 00:44

    You can use an ifelse in filter with lag and lead, and then tidyr::spread to reshape to wide:

    library(tidyverse)
    
    df %>% arrange(id, time) %>% group_by(id) %>% 
        filter(ifelse(name == 'b',    # if name is b...
                      lag(name) == 'a',    # is the previous name a?
                      lead(name) == 'b')) %>%    # else if name is not b, is next name b?
        ungroup() %>% mutate(i = rep(seq(n() / 2), each = 2)) %>%    # create indices to spread by
        spread(name, time) %>% select(a, b)    # spread to wide and clean up
    
    ## # A tibble: 3 × 2
    ##       a     b
    ## * <int> <int>
    ## 1     3    10
    ## 2     5     7
    ## 3     9    11
    

    Based on the comment below, here's a version that uses gregexpr to find the first index of a matched pattern, which while more complicated, scales more easily to longer patterns like "aabb":

    df %>% group_by(pattern = 'aabb', id) %>%    # add pattern as column, group
        arrange(time) %>%
        # collapse each group to a string for name and a list column for time
        summarise(name = paste(name, collapse = ''), time = list(time)) %>% 
        # group and add list-column of start indices for each match
        rowwise() %>% mutate(i = gregexpr(pattern, name)) %>% 
        unnest(i, .drop = FALSE) %>%    # expand, keeping other list columns
        filter(i != -1) %>%    # chop out rows with no match from gregexpr
        rowwise() %>%    # regroup
        # subset with sequence from index through pattern length 
        mutate(time = list(time[i + 0:(nchar(pattern) - 1)]), 
               pattern = strsplit(pattern, '')) %>%    # expand pattern to list column
        rownames_to_column('match') %>%    # add rownames as match index column
        unnest(pattern, time) %>%    # expand matches in parallel
        # paste sequence onto each letter (important for spreading if repeated letters)
        group_by(match) %>% mutate(pattern = paste0(pattern, seq(n()))) %>% 
        spread(pattern, time)    # spread to wide form
    
    ## Source: local data frame [1 x 8]
    ## Groups: match [1]
    ## 
    ##   match    id  name     i    a1    a2    b3    b4
    ## * <chr> <int> <chr> <int> <int> <int> <int> <int>
    ## 1     1     1 aabba     1     0     3    10    12
    

    Note that if the pattern doesn't happen to be in alphabetical order, the resulting columns will not be ordered by their indices. Since indices are preserved, though, you can sort with something like select(1:4, parse_number(names(.)[-1:-4]) + 4).

    0 讨论(0)
  • 2021-01-06 00:52
    library(dplyr); library(tidyr)
    
    # sort data frame by id and time
    df %>% arrange(id, time) %>% group_by(id) %>% 
    
           # get logical vector indicating rows of a followed by b and mark each pair as unique
           # by cumsum
           mutate(ab = name == "a" & lead(name) == "b", g = cumsum(ab)) %>% 
    
           # subset rows where conditions are met
           filter(ab | lag(ab)) %>% 
    
           # reshape your data frame to wide format
           select(-ab) %>% spread(name, time)
    
    
    #Source: local data frame [3 x 4]
    #Groups: id [2]
    
    #     id     g     a     b
    #* <int> <int> <int> <int>
    #1     1     1     3    10
    #2     2     1     5     7
    #3     2     2     9    11
    

    If length of the sequence is larger than two, then you will need to check multiple lags, and one option of this is to use shift function(which accepts a vector as lag/lead steps) from data.table combined with Reduce, say if we need to check pattern abb:

    library(dplyr); library(tidyr); library(data.table)
    pattern = c("a", "b", "b")
    len_pattern = length(pattern)
    
    df %>% arrange(id, time) %>% group_by(id) %>% 
    
           # same logic as before but use Reduce function to check multiple lags condition
           mutate(ab = Reduce("&", Map("==", shift(name, n = 0:(len_pattern - 1), type = "lead"), pattern)), 
                  g = cumsum(ab)) %>% 
    
           # use reduce or to subset sequence rows having the same length as the pattern
           filter(Reduce("|", shift(ab, n = 0:(len_pattern - 1), type = "lag"))) %>% 
    
           # make unique names
           group_by(g, add = TRUE) %>% mutate(name = paste(name, 1:n(), sep = "_")) %>% 
    
           # pivoting the table to wide format
           select(-ab) %>% spread(name, time) 
    
    #Source: local data frame [1 x 5]
    #Groups: id, g [1]
    
    #     id     g   a_1   b_2   b_3
    #* <int> <int> <int> <int> <int>
    #1     1     1     3    10    12
    
    0 讨论(0)
  • It's somewhat convoluted, but how about a rolling join?

    library(data.table)
    setorder(setDT(df), id, time)
    
    df[ name == "b" ][
        df[, if(name == "a") .(time = last(time)), by=.(id, name, r = rleid(id,name))],
        on = .(id, time),
        roll = -Inf,
        nomatch = 0,
        .(a = i.time, b = x.time)
    ]
    
       a  b
    1: 3 10
    2: 5  7
    3: 9 11
    
    0 讨论(0)
提交回复
热议问题