Consider the following sample dataframe:
> df
id name time
1 1 b 10
2 1 b 12
3 1 a 0
4 2 a 5
5 2 b 11
6 2 a
You can use an ifelse
in filter
with lag
and lead
, and then tidyr::spread
to reshape to wide:
library(tidyverse)
df %>% arrange(id, time) %>% group_by(id) %>%
filter(ifelse(name == 'b', # if name is b...
lag(name) == 'a', # is the previous name a?
lead(name) == 'b')) %>% # else if name is not b, is next name b?
ungroup() %>% mutate(i = rep(seq(n() / 2), each = 2)) %>% # create indices to spread by
spread(name, time) %>% select(a, b) # spread to wide and clean up
## # A tibble: 3 × 2
## a b
## * <int> <int>
## 1 3 10
## 2 5 7
## 3 9 11
Based on the comment below, here's a version that uses gregexpr
to find the first index of a matched pattern, which while more complicated, scales more easily to longer patterns like "aabb"
:
df %>% group_by(pattern = 'aabb', id) %>% # add pattern as column, group
arrange(time) %>%
# collapse each group to a string for name and a list column for time
summarise(name = paste(name, collapse = ''), time = list(time)) %>%
# group and add list-column of start indices for each match
rowwise() %>% mutate(i = gregexpr(pattern, name)) %>%
unnest(i, .drop = FALSE) %>% # expand, keeping other list columns
filter(i != -1) %>% # chop out rows with no match from gregexpr
rowwise() %>% # regroup
# subset with sequence from index through pattern length
mutate(time = list(time[i + 0:(nchar(pattern) - 1)]),
pattern = strsplit(pattern, '')) %>% # expand pattern to list column
rownames_to_column('match') %>% # add rownames as match index column
unnest(pattern, time) %>% # expand matches in parallel
# paste sequence onto each letter (important for spreading if repeated letters)
group_by(match) %>% mutate(pattern = paste0(pattern, seq(n()))) %>%
spread(pattern, time) # spread to wide form
## Source: local data frame [1 x 8]
## Groups: match [1]
##
## match id name i a1 a2 b3 b4
## * <chr> <int> <chr> <int> <int> <int> <int> <int>
## 1 1 1 aabba 1 0 3 10 12
Note that if the pattern doesn't happen to be in alphabetical order, the resulting columns will not be ordered by their indices. Since indices are preserved, though, you can sort with something like select(1:4, parse_number(names(.)[-1:-4]) + 4)
.
library(dplyr); library(tidyr)
# sort data frame by id and time
df %>% arrange(id, time) %>% group_by(id) %>%
# get logical vector indicating rows of a followed by b and mark each pair as unique
# by cumsum
mutate(ab = name == "a" & lead(name) == "b", g = cumsum(ab)) %>%
# subset rows where conditions are met
filter(ab | lag(ab)) %>%
# reshape your data frame to wide format
select(-ab) %>% spread(name, time)
#Source: local data frame [3 x 4]
#Groups: id [2]
# id g a b
#* <int> <int> <int> <int>
#1 1 1 3 10
#2 2 1 5 7
#3 2 2 9 11
If length of the sequence is larger than two, then you will need to check multiple lags, and one option of this is to use shift
function(which accepts a vector as lag/lead steps) from data.table
combined with Reduce
, say if we need to check pattern abb
:
library(dplyr); library(tidyr); library(data.table)
pattern = c("a", "b", "b")
len_pattern = length(pattern)
df %>% arrange(id, time) %>% group_by(id) %>%
# same logic as before but use Reduce function to check multiple lags condition
mutate(ab = Reduce("&", Map("==", shift(name, n = 0:(len_pattern - 1), type = "lead"), pattern)),
g = cumsum(ab)) %>%
# use reduce or to subset sequence rows having the same length as the pattern
filter(Reduce("|", shift(ab, n = 0:(len_pattern - 1), type = "lag"))) %>%
# make unique names
group_by(g, add = TRUE) %>% mutate(name = paste(name, 1:n(), sep = "_")) %>%
# pivoting the table to wide format
select(-ab) %>% spread(name, time)
#Source: local data frame [1 x 5]
#Groups: id, g [1]
# id g a_1 b_2 b_3
#* <int> <int> <int> <int> <int>
#1 1 1 3 10 12
It's somewhat convoluted, but how about a rolling join?
library(data.table)
setorder(setDT(df), id, time)
df[ name == "b" ][
df[, if(name == "a") .(time = last(time)), by=.(id, name, r = rleid(id,name))],
on = .(id, time),
roll = -Inf,
nomatch = 0,
.(a = i.time, b = x.time)
]
a b
1: 3 10
2: 5 7
3: 9 11