I am looking for a tidyverse-solution that can count occurrences of unique values of TF
within groups, id
in the data datatbl
Should probably go to the old question, but maybe this will trigger some further optimization.
To keep things flowing I have played a bit with the data.table
function and get down to about twice of the execution time of the tidyverse
version - the bottleneck is the dcast()
function, see the screenshot from profvis
below:
dt_method <- function(dt_test) {
tmp_dt <- dt_test[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
rl_PM = sprintf("PM%02d", rl),
United = paste(id, TF, rn, sep = '_')
)]
res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>%
tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%
.[rl == V2, PM := dn] %>%
.[rl == V2 + 1L, PM := up] %>%
dcast(., United ~ V1, value.var = "PM") %>%
.[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
.[dt_test, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
res_dt
}
Pipes were needed to deal with some odd errors, but I still consider them allowed even for data.table
.
Microbenchmark results:
Unit: milliseconds
expr min lq mean median uq max neval
dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327 10
tidy_method(df_test) 478.6824 515.5639 557.9644 565.9422 585.3143 622.1093 10
And identical()
with fixed order of columns:
identical(
dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>% setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>% as.data.frame(),
as.data.frame(tidy_method(df_test))
)
profvis
timings:
Using Uwe's answer as a base:
(Disclaimer: I am not using dplyr
too much, treated this as an exercise for myself, so it is for sure not dplyr
-optimal, see e.g. dcast
.)
library(data.table)
library(magrittr)
library(dplyr)
library(tibble)
df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1,7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))
dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))
tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
, c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
rl == V1, PM := dn][rl == V1 + 1L, PM := up][
, dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
df, on = .(rn, id, TF)][, -"rn"]
res_dt
all.equal(res_dt, as.data.table(dfa))
As much tidyverse-sque as possible:
tmp_dplyr <- df %>%
# create row id column (required for final join to get NA rows back in)
mutate(rn = row_number()) %>%
# ignore NA rows
filter(complete.cases(.)) %>%
# number streaks of unique values within each group
group_by(id) %>%
mutate(rl = rleid(TF)) %>%
# create ascending and descending counts for each streak
# this is done once to avoid repeatedly creation of counts for each PM
# (slight performance gain)
group_by(id, rl) %>%
mutate(
up = seq_len(n()),
dn = -rev(seq_len(n()))
)
res_dplyr <- tmp_dplyr %>%
## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
group_by(id) %>%
## Part below can for sure be optimized for code length, it's just too early now...
transmute(rl = max(rl)) %>% # Cannot transmute id directly
unique() %>%
ungroup() %>%
slice(rep(1:n(), times = rl - 1L)) %>%
group_by(id) %>%
transmute(V1 = seq_len(max(rl) - 1L)) %>%
ungroup() %>%
right_join(tmp_dplyr, by = 'id') %>%
## End or replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
## Copy descending counts to rows before the switch and ascending counts to rows after the switch
mutate(
PM = ifelse(rl == V1, dn, NA),
PM = ifelse(rl == V1 + 1L, up, PM)
) %>%
## This is very not tidyverse-sque, but I don't get the gather/spread ...
dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
full_join(df, by = c('rn', 'id', 'TF')) %>%
select(-rn)
all.equal( ## Using data.table all.equal
res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
res_dt[do.call(order, res_dt),]
)