using tidyverse; counting after and before change in value, within groups, generating new variables for each unique shift

后端未结

关注

 3  759

南旧 2021-01-04 04:45

I am looking for a tidyverse-solution that can count occurrences of unique values of TF within groups, id in the data datatbl

3条回答

一生所求 (楼主)

2021-01-04 05:37

Update with a bit optimized data.table function:

Should probably go to the old question, but maybe this will trigger some further optimization.

To keep things flowing I have played a bit with the data.table function and get down to about twice of the execution time of the tidyverse version - the bottleneck is the dcast() function, see the screenshot from profvis below:

dt_method <- function(dt_test) {
  tmp_dt <- dt_test[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
    , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
      rl_PM = sprintf("PM%02d", rl),
      United = paste(id, TF, rn, sep = '_')
    )]

  res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>% 
    tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%  
    .[rl == V2, PM := dn] %>%
    .[rl == V2 + 1L, PM := up] %>%
    dcast(., United ~ V1, value.var = "PM") %>%
    .[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
    .[dt_test, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
  res_dt
}

Pipes were needed to deal with some odd errors, but I still consider them allowed even for data.table.

Microbenchmark results:

Unit: milliseconds
                 expr      min       lq      mean    median        uq       max neval
   dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327    10
 tidy_method(df_test) 478.6824 515.5639  557.9644  565.9422  585.3143  622.1093    10

And identical() with fixed order of columns:

identical(
  dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>% setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>% as.data.frame(),
  as.data.frame(tidy_method(df_test))
)

profvis timings:

Old part:

Using Uwe's answer as a base:

(Disclaimer: I am not using dplyr too much, treated this as an exercise for myself, so it is for sure not dplyr-optimal, see e.g. dcast.)

library(data.table)
library(magrittr)
library(dplyr)
library(tibble)

df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
                    1, 1, 1, 1,7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
             TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
                    1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))

dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                     1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
              TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                     0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
              PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                       1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
              PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
              PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
              PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
              PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))

tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
  , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]

res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
  rl == V1, PM := dn][rl == V1 + 1L, PM := up][
    , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
      df, on = .(rn, id, TF)][, -"rn"]
res_dt

all.equal(res_dt, as.data.table(dfa))

As much tidyverse-sque as possible:

tmp_dplyr <- df %>%
  # create row id column (required for final join to get NA rows back in)
  mutate(rn = row_number()) %>%
  # ignore NA rows 
  filter(complete.cases(.)) %>%
  # number streaks of unique values within each group
  group_by(id) %>%
  mutate(rl = rleid(TF)) %>%
  # create ascending and descending counts for each streak
  # this is done once to avoid repeatedly creation of counts for each PM 
  # (slight performance gain)
  group_by(id, rl) %>%
  mutate(
    up = seq_len(n()),
    dn = -rev(seq_len(n()))
  )

res_dplyr <- tmp_dplyr %>%
  ## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  group_by(id) %>%
  ## Part below can for sure be optimized for code length, it's just too early now...
  transmute(rl = max(rl)) %>% # Cannot transmute id directly
  unique() %>%
  ungroup() %>%
  slice(rep(1:n(), times = rl - 1L)) %>%
  group_by(id) %>%
  transmute(V1 = seq_len(max(rl) - 1L)) %>%
  ungroup() %>%
  right_join(tmp_dplyr, by = 'id') %>%
  ## End or replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
  ## Copy descending counts to rows before the switch and ascending counts to rows after the switch
  mutate(
    PM = ifelse(rl == V1, dn, NA),
    PM = ifelse(rl == V1 + 1L, up, PM)
  ) %>%
  ## This is very not tidyverse-sque, but I don't get the gather/spread ...
  dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
  full_join(df, by = c('rn', 'id', 'TF')) %>%
  select(-rn)

all.equal( ## Using data.table all.equal
  res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
  res_dt[do.call(order, res_dt),]
)

0 讨论(0)

查看其它3个回答