Using tidyverse: counting before and after a change in value within groups, generating new variables for each unique shift

Asked 2021-01-04 04:45 by 南旧

I am looking for a tidyverse solution that can count occurrences before and after each change in the value of TF, within groups (id), in the data datatbl.
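
To make the task concrete: within each id, every change in the value of TF starts a new run, and each change should get its own PM column that counts down (-n, ..., -1) before the change and up (1, 2, ...) after it, with NA elsewhere. A minimal sketch of the run structure this relies on (toy values, not the original data):

    TF <- c(0, 0, 1, 1, 1)  # one id group, NAs already removed
    rle(TF)                 # two runs -> one change -> one PM column
    # Run Length Encoding
    #   lengths: int [1:2] 2 3
    #   values : num [1:2] 0 1
    # desired PM column for this group: -2 -1  1  2  3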

3 Answers
  • 2021-01-04 05:22

    I had an answer without data.table, but it was not using dplyr. Here is my attempt using dplyr:

    # Remove the NAs
    dfr <- df %>% filter(!is.na(TF)) %>%
      # group by id
      group_by(id) %>%
      # calculate the rle on TF for each group
      do(., mrle = rle(.$TF)) %>% mutate(Total = sum(mrle$lengths)) %>%
      # transform the rle result into a data.frame counting the values
      # after and before each change
      do({
        t <- .$mrle
        # for each run length except the last, generate one PM column
        res <- as.data.frame(lapply(seq_along(t$lengths[-length(t$lengths)]), function(i) {

          # counts before the change
          n1 <- t$lengths[i]
          # number of leading NAs that position the counts
          if (i == 1) {
            before <- 0
          } else {
            # note seq_len(i - 1): the colon in 1:i-1 binds tighter than -,
            # so 1:i-1 would mean (1:i) - 1
            before <- sum(t$lengths[seq_len(i - 1)])
          }

          # counts after the change
          n2 <- t$lengths[i + 1]

          # number of trailing NAs
          if (i == (length(t$lengths) - 1))
            after <- 0
          else
            after <- .$Total - before - n1 - n2

          # assemble the column
          c(rep(NA, before), -n1:-1, 1:n2, rep(NA, after))
        }))

        colnames(res) <- paste0("PM", 1:ncol(res))
        # preserve the id
        cbind(id = .$id, res)
      })
    
    # Join with the original data.frame
    res <- df %>% mutate(rn = row_number()) %>%
      filter(!is.na(TF)) %>%
      bind_cols(dfr) %>%
      right_join(df %>% mutate(rn = row_number())) %>%
      select(-rn, -id1)

    # Verify against the expected output dfa
    mapply(all.equal, dfa, res)
    #  id   TF PM01 PM02 PM03 PM04 PM05 
    #TRUE TRUE TRUE TRUE TRUE TRUE TRUE
    
  • 2021-01-04 05:25

    Here is another tidyverse approach that uses the dplyr, tidyr and zoo (for its na.locf function) packages:

    First, instead of dropping the NAs in the TF column and joining them back afterwards, as all the other suggested approaches do (including the data.table approach), I wrote a helper function that counts forward by chunks while ignoring NAs:

    forward_count <- function(v) {
        valid <- !is.na(v)
        valid_v <- v[valid]
        # lengths of all runs except the last
        chunk_size <- head(rle(valid_v)$lengths, -1)
        # positions where a new run starts
        idx <- cumsum(chunk_size) + 1
        ones <- rep(1, length(valid_v))
        # reset the running sum to 1 at the start of each run
        ones[idx] <- 1 - chunk_size
        # write the per-run counts back into the non-NA positions
        v[valid] <- cumsum(ones)
        v
    }
    

    And it works as required for the count after each change:

    v <- sample(c(NA, 0, 1), 15, replace = TRUE)
    v
    # [1] NA NA NA  0  1 NA  1 NA  1  1  0  1  0  0  0
    forward_count(v)
    # [1] NA NA NA  1  1 NA  2 NA  3  4  1  1  1  2  3
    

    The count before each change can be implemented with the exact same function by reversing the vector, counting, and then reversing and negating the result:

    -rev(forward_count(rev(v)))
    # [1] NA NA NA -1 -4 NA -3 NA -2 -1 -1 -1 -3 -2 -1
    

    Now define the headers, the forward count column fd, and the backward count column bd, using dplyr:

    library(dplyr); library(tidyr); library(zoo);
    
    tidy_method <- function(df) {
        df %>% 
            group_by(id) %>% 
            mutate(
                rle_id = cumsum(diff(na.locf(c(0, TF))) != 0),   # chunk id for constant TF
                PM_fd = if_else(                 # PM count after change headers
                    rle_id == head(rle_id, 1), 
                    "head", sprintf('PM%02d', rle_id)
                ), 
                PM_bd = if_else(                 # shift the header up as before change headers
                    rle_id == tail(rle_id, 1), 
                    "tail", sprintf('PM%02d', rle_id+1)
                ), 
                fd = forward_count(TF),             # after change count
                bd = -rev(forward_count(rev(TF))),  # before change count
                rn = seq_along(id)) %>%             # row number
            gather(key, value, PM_fd, PM_bd) %>%    # align headers with the count
            mutate(count_ = if_else(key == "PM_fd", fd, bd)) %>%
            select(-key) %>% spread(value, count_) %>%    # reshape PM values into header columns
            select(id, TF, rn, matches('PM')) %>%  # drop no longer needed columns
            arrange(id, rn) %>% select(-rn)
    }
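
    The rle_id line above is the heart of the method: na.locf() carries the last non-NA TF value forward, so NA rows inherit the chunk id of the preceding run instead of breaking it. A small sketch of what that expression produces (toy vector, not the question's data):

    library(zoo)
    TF <- c(NA, 0, 0, 1, NA, 1, 0)
    cumsum(diff(na.locf(c(0, TF))) != 0)
    # [1] 0 0 0 1 1 1 2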
    

    Timing compared with the data.table method:

    Define the data.table method as:

    dt_method <- function(df) {
        tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
            , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
    
        res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
            rl == V1, PM := dn][rl == V1 + 1L, PM := up][
                , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
                    df, on = .(rn, id, TF)][, -"rn"]
        res_dt
    }
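
    The key step in dt_method is the self-join that replicates every row once per PM column: seq_len(max(rl) - 1L) by id enumerates the change boundaries. A minimal sketch of that expansion (toy values):

    library(data.table)
    dt <- data.table(id = c(0, 0, 1), rl = c(1, 2, 3))
    dt[, seq_len(max(rl) - 1L), by = .(id)]
    #    id V1
    # 1:  0  1
    # 2:  1  1
    # 3:  1  2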
    

    Data: a medium-sized data set built by repeating the sample data frame 200 times:

    df_test <- bind_rows(rep(list(df), 200))
    
    microbenchmark::microbenchmark(dt_method(df_test), tidy_method(df_test), times = 10)
    #Unit: milliseconds
    #                 expr       min        lq      mean    median        uq       max neval
    #   dt_method(df_test) 2321.5852 2439.8393 2490.8583 2456.1118 2557.4423 2834.2399    10
    # tidy_method(df_test)  402.3624  412.2838  437.0801  414.5655  418.6564  540.9667    10
    

    Order the data.table result by id and convert all columns to numeric; the results from the data.table and tidyverse approaches are identical:

    identical(
        as.data.frame(dt_method(df_test)[order(id), lapply(.SD, as.numeric)]), 
        as.data.frame(tidy_method(df_test))
    )
    # [1] TRUE
    
  • 2021-01-04 05:37

    Update with a slightly optimized data.table function:

    This should probably go under the old question, but maybe it will trigger some further optimization.

    To keep things moving I played a bit with the data.table function and got it down to about twice the execution time of the tidyverse version. The bottleneck is the dcast() call (see the profvis timings referenced below); the main change is collapsing id, TF and rn into a single United key so that dcast() reshapes over one column:

    dt_method <- function(dt_test) {
      tmp_dt <- dt_test[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
        , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][, ':='(
          rl_PM = sprintf("PM%02d", rl),
          United = paste(id, TF, rn, sep = '_')
        )]
    
      res_dt <- tmp_dt[, .(sprintf("PM%02d", seq_len(max(rl) - 1L)), seq_len(max(rl) - 1L)), by = .(id)] %>% 
        tmp_dt[., on = .(id), allow.cartesian = TRUE] %>%  
        .[rl == V2, PM := dn] %>%
        .[rl == V2 + 1L, PM := up] %>%
        dcast(., United ~ V1, value.var = "PM") %>%
        .[, c('id', 'TF', 'rn') := lapply(tstrsplit(United, '_'), as.numeric)] %>%
        .[dt_test, on = .(rn, id, TF)] %>% .[, -c('rn', 'United')]
      res_dt
    }
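
    To illustrate the United trick, here is a minimal sketch of that reshaping step with hypothetical toy values:

    library(data.table)
    toy <- data.table(United = c("0_0_1", "0_0_1", "0_1_2"),
                      V1 = c("PM01", "PM02", "PM01"),
                      PM = c(-1L, 2L, 1L))
    dcast(toy, United ~ V1, value.var = "PM")
    #    United PM01 PM02
    # 1:  0_0_1   -1    2
    # 2:  0_1_2    1   NA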
    

    Pipes were needed to work around some odd errors, but I still consider them acceptable even for data.table.
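
    For reference, magrittr's dot placeholder is what makes such chains work with data.table: each .[...] step is an ordinary call on the piped object. A minimal sketch:

    library(data.table); library(magrittr)
    dt <- data.table(x = 1:4)
    dt %>% .[x > 1] %>% .[, y := x * 2L] %>% .[]
    #    x y
    # 1: 2 4
    # 2: 3 6
    # 3: 4 8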

    Microbenchmark results:

    Unit: milliseconds
                     expr      min       lq      mean    median        uq       max neval
       dt_method(dt_test) 868.1491 932.8076 1048.5077 1029.9609 1078.0735 1518.0327    10
     tidy_method(df_test) 478.6824 515.5639  557.9644  565.9422  585.3143  622.1093    10
    

    And identical() with fixed order of columns:

    identical(
      dt_method(dt_test)[order(id), lapply(.SD, as.numeric)] %>% setcolorder(c('id', 'TF', setdiff(names(.), c('id', 'TF')))) %>% as.data.frame(),
      as.data.frame(tidy_method(df_test))
    )
    

    profvis timings: (screenshot not reproduced here)

    Old part:

    Using Uwe's answer as a base:

    (Disclaimer: I do not use dplyr much and treated this as an exercise for myself, so it is surely not dplyr-optimal; see e.g. the dcast() call.)

    library(data.table)
    library(magrittr)
    library(dplyr)
    library(tibble)
    
    df <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 
                        1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
                 TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0, 0,
                        1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1))
    
    dfa <- tibble(id = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                         1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7),
                  TF = c(NA, 0, NA, 0, 0, 1, 1, 1, NA, 0, 0, NA, 0, 0,
                         0, 1, 1, 1, NA, NA, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1),
                  PM01 = c(NA, -3, NA, -2, -1, 1, 2, 3, NA, NA, NA, NA, -3, -2, -1,
                           1, 2, 3, NA, NA, -2, -1, 1, NA, NA, NA, NA, NA, NA, NA),
                  PM02 = c(NA, NA, NA, NA, NA, -3, -2, -1, NA, 1, 2, NA, NA, NA, NA,
                           NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, NA, NA, NA, NA, NA),
                  PM03 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                           NA, NA, NA, NA, NA, NA, NA, NA, -2, -1, 1, NA, NA, NA, NA),
                  PM04 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                           NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, NA, NA, NA),
                  PM05 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
                           NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1, 1, 2, 3))
    
    tmp_dt <- setDT(df)[, rn := .I][!is.na(TF)][, rl := rleid(TF), by = id][
      , c("up", "dn") := .(seq_len(.N), -rev(seq_len(.N))), by = .(id, rl)][]
    
    res_dt <- tmp_dt[tmp_dt[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE][
      rl == V1, PM := dn][rl == V1 + 1L, PM := up][
        , dcast(.SD, id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM")][
          df, on = .(rn, id, TF)][, -"rn"]
    res_dt
    
    all.equal(res_dt, as.data.table(dfa))
    

    As tidyverse-esque as possible:

    tmp_dplyr <- df %>%
      # create row id column (required for final join to get NA rows back in)
      mutate(rn = row_number()) %>%
      # ignore NA rows 
      filter(complete.cases(.)) %>%
      # number streaks of unique values within each group
      group_by(id) %>%
      mutate(rl = rleid(TF)) %>%
      # create ascending and descending counts for each streak;
      # this is done once to avoid repeatedly creating the counts for each PM
      # (slight performance gain)
      group_by(id, rl) %>%
      mutate(
        up = seq_len(n()),
        dn = -rev(seq_len(n()))
      )
    
    res_dplyr <- tmp_dplyr %>%
      ## Replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
      group_by(id) %>%
      ## The part below can surely be optimized for code length; it is just too early now...
      transmute(rl = max(rl)) %>% # Cannot transmute id directly
      unique() %>%
      ungroup() %>%
      slice(rep(1:n(), times = rl - 1L)) %>%
      group_by(id) %>%
      transmute(V1 = seq_len(max(rl) - 1L)) %>%
      ungroup() %>%
      right_join(tmp_dplyr, by = 'id') %>%
      ## End or replicating tmp[tmp[, seq_len(max(rl) - 1L), by = .(id)], on = .(id), allow.cartesian = TRUE]
      ## Copy descending counts to rows before the switch and ascending counts to rows after the switch
      mutate(
        PM = ifelse(rl == V1, dn, NA),
        PM = ifelse(rl == V1 + 1L, up, PM)
      ) %>%
      ## This is not very tidyverse-esque, but I have not figured out the gather/spread approach ...
      dcast(id + TF + rn ~ sprintf("PM%02d", V1), value.var = "PM") %>%
      full_join(df, by = c('rn', 'id', 'TF')) %>%
      select(-rn)
    
    all.equal( ## Using data.table all.equal
      res_dplyr[do.call(order, res_dplyr),] %>% as.data.table(),
      res_dt[do.call(order, res_dt),]
    )
    