Assign event number based on Date of occurece in R dataframe

℡╲_俬逩灬. 提交于 2021-01-29 19:50:40

问题


How to assign an event number based on their date of occurrence satisfying the following conditions.

  1. If the event occurs for at least 3 consecutive days ( or more ) assign event number e1 and so on and mutate (join) with the original data frame.
  2. If the occurrence is not for continuous 3 days, assign NA and mutate with the original data frame. In time series dts how can I achieve it. The output data frame would be like dts_output (done manually).

    dts<-structure(list(Date = structure(c(16442, 16443, 16444, 16445, 
     16484, 16485, 16486, 16487, 16488, 16489, 16490, 16491, 16492, 
    16493, 16499, 16500, 16511, 16512, 16513), class = "Date"), cct = c(11, 
     11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
     11, 11)), row.names = c(NA, -19L), class = c("tbl_df", "tbl", 
     "data.frame"))

    dts

#Expected output

    dts_output<-structure(list(Date = structure(c(16442, 16443, 16444, 16445, 
           16484, 16485, 16486, 16487, 16488, 16489, 16490, 16491, 16492, 
           16493, 16499, 16500, 16511, 16512, 16513), class = "Date"), cct = c(11, 
           11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 
           11, 11), event = c("e1", "e1", "e1", "e1", "e2", "e2", "e2", 
           "e2", "e2", "e2", "e2", "e2", "e2", "e2", NA, NA, "e3", "e3", 
           "e3")), row.names = c(NA, -19L), spec = structure(list(cols = list(
           Date = structure(list(), class = c("collector_character", 
           "collector")), cct = structure(list(), class = c("collector_double", 
           "collector")), event = structure(list(), class = c("collector_character", 
           "collector"))), default = structure(list(), class = c("collector_guess", 
           "collector")), skip = 1L), class = "col_spec"), class = c("spec_tbl_df", 
           "tbl_df", "tbl", "data.frame"))
    dts_output

回答1:


Maybe a long path but can do the task:

library(dplyr)
library(tidyr)
#Code
dts$Var <- c(0,diff(dts$Date))
i <- which(dts$Var!=1)
dts$Var <- ifelse(dts$Var==1,NA,dts$Var)
dts$Var[i] <- 1:length(i)
#Fill
input1 <- dts %>% fill(Var) %>%
  group_by(Var) %>%
  mutate(Var2=ifelse(n()>=3,cur_group_id(),NA))
#Extract unique
add <- data.frame(Var2=unique(na.omit(input1$Var2)),stringsAsFactors = F)
add$Group <- paste0('e',1:nrow(add))
#Merge
input2 <- input1 %>% left_join(add) %>%
  select(-c(Var,Var2))

Output:

# A tibble: 19 x 4
# Groups:   Var [4]
     Var Date         cct Group
   <dbl> <date>     <dbl> <chr>
 1     1 2015-01-07    11 e1   
 2     1 2015-01-08    11 e1   
 3     1 2015-01-09    11 e1   
 4     1 2015-01-10    11 e1   
 5     2 2015-02-18    11 e2   
 6     2 2015-02-19    11 e2   
 7     2 2015-02-20    11 e2   
 8     2 2015-02-21    11 e2   
 9     2 2015-02-22    11 e2   
10     2 2015-02-23    11 e2   
11     2 2015-02-24    11 e2   
12     2 2015-02-25    11 e2   
13     2 2015-02-26    11 e2   
14     2 2015-02-27    11 e2   
15     3 2015-03-05    11 NA   
16     3 2015-03-06    11 NA   
17     4 2015-03-17    11 e3   
18     4 2015-03-18    11 e3   
19     4 2015-03-19    11 e3 



回答2:


An option with data.table

library(data.table)
setDT(dts)[, grp :=cumsum(c(TRUE, diff(Date) != 1)), .(cct)]
i1 <- dts[, .I[.N >= 3], .(cct, grp)]$V1
dts[i1,  event := paste0('e', .GRP), .(cct, grp)][, grp := NULL][]

-output

#         Date cct event
# 1: 2015-01-07  11    e1
# 2: 2015-01-08  11    e1
# 3: 2015-01-09  11    e1
# 4: 2015-01-10  11    e1
# 5: 2015-02-18  11    e2
# 6: 2015-02-19  11    e2
# 7: 2015-02-20  11    e2
# 8: 2015-02-21  11    e2
# 9: 2015-02-22  11    e2
#10: 2015-02-23  11    e2
#11: 2015-02-24  11    e2
#12: 2015-02-25  11    e2
#13: 2015-02-26  11    e2
#14: 2015-02-27  11    e2
#15: 2015-03-05  11  <NA>
#16: 2015-03-06  11  <NA>
#17: 2015-03-17  11    e3
#18: 2015-03-18  11    e3
#19: 2015-03-19  11    e3

Or using tidyverse

library(dplyr)
library(stringr)
dts %>%
     group_by(cct) %>%
     group_by(grp = cumsum(c(TRUE, diff(Date) != 1)), .add = TRUE) %>%
     filter(n() >=3) %>%
     mutate(event = str_c('e', cur_group_id())) %>%
     ungroup %>% 
     select(-grp) %>% 
     left_join(dts %>% 
                 mutate(rn = row_number()), .) %>%
     select(-rn)
# A tibble: 19 x 3
#   Date         cct event
#   <date>     <dbl> <chr>
# 1 2015-01-07    11 e1   
# 2 2015-01-08    11 e1   
# 3 2015-01-09    11 e1   
# 4 2015-01-10    11 e1   
# 5 2015-02-18    11 e2   
# 6 2015-02-19    11 e2   
# 7 2015-02-20    11 e2   
# 8 2015-02-21    11 e2   
# 9 2015-02-22    11 e2   
#10 2015-02-23    11 e2   
#11 2015-02-24    11 e2   
#12 2015-02-25    11 e2   
#13 2015-02-26    11 e2   
#14 2015-02-27    11 e2   
#15 2015-03-05    11 <NA> 
#16 2015-03-06    11 <NA> 
#17 2015-03-17    11 e3   
#18 2015-03-18    11 e3   
#19 2015-03-19    11 e3   

Or it can be simplified to

dts %>%
  group_by(cct) %>%
  group_by(grp = cumsum(c(TRUE, diff(Date) != 1)), .add = TRUE) %>% 
  mutate(event = if(n() >=3) cur_group_id()[n() >=3] else NA ) %>% 
  ungroup %>% 
  mutate(event = str_c('e', as.integer(factor(event))))


来源:https://stackoverflow.com/questions/65238734/assign-event-number-based-on-date-of-occurece-in-r-dataframe

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!