I've got a dataset with a start date and an end date, and I want to split each row of this data frame into one row per calendar year covered by its period. Take this data frame for example:
Another approach could be:
library(dplyr)
library(lubridate)
# sample data
df <- data.frame(
  starting_date = as.Date(c("2015-06-01", "2013-06-01", "2016-02-11")),
  end_date = as.Date(c("2017-09-30", "2017-11-11", "2017-01-01")),
  col3 = c("AAA", "BBB", "CCC"),
  col4 = c("33445454", "565664", "123")
)

# Split every row of `data` into one row per calendar year covered by the
# closed period [start_col, end_col].
#
# data:      a data frame holding two Date columns plus any other columns.
# start_col: name of the Date column holding the first day of the period.
# end_col:   name of the Date column holding the last day of the period.
#
# Returns a data frame with the start and end columns first (each piece
# clamped to its calendar year), followed by the remaining columns of `data`
# repeated for every piece. Row names are reset to 1..n.
# Stops with an error if a date column is not of class Date, contains NA, or
# if any period ends before it starts.
split_by_year <- function(data,
                          start_col = "starting_date",
                          end_col = "end_date") {
  starts <- data[[start_col]]
  ends <- data[[end_col]]
  stopifnot(
    inherits(starts, "Date"),
    inherits(ends, "Date"),
    !anyNA(starts),
    !anyNA(ends),
    all(starts <= ends)
  )

  start_year <- as.integer(format(starts, "%Y"))
  end_year <- as.integer(format(ends, "%Y"))
  # Number of calendar years each period touches.
  n_pieces <- end_year - start_year + 1L

  # One entry per output row: the source row it comes from and its year.
  row_idx <- rep(seq_len(nrow(data)), n_pieces)
  piece_year <- rep(start_year, n_pieces) + sequence(n_pieces) - 1L

  year_first <- as.Date(sprintf("%04d-01-01", piece_year))
  year_last <- as.Date(sprintf("%04d-12-31", piece_year))

  # Clamp each year to the period. Working from the year boundaries (rather
  # than searching a daily sequence for "-01-01" / "-12-31") keeps periods
  # that themselves start on 1 January or end on 31 December correct.
  pieces <- data.frame(
    pmax(starts[row_idx], year_first),
    pmin(ends[row_idx], year_last)
  )
  names(pieces) <- c(start_col, end_col)

  other_cols <- setdiff(names(data), c(start_col, end_col))
  out <- cbind(pieces, data[row_idx, other_cols, drop = FALSE])
  rownames(out) <- NULL
  out
}

# expand the sample data to one row per year and show the result
final_df <- split_by_year(df)
final_df
Output is:
starting_date end_date col3 col4
1 2015-06-01 2015-12-31 AAA 33445454
2 2016-01-01 2016-12-31 AAA 33445454
3 2017-01-01 2017-09-30 AAA 33445454
4 2013-06-01 2013-12-31 BBB 565664
5 2014-01-01 2014-12-31 BBB 565664
6 2015-01-01 2015-12-31 BBB 565664
7 2016-01-01 2016-12-31 BBB 565664
8 2017-01-01 2017-11-11 BBB 565664
9 2016-02-11 2016-12-31 CCC 123
10 2017-01-01 2017-01-01 CCC 123