Thanks in advance.
I am trying to add missing date values that were not included in an observation period for three different individuals.
My data look like t
Calculate min and max times (seconds since Epoch):
# First and last observation times, truncated to whole numbers.
# NOTE(review): the later `seq(..., 86400)` step treats these as seconds since
# the epoch, which presumes PostData$Date is POSIXct — confirm.
min_time <- as.integer(min(PostData$Date))
max_time <- as.integer(max(PostData$Date))
Use `seq` to build the list of every date in the observation range (one entry per day):
# One timestamp per day between min_time and max_time (86400 seconds per day).
list_of_dates <- seq(min_time, max_time, by = 86400)

# Convert the epoch seconds back to calendar dates. `origin` belongs to
# as.POSIXct() (numeric input needs it on R < 4.3), not to as.Date(); the time
# zone is pinned to UTC on both conversions so the day boundaries are explicit.
list_of_dates <- as.Date(
  as.POSIXct(list_of_dates, origin = "1970-01-01", tz = "UTC"),
  tz = "UTC"
)
Build a list of missing IndID and Date combos
# Cross join: every individual paired with every date in the range.
temp <- merge(unique(PostData$IndID), list_of_dates)
names(temp) <- c("IndID", "Date")

# Keep only the (IndID, Date) pairs that never occur together in PostData.
# The pair is compared as a single key: testing IndID and Date separately
# would discard any date observed for *some* individual, and `!which(...)`
# negates integer positions (always FALSE), which selects no rows at all.
# Dates are reduced to UTC calendar days to match how list_of_dates was built.
observed_keys <- paste(PostData$IndID, as.Date(PostData$Date, tz = "UTC"))
candidate_keys <- paste(temp$IndID, temp$Date)
data_missing_indID_date <- temp[!(candidate_keys %in% observed_keys), ]
Build the rest of the columns:
# Fill in the remaining columns for the filler rows: no event, and no
# measurements. Vectors are sized to the row count so this also works when
# there are no missing rows (a scalar assignment errors on a 0-row data frame).
n_missing <- nrow(data_missing_indID_date)
data_missing_indID_date$Event <- rep(0, n_missing)
data_missing_indID_date$Number <- rep(NA, n_missing)
data_missing_indID_date$Percent <- rep(NA, n_missing)
`rbind` it to the original data frame:
# Append the filler rows beneath the original observations.
final_data <- rbind(PostData, data_missing_indID_date)
Here's a dplyr solution. The result, based on the sample data, is a data.frame with 89 rows; I hope that's what you intended to get.
# library() fails loudly if dplyr is missing; require() would only return
# FALSE and let the pipeline fail later with a less helpful error.
library(dplyr)

# For each individual, build one row per day between that individual's first
# and last observation, left-join the real observations onto it, and mark the
# days that had no observation as Event = 0 (other columns stay NA).
PostData %>%
  mutate(Date = as.Date(as.character(Date))) %>%
  group_by(IndID) %>%
  do(
    left_join(
      data.frame(
        IndID = .$IndID[1],
        Date = seq(min(.$Date), max(.$Date), by = 1)
      ),
      .,
      by = c("IndID", "Date")
    )
  ) %>%
  mutate(Event = ifelse(is.na(Event), 0, Event)) %>%
  # Drop the grouping so later steps do not silently operate per IndID.
  ungroup()
# IndID Date Event Number Percent
#1 P01 2011-03-04 1 2 0.390
#2 P01 2011-03-05 0 NA NA
#3 P01 2011-03-06 0 NA NA
#4 P01 2011-03-07 0 NA NA
#5 P01 2011-03-08 0 NA NA
#6 P01 2011-03-09 0 NA NA
#7 P01 2011-03-10 0 NA NA
#8 P01 2011-03-11 1 2 0.975
#...
#84 P06 2012-01-25 0 NA NA
#85 P06 2012-01-26 0 NA NA
#86 P06 2012-01-27 1 4 0.758
#87 P06 2012-01-28 0 NA NA
#88 P06 2012-01-29 0 8 0.290
#89 P06 2012-01-30 0 1 0.150
A base R version:
# Process PostData one individual at a time and stack the pieces back together.
# Within each individual: build one row per day from the first to the last
# observation, left-merge the real observations onto it (merge() joins on the
# columns the two frames share), then set Event to 0 on days with no match.
do.call(
  rbind,
  by(PostData, PostData$IndID, function(grp) {
    every_day <- data.frame(
      IndID = grp$IndID[1],
      Date = seq.POSIXt(min(grp$Date), max(grp$Date), by = "1 day")
    )
    padded <- merge(every_day, grp, all.x = TRUE)
    padded$Event[is.na(padded$Event)] <- 0
    padded
  })
)
Result:
IndID Date Event Number Percent
P01.1 P01 2011-03-04 1 2 0.390
P01.2 P01 2011-03-05 0 NA NA
P01.3 P01 2011-03-06 0 NA NA
P01.4 P01 2011-03-07 0 NA NA
P01.5 P01 2011-03-08 0 NA NA
P01.6 P01 2011-03-09 0 NA NA
P01.7 P01 2011-03-10 0 NA NA
P01.8 P01 2011-03-11 1 2 0.975
<<etc>>
Try this. It builds rows for the missing dates with the proper IDs and sets the remaining fields to 0.
library(data.table)
library(plyr)

# First and last observation time for each individual.
dtPostData <- data.table(PostData)
minmaxTab <- dtPostData[
  ,
  list(minDate = min(Date), maxDate = max(Date)),
  by = IndID
]

# For each individual, list the days in its range that have no observation and
# build one filler row (Event/Number/Percent = 0) per such day.
# seq_len() keeps the loop empty when minmaxTab has no rows (1:0 would not).
df <- lapply(seq_len(nrow(minmaxTab)), function(x) {
  # Step of 24*60*60 seconds = one day; assumes Date is POSIXct — TODO confirm.
  temp <- seq(minmaxTab$minDate[x], minmaxTab$maxDate[x], by = 24 * 60 * 60)
  temp <- temp[!(temp %in% dtPostData[IndID == minmaxTab$IndID[x], ]$Date)]

  # Size every column to the number of missing days so that an individual
  # with no gaps contributes zero rows rather than a recycled/NA row.
  n_missing <- length(temp)
  data.table(
    IndID = rep(minmaxTab$IndID[x], n_missing),
    Date = temp,
    Event = rep(0, n_missing),
    Number = rep(0, n_missing),
    Percent = rep(0, n_missing)
  )
})

# Stack the per-individual pieces. The original passed an undefined `x` here;
# the list built above is `df`.
df <- ldply(df, data.frame)
df
#Results
IndID Date Event Number Percent
1 P01 2011-03-05 0 0 0
2 P01 2011-03-06 0 0 0
3 P01 2011-03-07 0 0 0
4 P01 2011-03-08 0 0 0
5 P01 2011-03-09 0 0 0
6 P01 2011-03-10 0 0 0
7 P01 2011-03-12 0 0 0
8 P01 2011-03-16 0 0 0
9 P03 2011-07-10 0 0 0