I have a dataset with 500k appointments lasting between 5 and 60 minutes.
tdata <- structure(list(Start = structure(c(1325493000, 1325493600, 1325494200,
Is this any better?
Create a time vector that spans the whole period at one-minute resolution, plus blank count vectors for appointment starts (additions) and ends (removals).
# one-minute grid from the first start to the last end
vecTime <- seq(from = tdata$Start[1], to = tdata$End[nrow(tdata)], by = 60)
addz <- array(0, length(vecTime))
remz <- array(0, length(vecTime))
# count how many appointments start and how many end at each distinct time
startAgg <- aggregate(tdata$Start, by = list(tdata$Start), length)
endAgg <- aggregate(tdata$End, by = list(tdata$End), length)
# place the counts on the minute grid (assumes start/end times fall on whole minutes):
# +1 for every start, -1 for every end
addz[which(vecTime %in% startAgg$Group.1)] <- startAgg$x
remz[which(vecTime %in% endAgg$Group.1)] <- -endAgg$x
# running total = number of appointments in progress at each minute
res <- data.frame(time = vecTime, occupancy = cumsum(addz + remz))
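As a minimal usage sketch (assuming res from above; the capacity value here is made up for illustration), the minutes where more appointments run concurrently than you have capacity for can be read straight off the result:
capacity <- 1  # hypothetical number of rooms/slots available
busy <- res[res$occupancy > capacity, ]
head(busy)  # minutes with more concurrent appointments than capacity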
Here's a strategy: order by start time, then unlist the data as start, end, start, end, ... and check whether that vector needs to be reordered. If it doesn't, there are no conflicts; if it does, you can see how many appointments (and which ones, if you like) conflict with each other.
# Using Roland's example:
DF <- read.table(text=" Start, End, Location, Room
1,2012-01-02 08:30:00,2012-01-02 08:40:00,LocationA,RoomA
2,2012-01-02 08:40:00,2012-01-02 08:50:00,LocationA,RoomA
3,2012-01-02 08:50:00,2012-01-02 09:55:00,LocationA,RoomA
4,2012-01-02 09:00:00,2012-01-02 09:10:00,LocationA,RoomA
5,2012-01-02 09:00:00,2012-01-02 09:10:00,LocationA,RoomB
6,2012-01-02 09:10:00,2012-01-02 09:20:00,LocationA,RoomB",header=TRUE,sep=",",stringsAsFactors=FALSE)
library(data.table)
dt = data.table(DF)
# the conflicting appointments
dt[order(Start),
.SD[unique((which(order(c(rbind(Start, End))) != 1:(2*.N)) - 1) %/% 2 + 1)],
by = list(Location, Room)]
# Location Room Start End
#1: LocationA RoomA 2012-01-02 08:50:00 2012-01-02 09:55:00
#2: LocationA RoomA 2012-01-02 09:00:00 2012-01-02 09:10:00
# and a speedier version of the above, which avoids constructing the full .SD:
dt[dt[order(Start),
.I[unique((which(order(c(rbind(Start, End))) != 1:(2*.N)) - 1) %/% 2 + 1)],
by = list(Location, Room)]$V1]
Perhaps the formula above for going from the mismatched order back to row indices can be simplified; I didn't spend much time thinking about it and just used the first thing that got the job done.
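For illustration only, here is the same index arithmetic written as a standalone base R helper (findConflicts is a made-up name, and it is meant to be applied within each Location/Room group, as the data.table call above does):
findConflicts <- function(Start, End) {
  o <- order(Start)                       # sort appointments by start time
  x <- c(rbind(Start[o], End[o]))         # interleave: s1, e1, s2, e2, ...
  bad <- which(order(x) != seq_along(x))  # positions that break the sorted order
  o[unique((bad - 1) %/% 2 + 1)]          # map interleaved positions back to rows
}
# e.g. restricted to RoomA, this should flag appointments 3 and 4:
with(DF[DF$Room == "RoomA", ], findConflicts(Start, End))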
I am not exactly sure I understand your goal. Still, this might be of use:
#I changed the example to actually have concurrent appointments
DF <- read.table(text=" Start, End, Location, Room
1, 2012-01-02 08:30:00, 2012-01-02 08:40:00, LocationA, RoomA
2, 2012-01-02 08:40:00, 2012-01-02 08:50:00, LocationA, RoomA
3, 2012-01-02 08:50:00, 2012-01-02 09:55:00, LocationA, RoomA
4, 2012-01-02 09:00:00, 2012-01-02 09:10:00, LocationA, RoomA
5, 2012-01-02 09:00:00, 2012-01-02 09:10:00, LocationA, RoomB
6, 2012-01-02 09:10:00, 2012-01-02 09:20:00, LocationA, RoomB",header=TRUE,sep=",",stringsAsFactors=FALSE)
DF$Start <- as.POSIXct(DF$Start, format="%Y-%m-%d %H:%M:%S", tz="GMT")
DF$End <- as.POSIXct(DF$End, format="%Y-%m-%d %H:%M:%S", tz="GMT")
library(data.table)
DT <- data.table(DF)
DT[,c("Start_num","End_num"):=lapply(.SD,as.numeric),.SDcols=1:2]
fun <- function(s, e) {
  require(intervals)
  # open intervals on the real line, one per appointment
  mat <- cbind(s, e)
  inter <- Intervals(mat, closed = c(FALSE, FALSE), type = "R")
  # for each appointment, the indices of all appointments it overlaps (including itself)
  io <- interval_overlap(inter, inter)
  # drop the first table entry (appointments overlapping only themselves); a group of
  # k mutually overlapping appointments contributes k entries of length k, so divide by k
  tablengths <- table(sapply(io, length))[-1]
  sum(c(0, as.vector(tablengths / as.integer(names(tablengths)))))
}
#number of overlapping events per room and location
DT[,fun(Start_num,End_num),by=list(Location,Room)]
# Location Room V1
#1: LocationA RoomA 1
#2: LocationA RoomB 0
I didn't test this, especially not for speed.
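If you want the overlapping rows themselves rather than a count, the same interval_overlap() result can be reused; a sketch in the same untested spirit (whichOverlap is a made-up name, not part of the code above):
whichOverlap <- function(s, e) {
  require(intervals)
  inter <- Intervals(cbind(s, e), closed = c(FALSE, FALSE), type = "R")
  io <- interval_overlap(inter, inter)  # overlaps per appointment, including itself
  which(sapply(io, length) > 1L)        # keep appointments that overlap something else
}
# the conflicting appointments per room and location
DT[, .SD[whichOverlap(Start_num, End_num)], by = list(Location, Room)]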