问题
I have a dataset with 500k appointments lasting between 5 and 60 minutes.
tdata <- structure(list(Start = structure(c(1325493000, 1325493600, 1325494200, 1325494800, 1325494800, 1325495400, 1325495400, 1325496000, 1325496000, 1325496600, 1325496600, 1325497500, 1325497500, 1325498100, 1325498100, 1325498400, 1325498700, 1325498700, 1325499000, 1325499300), class = c("POSIXct", "POSIXt"), tzone = "GMT"), End = structure(c(1325493600, 1325494200, 1325494500, 1325495400, 1325495400, 1325496000, 1325496000, 1325496600, 1325496600, 1325496900, 1325496900, 1325498100, 1325498100, 1325498400, 1325498700, 1325498700, 1325499000, 1325499300, 1325499600, 1325499600), class = c("POSIXct", "POSIXt"), tzone = "GMT"), Location = c("LocationA", "LocationA", "LocationA", "LocationA", "LocationA", "LocationA", "LocationA", "LocationA", "LocationA", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB", "LocationB"), Room = c("RoomA", "RoomA", "RoomA", "RoomA", "RoomB", "RoomB", "RoomB", "RoomB", "RoomB", "RoomB", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA", "RoomA")), .Names = c("Start", "End", "Location", "Room"), row.names = c(NA, 20L), class = "data.frame")
> head(tdata)
Start End Location Room
1 2012-01-02 08:30:00 2012-01-02 08:40:00 LocationA RoomA
2 2012-01-02 08:40:00 2012-01-02 08:50:00 LocationA RoomA
3 2012-01-02 08:50:00 2012-01-02 08:55:00 LocationA RoomA
4 2012-01-02 09:00:00 2012-01-02 09:10:00 LocationA RoomA
5 2012-01-02 09:00:00 2012-01-02 09:10:00 LocationA RoomB
6 2012-01-02 09:10:00 2012-01-02 09:20:00 LocationA RoomB
I would like calculate the number of concurrent appointments in total, per Location, and per Room (and several other factors in de original dataset).
I have tried using mysql
package to perform a left join, which works for a small dataset, but takes forever for the entire dataset:
# SQL Join.
start.min <- min(tdata$Start, na.rm=T)
end.max <- max(tdata$End, na.rm=T)
tinterval <- seq.POSIXt(start.min, end.max, by = "mins")
tinterval <- as.data.frame(tinterval)
library(sqldf)
system.time(
output <- sqldf("SELECT *
FROM tinterval
LEFT JOIN tdata
ON tinterval.tinterval >= tdata.Start
AND tinterval.tinterval < tdata.End "))
head(output)
tinterval Start End Location Room
1 2012-01-02 09:30:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
2 2012-01-02 09:31:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
3 2012-01-02 09:32:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
4 2012-01-02 09:33:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
5 2012-01-02 09:34:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
6 2012-01-02 09:35:00 2012-01-02 09:30:00 2012-01-02 09:40:00 LocationA RoomA
It creates a data frame where all the "active" appointments are listed for each minute. The large dataset covers a full year (~525600 minutes). With an average appointment duration of 18 minutes, I expect the sql join to create a data set with ~ 5 million rows, which I can use to create occupancy plots for different factors (Location/Room etc).
Building on the sapply solution suggested in How to count number of concurrent users I tried using data.table
and snowfall
as follows:
require(snowfall)
require(data.table)
sfInit(par=T, cpu=4)
sfLibrary(data.table)
tdata <- data.table(tdata)
tinterval <- seq.POSIXt(start.min, end.max, by = "mins")
setkey(tdata, Start, End)
sfExport("tdata") # "Transport" data to cores
system.time( output <- data.frame(tinterval,sfSapply(tinterval, function(i) length(tdata[Start <= i & i < End,Start]) ) ) )
> head(output)
tinterval sfSapply.tinterval..function.i..length.tdata.Start....i...i...
1 2012-01-02 08:30:00 1
2 2012-01-02 08:31:00 1
3 2012-01-02 08:32:00 1
4 2012-01-02 08:33:00 1
5 2012-01-02 08:34:00 1
6 2012-01-02 08:35:00 1
This solution is fast, takes ~18 seconds to calculate 1 day (about 2 hours for a full year). The downside is I cannot create subsets of number of concurrent appointments for certain factors (Location, Room etc). I have the feeling there must be a better way to do this.. any advice?
UPDATE: Final solution looks like this, based on Geoffrey's answer. The example shows how the occupancies for each location can be determined.
setkey(tdata, Location, Start, End)
vecTime <- seq(from=tdata$Start[1],to=tdata$End[nrow(tdata)],by=60)
res <- data.frame(time=vecTime)
for(i in 1:length(unique(tdata$Location)) ) {
addz <- array(0,length(vecTime))
remz <- array(0,length(vecTime))
tdata2 <- tdata[J(unique(tdata$Location)[i]),] # Subset a certain location.
startAgg <- aggregate(tdata2$Start,by=list(tdata2$Start),length)
endAgg <- aggregate(tdata2$End,by=list(tdata2$End),length)
addz[which(vecTime %in% startAgg$Group.1 )] <- startAgg$x
remz[which(vecTime %in% endAgg$Group.1)] <- -endAgg$x
res[,c( unique(tdata$Location)[i] )] <- cumsum(addz + remz)
}
> head(res)
time LocationA LocationB
1 2012-01-01 03:30:00 1 0
2 2012-01-01 03:31:00 1 0
3 2012-01-01 03:32:00 1 0
4 2012-01-01 03:33:00 1 0
5 2012-01-01 03:34:00 1 0
6 2012-01-01 03:35:00 1 0
回答1:
Is this any better.
Create a blank time vector and a blank count vector.
vecTime <- seq(from=tdata$Start[1],to=tdata$End[nrow(tdata)],by=60)
addz <- array(0,length(vecTime))
remz <- array(0,length(vecTime))
startAgg <- aggregate(tdata$Start,by=list(tdata$Start),length)
endAgg <- aggregate(tdata$End,by=list(tdata$End),length)
addz[which(vecTime %in% startAgg$Group.1 )] <- startAgg$x
remz[which(vecTime %in% endAgg$Group.1)] <- -endAgg$x
res <- data.frame(time=vecTime,occupancy=cumsum(addz + remz))
回答2:
I am not exactly sure, if I understand your goal. Still, this might be of use:
#I changed the example to actually have concurrent appointments
DF <- read.table(text=" Start, End, Location, Room
1, 2012-01-02 08:30:00, 2012-01-02 08:40:00, LocationA, RoomA
2, 2012-01-02 08:40:00, 2012-01-02 08:50:00, LocationA, RoomA
3, 2012-01-02 08:50:00, 2012-01-02 09:55:00, LocationA, RoomA
4, 2012-01-02 09:00:00, 2012-01-02 09:10:00, LocationA, RoomA
5, 2012-01-02 09:00:00, 2012-01-02 09:10:00, LocationA, RoomB
6, 2012-01-02 09:10:00, 2012-01-02 09:20:00, LocationA, RoomB",header=TRUE,sep=",",stringsAsFactors=FALSE)
DF$Start <- as.POSIXct(DF$Start,format="%Y-%d-%m %H:%M:%S",tz="GMT")
DF$End <- as.POSIXct(DF$End,format="%Y-%d-%m %H:%M:%S",tz="GMT")
library(data.table)
DT <- data.table(DF)
DT[,c("Start_num","End_num"):=lapply(.SD,as.numeric),.SDcols=1:2]
fun <- function(s,e) {
require(intervals)
mat <- cbind(s,e)
inter <- Intervals(mat,closed=c(FALSE,FALSE),type="R")
io <- interval_overlap( inter, inter )
tablengths <- table(sapply(io,length))[-1]
sum(c(0,as.vector(tablengths/as.integer(names(tablengths)))))
}
#number of overlapping events per room and location
DT[,fun(Start_num,End_num),by=list(Location,Room)]
# Location Room V1
#1: LocationA RoomA 1
#2: LocationA RoomB 0
I didn't test this, especially not for speed.
回答3:
Here's a strategy - order by start time, then unlist the data by going start,end,start,end,... and see if that vector needs to be reordered. If it doesn't, then there are no conflicts and if it does you can see how many appointments (and which appointments if you like) conflict with each other.
# Using Roland's example:
DF <- read.table(text=" Start, End, Location, Room
1,2012-01-02 08:30:00,2012-01-02 08:40:00,LocationA,RoomA
2,2012-01-02 08:40:00,2012-01-02 08:50:00,LocationA,RoomA
3,2012-01-02 08:50:00,2012-01-02 09:55:00,LocationA,RoomA
4,2012-01-02 09:00:00,2012-01-02 09:10:00,LocationA,RoomA
5,2012-01-02 09:00:00,2012-01-02 09:10:00,LocationA,RoomB
6,2012-01-02 09:10:00,2012-01-02 09:20:00,LocationA,RoomB",header=TRUE,sep=",",stringsAsFactors=FALSE)
dt = data.table(DF)
# the conflicting appointments
dt[order(Start),
.SD[unique((which(order(c(rbind(Start, End))) != 1:(2*.N)) - 1) %/% 2 + 1)],
by = list(Location, Room)]
# Location Room Start End
#1: LocationA RoomA 2012-01-02 08:50:00 2012-01-02 09:55:00
#2: LocationA RoomA 2012-01-02 09:00:00 2012-01-02 09:10:00
# and a speedier version of the above, that avoids constructing the full .SD:
dt[dt[order(Start),
.I[unique((which(order(c(rbind(Start, End))) != 1:(2*.N)) - 1) %/% 2 + 1)],
by = list(Location, Room)]$V1]
Perhaps the formula for going from unmatched order to correct indices above can be simplified, I didn't spend too much time thinking about it and just used the first thing that got the job done.
来源:https://stackoverflow.com/questions/17168611/how-to-calculate-number-of-occurrences-per-minute-for-a-large-dataset