Although the details of this are, of course, app-specific, in the SO spirit I'm trying to keep this as general as possible! The basic problem is how to merge data.frames by a date falling within a range.
Update: As of data.table v1.9.3+, overlap joins are implemented. This is a special case where the start and end of each interval in `Speeches` are identical (both are `Date`). We can accomplish this using `foverlaps()` as follows:
# Overlap join: attach to every speech all History roles whose
# [Role.Start, Role.End] interval contains the speech date, then cast to wide.
library(data.table) ## 1.9.3+ (overlap joins); library() stops if it is missing
# Convert both inputs to data.tables by reference.
setDT(Speeches)
setDT(History)
# A speech is a zero-width interval [Date, Date2]; id records the original row
# of each speech so that repeated Name/Date pairs stay distinguishable.
Speeches[, `:=`(Date2 = Date, id = .I)]
# foverlaps() takes the interval columns of `y` from the end of its key.
setkey(History, Name, Role.Start, Role.End)
ans <- foverlaps(Speeches, History, by.x = c("Name", "Date", "Date2"))[, Date2 := NULL]
# N numbers repeated roles within one speech so each gets its own column.
ans <- ans[order(id, Value)][, N := seq_len(.N), by = list(Name, Date, Role, id)]
# One row per speech, one column per Role_N combination.
ans <- dcast.data.table(ans, id + Name + Date ~ Role + N, value.var = "Value")
This is a case for a range/interval join.
Here's the `data.table` way. It uses two rolling joins.
# Interval join via two rolling joins: for every speech find the block of
# History rows whose [Role.Start, Role.End] contains the speech date.
library(data.table) ## 1.9.2+; library() stops if the package is missing
dt1 <- as.data.table(Speeches)
dt2 <- as.data.table(History)
# NOTE(review): the joins match dt2's key against the leading columns of dt1,
# so dt1 presumably starts with Name, Date - confirm.
# first rolling join - to get end indices
# (roll = Inf: last row whose Role.Start is on or before the speech date)
setkey(dt2, Name, Role.Start)
tmp1 <- dt2[dt1, roll = Inf, which = TRUE]
# second rolling join - to get start indices
# (roll = -Inf: first row whose Role.End is on or after the speech date)
setkey(dt2, Name, Role.End)
tmp2 <- dt2[dt1, roll = -Inf, which = TRUE]
# NOTE(review): tmp1 was computed under the Role.Start ordering and tmp2 under
# the Role.End ordering; this presumably relies on both orderings agreeing
# within each Name - confirm for your data.
# generate dt1's and dt2's corresponding row indices
idx <- tmp1 - tmp2 + 1L                    # number of matching roles per speech
idx1 <- rep(seq_len(nrow(dt1)), idx)       # speech row, repeated once per role
# For speech i this yields tmp2[i], tmp2[i] + 1, ..., tmp1[i]; base R only,
# so there is no dependency on the unexported data.table:::vecseq().
idx2 <- rep(tmp2, idx) + sequence(idx) - 1L
dt1[, id := seq_len(.N)] ## needed for casting later
# subset using idx1 and idx2 and bind them colwise
# (-1L drops dt2's first column - presumably Name, already present in dt1)
ans <- cbind(dt1[idx1], dt2[idx2, -1L, with = FALSE])
# a little reordering to get the output correctly (factors are a pain!)
ans <- ans[order(id, Value)][, N := seq_len(.N), by = list(Name, Date, Role, id)]
# finally cast them.
f_ans <- dcast.data.table(ans, id + Name + Date ~ Role + N, value.var = "Value")
Here's the output:
id Name Date Political groups_1 National parties_1 Member_1 Member_2 Member_3 Substitute_1
1: 1 AAA 2004-05-05 j l c f NA d
2: 2 AAA 2003-12-18 j l c f h d
3: 3 AAA 2003-12-18 j l c f h d
4: 4 AAA 2003-12-18 j l c f h d
5: 5 AAA 2003-11-17 j l c f h d
6: 6 AAA 2003-11-06 j l c f h d
7: 7 AAA 2003-10-20 j l c f h d
8: 8 AAA 2003-09-25 j l c f h d
9: 9 AAA 2003-06-04 j l c f h d
10: 10 BBB 2012-04-20 i k b g NA NA
11: 11 BBB 2012-04-19 i k b g NA NA
12: 12 BBB 2012-04-19 i k b g NA NA
13: 13 BBB 2012-04-19 i k b g NA NA
14: 14 BBB 2012-04-19 i k b g NA NA
15: 15 BBB 2012-04-19 i k b g NA NA
16: 16 BBB 2012-04-19 i k b g NA NA
17: 17 BBB 2012-04-19 i k b g NA NA
18: 18 BBB 2012-04-18 i k b g NA NA
19: 19 BBB 2012-04-18 i k b g NA NA
20: 20 BBB 2012-04-18 i k b g NA NA
Alternatively, you can also accomplish this using the `GenomicRanges` package from Bioconductor, which deals with ranges quite nicely, especially when you require an additional column to join by (`Name`) in addition to the ranges. You can install it from Bioconductor.
# Interval join with GenomicRanges: Name plays the role of the sequence name,
# the dates are the range coordinates.
library(GenomicRanges)
library(data.table)
dt1 <- as.data.table(Speeches)
dt2 <- as.data.table(History)
# Ranges need numeric coordinates, hence as.numeric() on the date columns
# (presumably Date class, i.e. day counts - confirm).
# A speech is the zero-width range [Date, Date].
gr1 <- GRanges(Rle(dt1$Name), IRanges(as.numeric(dt1$Date), as.numeric(dt1$Date)))
gr2 <- GRanges(Rle(dt2$Name), IRanges(as.numeric(dt2$Role.Start), as.numeric(dt2$Role.End)))
# type = "within": keep pairs where the speech range lies inside the role range.
olaps <- findOverlaps(gr1, gr2, type = "within")
idx1 <- queryHits(olaps)    # row numbers in dt1
idx2 <- subjectHits(olaps)  # row numbers in dt2
# from here on: bind the matched rows, number repeated roles, cast to wide
dt1[, id := seq_len(.N)]
# (-1L drops dt2's first column - presumably Name, already present in dt1)
ans <- cbind(dt1[idx1], dt2[idx2, -1L, with = FALSE])
ans <- ans[order(id, Value)][, N := seq_len(.N), by = list(Name, Date, Role, id)]
dcast.data.table(ans, id + Name + Date ~ Role + N, value.var = "Value")
Gives the same result as above.
Here's an approach using `sqldf(...)` from the `sqldf` package. This produces your result, with the following exceptions:
- The `Member.n` columns contain values in alphabetical order, rather than the order in which they appear in the `History` data frame. So `Member.1` would contain `c` and `Member.2` would contain `f`, rather than the other way around.

Note that `Speeches` and `History` are used for the input data frames, and I use your `Output` data frame to get the columns' order only.
# Range join in SQL, then label the roles and cast to one row per speech.
library(sqldf) # for sqldf(...)
library(reshape2) # for dcast(...)
# NOTE(review): assumes columns 4 and 5 of History are Role.Start / Role.End.
colnames(History)[4:5] <- c("Start","End") # sqldf doesn't like "." in colnames
Speeches$id <- rownames(Speeches) # need unique id column
result <- sqldf("select a.id, a.Name, a.Date, b.Role, b.Value
from Speeches a, History b
where a.Name=b.Name and a.Date between b.Start and b.End")
# aggregate() returns its groups in factor order of id (the last grouping
# variable varies slowest), and the labels are attached back positionally
# below, so bring the rows into exactly that order first.
result <- result[order(as.factor(result$id)), ]
# Number the repeated "Member" roles within each speech by counting Members
# only, so the suffix does not depend on where they sit in the group.
# simplify = FALSE keeps one list element per speech even when every speech
# has the same number of roles (otherwise the result collapses to a matrix).
Roles <- aggregate(Role ~ Name + Date + id, result, function(x) {
  is_member <- x == "Member"
  ifelse(is_member, paste(x, cumsum(is_member), sep = "."), as.character(x))
}, simplify = FALSE)$Role
result$Roles <- unlist(Roles)
result <- dcast(result, Name + Date + id ~ Roles, value.var = "Value")
# id is character, so order(result$id) would put "10" before "2"; follow the
# row order of Speeches instead.
result <- result[order(match(result$id, Speeches$id)), ] # re-order the rows
result <- result[, colnames(Output)] # re-order the columns
Explanation
1. We need an id column in `Speeches` to differentiate between the replicated columns in the result. So we use the row names for that.
2. Use `sqldf(...)` to merge the `Speeches` and `History` tables based on your criteria. Because you want dates to match based on a range, this may be the best approach.
3. Number the repeated `Member` roles using `aggregate(...)` and `paste(...)`.
4. Convert the result to wide format using `dcast(...)`.