问题
I have a data frame with rainfall measurements from several rain gauges, like the sample below:
> rnfl
ID date value
1 250 2000-03-01 5.37
2 250 2000-03-02 0.00
3 250 2000-03-03 2.94
4 250 2000-03-04 0.00
5 250 2000-03-05 0.00
6 250 2000-03-06 0.00
7 250 2000-03-07 2.76
8 250 2000-03-08 3.06
9 250 2000-03-09 31.05
10 250 2000-03-10 9.48
11 250 2000-03-11 0.00
12 250 2000-03-12 0.00
13 250 2000-03-13 0.00
14 732 2011-05-01 2.40
15 732 2011-05-02 15.60
16 732 2011-05-03 8.80
17 732 2011-05-04 47.00
18 732 2011-05-05 45.40
19 732 2011-05-06 5.85
20 732 2011-05-07 0.00
21 732 2011-05-08 0.00
22 732 2011-05-09 0.80
23 732 2011-05-10 0.00
24 1439 2006-08-01 0.00
25 1439 2006-08-02 0.00
26 1439 2006-08-03 0.00
27 1439 2006-08-04 0.00
28 1439 2006-08-05 0.00
29 1439 2006-08-06 0.00
30 1439 2006-08-07 0.00
31 1439 2006-08-08 0.00
32 1440 2000-03-06 0.00
33 1440 2000-03-07 4.57
34 1440 2000-03-08 3.06
35 1440 2000-03-09 9.02
36 1440 2000-03-10 4.23
37 1534 2000-04-01 14.94
38 1534 2000-04-02 43.65
39 1534 2000-04-03 0.00
40 1534 2000-04-04 0.00
41 1534 2000-04-05 0.00
I also have a data frame with each gauge's ID along with the ID's of the nearest few gauges and their distance:
> near
ID ID_nearest distance
1 250 1440 1102.65
2 250 732 3881.40
3 250 1534 15479.97
4 250 1439 19231.39
5 253 499 909.27
6 253 89 2219.03
7 253 815 2452.21
8 254 64 11254.43
9 255 237 11607.83
10 256 416 4503.37
11 256 921 10132.95
12 256 1210 11449.56
For example, gauge ID 250
has four close neighbors: ID's 1440
, 732
, 1534
and 1439
. For each combination like this in near
, I need to find the overlapping dates between the main and the surrounding gauges. In other words, I need to find whether gauges 1440
, 732
, 1534
and 1439
have any dates which overlaps ID 250
.
The expected output would be something like this:
ID ID_nearest common_date_begin common_date_end diff_days
1 250 1440 2000-03-06 2000-03-10 4
2 250 732 <NA> <NA> NA
3 250 1534 <NA> <NA> NA
4 250 1439 <NA> <NA> NA
and so on for each ID
in near
.
How do I achieve this? Thank you very much.
Required data to reproduce this question:
rnfl <- structure(list(ID = c(250L, 250L, 250L, 250L, 250L, 250L, 250L,
250L, 250L, 250L, 250L, 250L, 250L, 732L, 732L, 732L, 732L, 732L,
732L, 732L, 732L, 732L, 732L, 1439L, 1439L, 1439L, 1439L, 1439L,
1439L, 1439L, 1439L, 1440L, 1440L, 1440L, 1440L, 1440L, 1534L,
1534L, 1534L, 1534L, 1534L), date = structure(c(11017, 11018,
11019, 11020, 11021, 11022, 11023, 11024, 11025, 11026, 11027,
11028, 11029, 15095, 15096, 15097, 15098, 15099, 15100, 15101,
15102, 15103, 15104, 13361, 13362, 13363, 13364, 13365, 13366,
13367, 13368, 11022, 11023, 11024, 11025, 11026, 11048, 11049,
11050, 11051, 11052), class = "Date"), value = c(5.37, 0, 2.94,
0, 0, 0, 2.76, 3.06, 31.05, 9.48, 0, 0, 0, 2.4, 15.6, 8.8, 47,
45.4, 5.85, 0, 0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.57, 3.06,
9.02, 4.23, 14.94, 43.65, 0, 0, 0)), row.names = c(NA, -41L), class = "data.frame")
near <- structure(list(ID = c("250", "250", "250", "250", "253", "253",
"253", "254", "255", "256", "256", "256"), ID_nearest = c("1440",
"732", "1534", "1439", "499", "89", "815", "64", "237", "416",
"921", "1210"), distance = c(1102.65, 3881.4, 15479.97, 19231.39,
909.27, 2219.03, 2452.21, 11254.43, 11607.83, 4503.37, 10132.95,
11449.56)), row.names = c(NA, -12L), class = "data.frame")
回答1:
An option using data.table
:
library(data.table)
setDT(near)[, c("ID", "ID_nearest") := lapply(.SD, as.integer), .SDcols=c("ID", "ID_nearest")]
setDT(rnfl)
m <- rnfl[rnfl, on=.(date), {
k <- x.ID!=i.ID
unique(data.table(
ID=i.ID[k],
ID_nearest=x.ID[k],
common_date_begin=min(date[k]),
common_date_end=max(date[k])
))
}]
m[near, on=.(ID, ID_nearest)][,
diff_days := common_date_end - common_date_begin][]
output:
ID ID_nearest common_date_begin common_date_end distance diff_days
1: 250 1440 2000-03-06 2000-03-10 1102.65 4 days
2: 250 732 <NA> <NA> 3881.40 NA days
3: 250 1534 <NA> <NA> 15479.97 NA days
4: 250 1439 <NA> <NA> 19231.39 NA days
5: 253 499 <NA> <NA> 909.27 NA days
6: 253 89 <NA> <NA> 2219.03 NA days
7: 253 815 <NA> <NA> 2452.21 NA days
8: 254 64 <NA> <NA> 11254.43 NA days
9: 255 237 <NA> <NA> 11607.83 NA days
10: 256 416 <NA> <NA> 4503.37 NA days
11: 256 921 <NA> <NA> 10132.95 NA days
12: 256 1210 <NA> <NA> 11449.56 NA days
for larger datasets, it would make sense to collapse rnfl
into rows of ranges for each consecutive periods for each ID before performing an overlapping join and then lookup these overlaps into near
:
#summarize into consecutive periods
summ <- rnfl[, .(startdate=date[1L], enddate=date[.N]),
.(ID, g=cumsum(c(0L, diff(date)!=1L)))]
#perform overlapping join
setkey(summ, startdate, enddate)
olap <- unique(foverlaps(summ, summ)[ID!=i.ID, .(
ID1=pmin(ID, i.ID),
ID2=pmax(ID, i.ID),
common_date_begin=pmax(startdate, i.startdate),
common_date_end=pmin(enddate, i.enddate))])
#sorry I forgot to sort the IDs in the original post. have fixed here
near[, c("ID1", "ID2") := .(pmin(ID, ID_nearest), pmax(ID, ID_nearest))]
#lookup join for overlapping dates and calc dates diff
cols <- c("common_date_begin", "common_date_end")
near[olap, on=.(ID1, ID2), (cols) := mget(paste0("i.", cols))][,
diff_days := common_date_end - common_date_begin][]
output:
ID ID_nearest dist ID1 ID2 common_date_begin common_date_end diff_days
1: 1 1117 3022.2234 1 1117 2000-03-01 2006-12-03 2468
2: 1 386 16107.7359 1 386 2006-01-01 2006-12-03 336
3: 1 920 17327.0028 1 920 2000-03-01 2004-11-04 1709
4: 1000 688 401.5005 688 1000 2019-12-25 2019-12-31 6
5: 1000 48 5576.3986 48 1000 2000-03-01 2006-12-03 2468
---
2649: 992 318 12462.7490 318 992 2006-01-01 2017-06-16 4184
2650: 996 448 0.0000 448 996 2019-12-25 2019-12-31 6
2651: 997 1085 498.8696 997 1085 2000-03-01 2017-01-22 6171
2652: 997 390 17627.1155 390 997 2003-08-08 2017-01-22 4916
2653: 999 467 5392.2740 467 999 2007-11-14 2019-04-09 4164
Total timing is about 5s on my PC including reading in the large file and format the date column. The processing code takes about 1.5s.
data:
#https://www.dropbox.com/s/aadf4w6538lw22q/****_SO.zip?dl=0
near <- fread("near.csv")
rnfl <- fread("rnfl.csv")
lu <- rnfl[, .(date={cd <- unique(date)}, DATE=as.IDate(cd))]
rnfl[lu, on=.(date), date := DATE][, date := as.IDate(as.integer(date))]
回答2:
Maybe not the cleanest/efficient but here is one way to do this in base R.
We find the common dates for each combination of ID
and ID_nearest
, if there are any common dates we create a dataframe with minimum, maximum value of the dates along with the difference in number of days in them.
out <- near[c('ID', 'ID_nearest')]
cbind(out,do.call(rbind, c(Map(function(x, y) {
common_dates <- intersect(rnfl$date[rnfl$ID == x], rnfl$date[rnfl$ID == y])
if(length(common_dates) > 0) {
class(common_dates) <- "Date"
data.frame(common_date_begin = min(common_dates),
common_date_end = max(common_dates),
diff_days = as.integer(max(common_dates) - min(common_dates)))
} else c(common_date_begin = NA, common_date_end = NA, diff_days = NA)
},out$ID, out$ID_nearest), make.row.names = FALSE)))
# ID ID_nearest common_date_begin common_date_end diff_days
#1 250 1440 2000-03-06 2000-03-10 4
#2 250 732 <NA> <NA> NA
#3 250 1534 <NA> <NA> NA
#4 250 1439 <NA> <NA> NA
#....
#....
来源:https://stackoverflow.com/questions/60460942/r-find-overlapping-dates-per-group-based-on-another-data-frame