问题
I have a dataset of ~190,000 rows which includes: Sample Data: found here:
> df[1:100,1:6]
AcousticTagCode Species SiteCode StartDetection EndDetection Duration_min
1 5004.24 Striped Bass RGD1 2014-10-01 23:01:12.12 2014-10-01 23:59:41.41 58.488167
2 5004.24 Striped Bass RGD1 2014-10-02 00:00:06.06 2014-10-02 01:00:00.00 59.892167
3 5004.24 Striped Bass RGD1 2014-10-02 01:00:01.01 2014-10-02 01:20:12.12 20.185167
4 5004.24 Striped Bass RGD1 2014-10-02 04:14:15.15 2014-10-02 04:32:16.16 18.016833
5 5004.24 Striped Bass RGD1 2014-10-02 22:00:06.06 2014-10-02 22:59:54.54 59.791167
6 5004.24 Striped Bass RGD1 2014-10-02 23:00:10.10 2014-10-02 23:59:55.55 59.740667
7 5004.24 Striped Bass RGD1 2014-10-03 00:00:08.08 2014-10-03 00:59:46.46 59.639667
8 5004.24 Striped Bass RGD1 2014-10-03 01:00:10.10 2014-10-03 01:58:18.18 58.134667
9 5004.24 Striped Bass RGD1 2014-10-03 02:05:05.05 2014-10-03 02:36:11.11 31.101000
10 5004.24 Striped Bass RGD1 2014-10-03 04:01:03.03 2014-10-03 04:43:35.35 42.538667
11 5004.24 Striped Bass RGD1 2014-10-03 06:00:15.15 2014-10-03 06:48:23.23 48.134667
12 5004.24 Striped Bass RGD1 2014-10-03 07:02:00.00 2014-10-03 07:57:33.33 55.555500
13 5004.24 Striped Bass RGD1 2014-10-03 08:04:27.27 2014-10-03 08:59:19.19 54.865333
14 5004.24 Striped Bass RGD1 2014-10-03 09:01:03.03 2014-10-03 09:59:36.36 58.555500
15 5004.24 Striped Bass RGD1 2014-10-03 10:00:33.33 2014-10-03 10:58:50.50 58.286167
16 5004.24 Striped Bass RGD1 2014-10-03 11:00:02.02 2014-10-03 11:59:56.56 59.892167
17 5004.24 Striped Bass RGD1 2014-10-03 12:00:10.10 2014-10-03 12:18:01.01 17.848500
18 5004.24 Striped Bass RGD1 2014-10-03 13:08:56.56 2014-10-03 13:24:06.06 15.175167
19 5004.24 Striped Bass RGD1 2014-10-03 14:29:00.00 2014-10-03 14:58:52.52 29.865333
20 5004.24 Striped Bass RGD1 2014-10-03 15:00:05.05 2014-10-03 15:59:17.17 59.202000
21 5004.24 Striped Bass RGD1 2014-10-03 16:05:47.47 2014-10-03 16:59:50.50 54.050500
22 5004.24 Striped Bass RGD1 2014-10-03 17:00:05.05 2014-10-03 17:43:37.37 43.538667
23 5004.24 Striped Bass RGD1 2014-10-03 18:02:10.10 2014-10-03 18:38:58.58 36.791167
24 5004.24 Striped Bass RGD1 2014-10-03 19:03:44.44 2014-10-03 19:58:26.26 54.697000
25 5004.24 Striped Bass RGD1 2014-10-03 20:09:42.42 2014-10-03 20:58:24.24 48.697000
26 5004.24 Striped Bass RGD1 2014-10-03 21:00:05.05 2014-10-03 21:59:03.03 58.966333
27 5004.24 Striped Bass RGD1 2014-10-03 22:00:23.23 2014-10-03 22:59:46.46 59.387167
28 5004.24 Striped Bass RGD1 2014-10-03 23:00:41.41 2014-10-03 23:59:29.29 58.798000
29 5004.24 Striped Bass RGD1 2014-10-04 09:16:18.18 2014-10-04 09:59:35.35 43.286167
30 5004.24 Striped Bass RGD1 2014-10-04 10:00:05.05 2014-10-04 10:59:18.18 59.218833
31 5004.24 Striped Bass RGD1 2014-10-04 11:00:05.05 2014-10-04 11:59:59.59 59.892167
32 5004.24 Striped Bass RGD1 2014-10-04 12:00:01.01 2014-10-04 12:59:49.49 59.808000
33 5004.24 Striped Bass RGD1 2014-10-04 13:00:23.23 2014-10-04 13:42:25.25 42.033667
34 5004.24 Striped Bass RGD1 2014-10-04 14:00:55.55 2014-10-04 14:53:26.26 52.528667
35 5004.24 Striped Bass RGD1 2014-10-04 15:00:32.32 2014-10-04 15:24:24.24 23.865333
36 5004.24 Striped Bass RGD1 2014-10-04 17:20:04.04 2014-10-04 17:24:09.09 4.084167
37 5004.24 Striped Bass RGD1 2014-10-04 18:23:54.54 2014-10-04 18:52:30.30 28.612833
38 5004.24 Striped Bass RGD1 2014-10-04 19:04:09.09 2014-10-04 19:59:58.58 55.808000
39 5004.24 Striped Bass RGD1 2014-10-04 20:00:02.02 2014-10-04 20:59:56.56 59.892167
40 5004.24 Striped Bass RGD1 2014-10-04 21:00:00.00 2014-10-04 21:59:59.59 59.976333
41 5004.24 Striped Bass RGD1 2014-10-04 22:00:03.03 2014-10-04 22:59:57.57 59.892167
42 5004.24 Striped Bass RGD1 2014-10-04 23:00:13.13 2014-10-04 23:59:57.57 59.723833
43 5004.24 Striped Bass RGD1 2014-10-05 00:00:00.00 2014-10-05 00:59:43.43 59.723833
44 5004.24 Striped Bass RGD1 2014-10-05 01:01:34.34 2014-10-05 01:59:58.58 58.387167
45 5004.24 Striped Bass RGD1 2014-10-05 02:00:02.02 2014-10-05 02:57:00.00 56.959500
46 5004.24 Striped Bass RGD1 2014-10-05 03:08:26.26 2014-10-05 03:58:33.33 50.117833
47 5004.24 Striped Bass RGD1 2014-10-05 04:00:59.59 2014-10-05 04:58:36.36 57.629667
48 5004.24 Striped Bass RGD1 2014-10-05 05:03:22.22 2014-10-05 05:54:09.09 50.781167
49 5004.24 Striped Bass RGD1 2014-10-05 06:00:40.40 2014-10-05 06:57:54.54 57.218833
50 5004.24 Striped Bass RGD1 2014-10-05 07:11:13.13 2014-10-05 07:59:52.52 48.639667
51 5004.24 Striped Bass RGD1 2014-10-05 08:00:11.11 2014-10-05 08:59:55.55 59.723833
52 5004.24 Striped Bass RGD1 2014-10-05 09:00:43.43 2014-10-05 09:59:52.52 59.134667
53 5004.24 Striped Bass RGD1 2014-10-05 10:00:22.22 2014-10-05 10:56:15.15 55.882167
54 5004.24 Striped Bass RGD1 2014-10-05 11:02:31.31 2014-10-05 11:53:29.29 50.966333
55 5004.24 Striped Bass RGD1 2014-10-05 13:54:22.22 2014-10-05 13:59:12.12 4.831667
56 5004.24 Striped Bass RGD1 2014-10-05 22:00:40.40 2014-10-05 22:59:59.59 59.303000
57 5004.24 Striped Bass RGD1 2014-10-05 23:00:03.03 2014-10-05 23:59:37.37 59.572333
58 5004.24 Striped Bass RGD1 2014-10-06 00:00:36.36 2014-10-06 00:59:19.19 58.713833
59 5004.24 Striped Bass RGD1 2014-10-06 01:00:00.00 2014-10-06 01:59:54.54 59.892167
60 5004.24 Striped Bass RGD1 2014-10-06 02:00:38.38 2014-10-06 02:59:46.46 59.134667
61 5004.24 Striped Bass RGD1 2014-10-06 03:03:03.03 2014-10-06 03:59:16.16 56.218833
62 5004.24 Striped Bass RGD1 2014-10-06 04:00:11.11 2014-10-06 04:36:28.28 36.286167
63 5004.24 Striped Bass RGD1 2014-10-06 05:16:11.11 2014-10-06 05:58:33.33 42.370333
64 5004.24 Striped Bass RGD1 2014-10-06 12:00:40.40 2014-10-06 12:58:17.17 57.612833
65 5004.24 Striped Bass RGD1 2014-10-06 18:02:17.17 2014-10-06 18:12:23.23 10.101000
66 5004.24 Striped Bass RGD1 2014-10-06 19:44:35.35 2014-10-06 19:58:00.00 13.410833
67 5004.24 Striped Bass RGD1 2014-10-06 20:02:00.00 2014-10-06 20:59:59.59 57.976333
68 5004.24 Striped Bass RGD1 2014-10-06 21:00:03.03 2014-10-06 21:43:15.15 43.202000
69 5004.24 Striped Bass RGD1 2014-10-06 22:21:58.58 2014-10-06 22:59:49.49 37.865333
70 5004.24 Striped Bass RGD1 2014-10-06 23:00:35.35 2014-10-06 23:57:08.08 56.545500
71 5004.24 Striped Bass RGD1 2014-10-07 00:01:01.01 2014-10-07 00:59:19.19 58.303000
72 5004.24 Striped Bass RGD1 2014-10-07 01:01:32.32 2014-10-07 01:53:55.55 52.370333
73 5004.24 Striped Bass RGD1 2014-10-07 02:14:45.45 2014-10-07 02:59:33.33 44.798000
74 5004.24 Striped Bass RGD1 2014-10-07 03:15:54.54 2014-10-07 03:59:57.57 44.050500
75 5004.24 Striped Bass RGD1 2014-10-07 04:00:05.05 2014-10-07 04:31:31.31 31.437667
76 5004.24 Striped Bass RGD1 2014-10-07 05:33:56.56 2014-10-07 05:59:16.16 25.343500
77 5004.24 Striped Bass RGD1 2014-10-07 06:32:00.00 2014-10-07 06:43:00.00 11.006833
78 5004.24 Striped Bass RGD1 2014-10-07 07:02:25.25 2014-10-07 07:29:22.22 26.949500
79 5004.24 Striped Bass RGD1 2014-10-07 08:00:43.43 2014-10-07 08:51:26.26 50.713833
80 5004.24 Striped Bass RGD1 2014-10-07 09:04:32.32 2014-10-07 09:46:55.55 42.370333
81 5004.24 Striped Bass RGD1 2014-10-07 10:03:05.05 2014-10-07 10:32:47.47 29.707000
82 5004.24 Striped Bass RGD1 2014-10-07 11:52:15.15 2014-10-07 11:59:56.56 7.673333
83 5004.24 Striped Bass RGD1 2014-10-07 12:00:02.02 2014-10-07 12:42:19.19 42.286167
84 5004.24 Striped Bass RGD1 2014-10-07 13:03:10.10 2014-10-07 13:59:59.59 56.808000
85 5004.24 Striped Bass RGD1 2014-10-07 20:47:56.56 2014-10-07 20:50:00.00 2.074167
86 5004.24 Striped Bass RGD1 2014-10-07 21:27:12.12 2014-10-07 21:59:08.08 31.932667
87 5004.24 Striped Bass RGD1 2014-10-07 22:02:49.49 2014-10-07 22:59:16.16 56.444500
88 5004.24 Striped Bass RGD1 2014-10-07 23:00:27.27 2014-10-07 23:58:00.00 57.545500
89 5004.24 Striped Bass RGD1 2014-10-08 00:01:07.07 2014-10-08 01:00:00.00 58.882167
90 5004.24 Striped Bass RGD1 2014-10-08 01:00:09.09 2014-10-08 01:59:57.57 59.791167
91 5004.24 Striped Bass RGD1 2014-10-08 02:00:05.05 2014-10-08 02:59:03.03 58.966333
92 5004.24 Striped Bass RGD1 2014-10-08 03:04:10.10 2014-10-08 03:55:12.12 51.033667
93 5004.24 Striped Bass RGD1 2014-10-08 05:26:26.26 2014-10-08 05:59:28.28 33.033667
94 5004.24 Striped Bass RGD1 2014-10-08 06:02:49.49 2014-10-08 06:59:58.58 57.134667
95 5004.24 Striped Bass RGD1 2014-10-08 07:00:02.02 2014-10-08 07:59:40.40 59.639667
96 5004.24 Striped Bass RGD1 2014-10-08 08:00:07.07 2014-10-08 08:59:50.50 59.723833
97 5004.24 Striped Bass RGD1 2014-10-08 09:01:13.13 2014-10-08 09:51:35.35 50.370333
98 5004.24 Striped Bass RGD1 2014-10-08 10:04:53.53 2014-10-08 10:59:09.09 54.276167
99 5004.24 Striped Bass RGD1 2014-10-08 11:06:27.27 2014-10-08 11:31:23.23 24.932667
100 5004.24 Striped Bass RGD1 2014-10-08 20:03:30.30 2014-10-08 20:59:59.59 56.471333
- A unique individual identifier "AcousticTagCode"
- The individual's species "Species"
- An observation site "SiteCode"
- Start time for an event "StartDetection"
- An end time for an event "EndDetection"
- The duration of the event in minutes "Duration_min"
AcousticTagCode Species SiteCode StartDetection EndDetection Duration_min
1 5004.24 Striped Bass RGD1 2014-10-01 23:01:12.12 2014-10-01 23:59:41.41 58.48817
2 5004.24 Striped Bass RGD1 2014-10-02 00:00:06.06 2014-10-02 01:00:00.00 59.89217
3 5004.24 Striped Bass RGD1 2014-10-02 01:00:01.01 2014-10-02 01:20:12.12 20.18517
4 5004.24 Striped Bass RGD1 2014-10-02 04:14:15.15 2014-10-02 04:32:16.16 18.01683
5 5004.24 Striped Bass RGD1 2014-10-02 22:00:06.06 2014-10-02 22:59:54.54 59.79117
6 5004.24 Striped Bass RGD1 2014-10-02 23:00:10.10 2014-10-02 23:59:55.55 59.74067
There is an analysis done in a recent paper where the authors use survival analysis to determine Continual Residence Time (CRT) at a site by defining an optimal time interval or Maximum Blanking Period* (MBP*) between consecutive detections at which it is probable that the individual is still residing at the site, but outside of the range of detection.
The basic outline is this:
Define the initial time interval for 1 mbp
mbp <- 7 # initial time interval for 1 mbp (maximum blanking period), in seconds
Create a list of integers to multiply by mbp to test survival analysis for different time intervals
# Multipliers applied to `mbp`; each n * mbp is one candidate blanking period
# to test in the survival analysis.
n <- c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)
Here's where I'm stuck. For each value of n, I need to determine whether
n*mbp > StartDetection of Event[i+1] - EndDetection of Event[i]
If the above evaluates to TRUE, I add the durations of all consecutive events until the event where it evaluates to FALSE, OR the AcousticTagCode changes, OR the SiteCode changes.
For instance, if n = 1 and mbp = 7: in line 2 above, the EndDetection time is 01:00:00.00 and the StartDetection time of the next event (line 3) is 01:00:01.01. The difference is 1.01 seconds, which is less than n*mbp, therefore I would add the durations of lines 2 and 3 together as a new variable CRT. The problem is when more than 2 consecutive events all meet the above criteria, as below:
AcousticTagCode Species SiteCode StartDetection EndDetection Duration_min
38 5004.24 Striped Bass RGD1 2014-10-04 19:04:09.09 2014-10-04 19:59:58.58 55.80800
39 5004.24 Striped Bass RGD1 2014-10-04 20:00:02.02 2014-10-04 20:59:56.56 59.89217
40 5004.24 Striped Bass RGD1 2014-10-04 21:00:00.00 2014-10-04 21:59:59.59 59.97633
41 5004.24 Striped Bass RGD1 2014-10-04 22:00:03.03 2014-10-04 22:59:57.57 59.89217
Here, the events in lines 38-41 all occur within 7 seconds of each other, the AcousticTagCode is identical, and the SiteCode remains RGD1; therefore the event durations all need to be added together and called CRT.
Is there a way that I can take the initial data set here and create a new dataframe (df_n) for each iteration of n which has columns:
AcousticTagCode Species SiteCode CRT
Using the two examples above, this would look like lines 2 and i below:
head(df_1)
AcousticTagCode Species SiteCode CRT
1 5004.24 Striped Bass RGD1 58.49
2 5004.24 Striped Bass RGD1 80.08
...
i 5004.24 Striped Bass RGD1 235.57
Additionally, any time the condition is not met, the event is considered a standalone event and CRT = Duration_min, as seen in line 1 of the table above.
My skillset with R is fairly rudimentary, I'm sure there is a simple way to do this, but I'm not aware of it, and my searching skills have not availed me of anything remotely useful
Sample Data: found here:
回答1:
It sounds like you are trying to do two things: 1) For each unique combination of acoustic, species, site code and start-end time, find all other matching acoustic-species-site cases where its start time is <= (mbp + end time of the original combination), then add duration to get CRT; 2) repeat this process for each value of mbp x n.
Since you haven't included any sample data, I have created some example data to try and match yours: 100 cases, where acoustic `a` can be 1 or 2, `species` can be 'bass' or 'trout', `site` can be 'p' or 'q', plus some random values (`value`), start times, and end times that randomly end 5-15 seconds after each start time.
# Reproducible toy data standing in for the detection table: 100 events with
# a tag id (`a`), species, site, a numeric `value` to be summed, and a
# start/end time for each event.
set.seed(123)
df <- data.frame(
  a = sample(1:2, 100, replace = TRUE),
  species = sample(c("bass", "trout"), 100, replace = TRUE),
  site = sample(c("p", "q"), 100, replace = TRUE),
  value = round(runif(100), 2),
  # 100 distinct start times, one per second from 2000-01-01 00:00:00,
  # shuffled (sampled without replacement)
  start = sample(
    seq(ISOdate(2000, 1, 1, 0, 0, 0), by = "sec", length.out = 100),
    100,
    replace = FALSE
  ),
  stringsAsFactors = FALSE
)
# Each event ends a random 5-15 seconds after it starts
df$end <- df$start + sample(5:15, length(df$start), replace = TRUE)
To accomplish number 1, I think you can take this approach: use `complete` to create the sequence of times within 7 seconds of each combination's end time. Then use the possible sequences to join your original data. For example, if you have a row with 1-bass-p and end time 00:00:21, possible matches are other 1-bass-p rows whose start time is from 00:00:21 to 00:00:28 (the sequence in the code below starts at the end time itself and runs 7 seconds past it).
library(dplyr)
library(tidyr)

# Expand every distinct detection into one row per second of the window
# [end, end + 7]. `end.orig` keeps the detection's real end time and is the
# key of the final summary. `end` is deliberately NOT a grouping column:
# tidyr >= 1.2.0 refuses to complete() a column that is being grouped on.
# After distinct(), each group holds exactly one row, so `end` is a scalar
# inside seq().
df1 <- df %>%
  arrange(a, species, site, end) %>%
  distinct(a, species, site, start, end) %>%
  mutate(end.orig = end) %>%
  group_by(a, species, site, start, end.orig) %>%
  complete(end = seq(from = end, to = end + 7, by = "sec")) %>%
  ungroup()

# The detection itself: only the first row of each window (end == end.orig)
# matches `df` on both start and end, so this recovers each original `value`.
df.orig <- df1 %>%
  left_join(df, by = c("a", "species", "site", "start", "end")) %>%
  filter(!is.na(value))

# Follow-on detections: treat every second of the window as a candidate start
# time and look it up in `df` for the same a/species/site. Because the window
# is a whole-second grid joined by equality, a start time only matches when it
# is an exact whole number of seconds after the anchor's end time.
df.match <- df1 %>%
  select(-start) %>%
  rename(start = end) %>%
  left_join(df, by = c("a", "species", "site", "start")) %>%
  filter(!is.na(value))

# Per anchor detection (identified by end.orig): total value of the detection
# plus every detection starting inside its window, and how many rows were
# summed.
bind_rows(
  df.orig %>% select(a, species, site, value, end.orig),
  df.match %>% select(a, species, site, value, end.orig)
) %>%
  group_by(a, species, site, end.orig) %>%
  summarise(crt = sum(value), n = n())
# A tibble: 97 x 6
# Groups: a, species, site [8]
a species site end.orig crt n
<int> <chr> <chr> <dttm> <dbl> <int>
1 1 bass p 2000-01-01 00:00:21 1.58 3
2 1 bass p 2000-01-01 00:00:26 1.38 4
3 1 bass p 2000-01-01 00:00:36 2.27 4
4 1 bass p 2000-01-01 00:00:42 1.69 3
5 1 bass p 2000-01-01 00:00:46 1.23 2
6 1 bass p 2000-01-01 00:00:55 0.84 1
7 1 bass p 2000-01-01 00:01:02 1.32 2
8 1 bass p 2000-01-01 00:01:18 0.74 2
9 1 bass p 2000-01-01 00:01:29 0.54 2
10 1 bass p 2000-01-01 00:01:42 0.42 1
# ... with 87 more rows
To accomplish 2, you can use `lapply` to repeat this process over your values of mbp x n. This will return a list of 10 elements (i.e., one result per value of n x mbp).
# Windows to test, in seconds: the base blanking period times each multiplier.
mbp <- 7
n <- c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512) * mbp

# Repeat the window-matching summary once per window length in `n`.
# Requires dplyr and tidyr to be attached and `df` (columns a, species, site,
# value, start, end) to exist. Returns a list with one summary tibble per
# element of `n`, in the same order.
# Note: every distinct detection is expanded to (window + 1) rows, so memory
# use grows with both the window length and the number of detections.
f <- lapply(n, function(window) {
  # One row per second of [end, end + window] for each distinct detection.
  # `end` is kept out of group_by() because tidyr >= 1.2.0 refuses to
  # complete() a grouping column.
  df1 <- df %>%
    arrange(a, species, site, end) %>%
    distinct(a, species, site, start, end) %>%
    mutate(end.orig = end) %>%
    group_by(a, species, site, start, end.orig) %>%
    complete(end = seq(from = end, to = end + window, by = "sec")) %>%
    ungroup()

  # The anchor detection itself (matches `df` on its own start and end).
  df.orig <- df1 %>%
    left_join(df, by = c("a", "species", "site", "start", "end")) %>%
    filter(!is.na(value))

  # Detections of the same a/species/site that start inside the window.
  df.match <- df1 %>%
    select(-start) %>%
    rename(start = end) %>%
    left_join(df, by = c("a", "species", "site", "start")) %>%
    filter(!is.na(value))

  bind_rows(
    df.orig %>% select(a, species, site, value, end.orig),
    df.match %>% select(a, species, site, value, end.orig)
  ) %>%
    group_by(a, species, site, end.orig) %>%
    summarise(crt = sum(value), n = n())
})
来源:https://stackoverflow.com/questions/57514285/grouping-of-event-time-data-based-on-multiple-iterative-conditions