问题
I have a dataframe df.sample
like this
# Example sample data: one reading per row. The timestamp of each reading is
# split into a calendar date plus separate hour and minute columns.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)

# Parse the ISO date strings into Date objects.
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")
I have another data frame df.state
like this
# Example state data: one row per interval, bounded by starttime and endtime.
id <- rep("A", 3)
starttime <- c(
  "2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00"
)
endtime <- c(
  "2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32"
)
state <- rep("Pass", 3)

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)

# Parse both interval bounds into POSIXct date-times. No `tz` is given, so the
# strings are interpreted in the session's local time zone.
df.state$starttime <- as.POSIXct(
  df.state$starttime,
  format = "%Y-%m-%d %H:%M:%S"
)
df.state$endtime <- as.POSIXct(
  df.state$endtime,
  format = "%Y-%m-%d %H:%M:%S"
)
I am trying to merge these 2 data frames based on a condition: if the time given by hour and min in df.sample falls between the starttime and endtime of a row in df.state, then add state = Pass to that row of df.sample.
For example, row 2 in df.sample has hour = 8, min = 59, and since that time falls within the interval that begins at starttime = 2018-11-12 08:59:00 in df.state, the value Pass is added.
Here is my desired output
id date hour min value state
A 2018-11-12 8 47 70
A 2018-11-12 8 59 70 Pass
A 2018-11-12 9 6 86 Pass
A 2018-11-12 9 18 86 Pass
A 2018-11-12 13 22 86 Pass
A 2018-11-12 13 36 74 Pass
A 2018-11-12 16 12 81
A 2018-11-14 6 32 77 Pass
A 2018-11-14 7 12 79 Pass
A 2018-11-14 19 21 83
A 2018-11-12 7 47 91
I am able to merge these 2 dataframes like this but not able to look up hour and min of df.sample in the starttime and endtime of df.state
library(tidyverse)

# Plain left join of the samples onto the states. No `by` is supplied, so the
# join uses the columns the two tables have in common; it performs no lookup
# of hour/min against starttime/endtime.
df.sample <- left_join(df.sample, df.state)
Can someone point me in the right direction?
回答1:
Using non-equi join from data.table
package is much faster and easier if you happen to have big data frames: Benchmark | Video
library(data.table)

# Turn both data.frames into data.tables by reference.
setDT(df.state)
setDT(df.sample)

# Add a POSIXct `time` column to df.sample, built from date, hour and min.
df.sample[
  ,
  time := as.POSIXct(paste(date, paste(hour, min, "00", sep = ":")))
]

# Move `id` and `time` to the front of df.sample.
setcolorder(df.sample, c("id", "time"))

# Non-equi join: for every sample row, look for a state row with the same id
# whose [starttime, endtime] interval contains the sample's time.
# The "x." prefix picks the `state` column of df.state explicitly.
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time)
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:47:00 2018-11-12 8 47 70 <NA>
#> 2: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 3: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 4: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 5: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 6: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 7: A 2018-11-12 16:12:00 2018-11-12 16 12 81 <NA>
#> 8: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 9: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
#> 10: A 2018-11-14 19:21:00 2018-11-14 19 21 83 <NA>
#> 11: A 2018-11-12 07:47:00 2018-11-12 7 47 91 <NA>
# Same join, but `nomatch = 0L` drops the sample rows that fall inside no
# interval instead of returning them with state = NA.
df.state[
  df.sample,
  .(id, time, date, hour, min, value, state = x.state),
  on = .(id, starttime <= time, endtime >= time),
  nomatch = 0L
]
#> id time date hour min value state
#> 1: A 2018-11-12 08:59:00 2018-11-12 8 59 70 Pass
#> 2: A 2018-11-12 09:06:00 2018-11-12 9 6 86 Pass
#> 3: A 2018-11-12 09:18:00 2018-11-12 9 18 86 Pass
#> 4: A 2018-11-12 13:22:00 2018-11-12 13 22 86 Pass
#> 5: A 2018-11-12 13:36:00 2018-11-12 13 36 74 Pass
#> 6: A 2018-11-14 06:32:00 2018-11-14 6 32 77 Pass
#> 7: A 2018-11-14 07:12:00 2018-11-14 7 12 79 Pass
Created on 2019-05-23 by the reprex package (v0.3.0)
回答2:
(Important prep note: as.POSIXct
creates POSIXct values with the local time zone, whereas lubridate::ymd
creates UTC times. You will get unexpected results if the time zones vary in your join below.)
# Re-parse both interval bounds with lubridate so they share the time zone
# lubridate uses for the sample timestamps.
for (bound in c("starttime", "endtime")) {
  df.state[[bound]] <- lubridate::ymd_hms(df.state[[bound]])
}
This can be done with fuzzyjoin:
library(fuzzyjoin)

# Build a full timestamp for each sample, then left-join the states with one
# comparison per key pair: ids must be equal, and sample_time must be
# >= starttime and <= endtime.
fuzzy_left_join(
  mutate(df.sample, sample_time = lubridate::ymd_hm(paste(date, hour, min))),
  df.state,
  by = c(
    "id" = "id",
    "sample_time" = "starttime",
    "sample_time" = "endtime"
  ),
  match_fun = list(`==`, `>=`, `<=`)
)
id.x date hour min value sample_time id.y starttime endtime state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00 <NA> <NA> <NA> <NA>
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 A 2018-11-12 08:59:00 2018-11-12 15:57:00 Pass
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00 <NA> <NA> <NA> <NA>
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 A 2018-11-14 06:24:17 2018-11-14 17:22:16 Pass
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00 <NA> <NA> <NA> <NA>
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00 <NA> <NA> <NA> <NA>
回答3:
Can be done by first adding a time column to your df.sample
data.frame and then doing an evaluation based on your criteria using sapply
and adding this result to df.sample
# Build a zero-padded "YYYY-mm-dd HH:MM:00" timestamp string for every sample.
df.sample$time <- paste0(
  df.sample$date, " ",
  sprintf("%02d", df.sample$hour), ":",
  sprintf("%02d", df.sample$min), ":00"
)

# Parse the timestamps once, up front, rather than relying on an implicit
# character-to-POSIXct coercion inside every comparison.
sample_time <- as.POSIXct(df.sample$time, format = "%Y-%m-%d %H:%M:%S")

# Flag each sample that lies inside at least one interval of df.state.
# - The id must match too, so a sample is never matched against another id's
#   interval.
# - vapply() always returns an unnamed character vector, including for a
#   zero-row df.sample.
df.sample$state <- vapply(
  seq_len(nrow(df.sample)),
  function(i) {
    in_interval <- df.state$id == df.sample$id[i] &
      sample_time[i] >= df.state$starttime &
      sample_time[i] <= df.state$endtime
    if (any(in_interval, na.rm = TRUE)) "PASS" else ""
  },
  character(1)
)

df.sample
id date hour min value time state
1 A 2018-11-12 8 47 70 2018-11-12 08:47:00
2 A 2018-11-12 8 59 70 2018-11-12 08:59:00 PASS
3 A 2018-11-12 9 6 86 2018-11-12 09:06:00 PASS
4 A 2018-11-12 9 18 86 2018-11-12 09:18:00 PASS
5 A 2018-11-12 13 22 86 2018-11-12 13:22:00 PASS
6 A 2018-11-12 13 36 74 2018-11-12 13:36:00 PASS
7 A 2018-11-12 16 12 81 2018-11-12 16:12:00
8 A 2018-11-14 6 32 77 2018-11-14 06:32:00 PASS
9 A 2018-11-14 7 12 79 2018-11-14 07:12:00 PASS
10 A 2018-11-14 19 21 83 2018-11-14 19:21:00
11 A 2018-11-12 7 47 91 2018-11-12 07:47:00
回答4:
What I have done is extract the decimal hour from each data frame that you supplied, so that I can check whether a sample's decimal hour falls between a state's start and end decimal hours. But first, you have to merge the datasets on id (assuming you have other ids) and date (assuming that there is only one state per day; in other words, each date appears at most once per id in the df.state dataset).
# Rebuild the sample data from the question.
id <- rep("A", 11)
date <- c(
  "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12", "2018-11-12",
  "2018-11-12", "2018-11-12", "2018-11-14", "2018-11-14", "2018-11-14",
  "2018-11-12"
)
hour <- c(8, 8, 9, 9, 13, 13, 16, 6, 7, 19, 7)
min <- c(47, 59, 6, 18, 22, 36, 12, 32, 12, 21, 47)
value <- c(70, 70, 86, 86, 86, 74, 81, 77, 79, 83, 91)

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.sample <- data.frame(id, date, hour, min, value, stringsAsFactors = FALSE)
df.sample$date <- as.Date(df.sample$date, format = "%Y-%m-%d")

# Time of day as a decimal hour, e.g. 08:59 -> 8 + 59 / 60.
df.sample$dec.hour <- as.numeric(df.sample$hour) +
  as.numeric(df.sample$min) / 60
All I have added above are these last couple of lines to calculate a decimal hour from the hour and minute values that you have provided
# Rebuild the state data from the question; the interval bounds stay as
# character strings here.
id <- rep("A", 3)
starttime <- c(
  "2018-11-12 08:59:00", "2018-11-14 06:24:17", "2018-11-15 09:17:00"
)
endtime <- c(
  "2018-11-12 15:57:00", "2018-11-14 17:22:16", "2018-11-15 12:17:32"
)
state <- rep("Pass", 3)

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
df.state <- data.frame(id, starttime, endtime, state, stringsAsFactors = FALSE)
Here I have added a date vector (for the merge). I arbitrarily chose starttime, assuming the date for start and endtime are always the same.
# Calendar date of each interval, taken from the leading "YYYY-mm-dd" part of
# starttime; this is the merge key.
df.state[["date"]] <- as.Date(df.state[["starttime"]], format = "%Y-%m-%d")
Then I get a decimal hour for both the start and end times, on that date
# Decimal hour of each interval's start and end.
# Seconds are included: otherwise a start of 06:24:17 is treated as 06:24:00
# and a sample taken at 06:24 would wrongly count as inside the interval.
t.str <- strptime(df.state$starttime, "%Y-%m-%d %H:%M:%S")
df.state$dec.hour.start <- t.str$hour + t.str$min / 60 + t.str$sec / 3600

t.end <- strptime(df.state$endtime, "%Y-%m-%d %H:%M:%S")
df.state$dec.hour.end <- t.end$hour + t.end$min / 60 + t.end$sec / 3600
merge the dataframes by id and date
# Left join on id and date: all.x = TRUE keeps every sample row, including
# samples taken on a date that has no row in df.state (their state columns
# are NA), instead of silently dropping them.
df <- merge(df.sample, df.state, by = c("id", "date"), all.x = TRUE)
If the decimal hour of the sample is between the start and end decimal hours (for that date), then return TRUE for state.
# Overwrite `state` with a logical flag: TRUE when the sample's decimal hour
# lies within [dec.hour.start, dec.hour.end].
df$state <- df$dec.hour >= df$dec.hour.start & df$dec.hour <= df$dec.hour.end
Now if you want to get rid of all of these extra columns that I have created (so it looks like your desired output):
# Drop the helper columns by name rather than by position, so the result does
# not depend on the exact column order produced by the merge.
helper_cols <- c(
  "dec.hour", "starttime", "endtime", "dec.hour.start", "dec.hour.end"
)
df <- df[, setdiff(names(df), helper_cols), drop = FALSE]
Because df$state is logical, you have to first turn the values into characters if you want to change TRUE to pass and FALSE to a blank space:
# Recode the logical flag as text: TRUE becomes "pass", FALSE becomes "".
state_chr <- as.character(df$state)
state_chr[state_chr %in% "TRUE"] <- "pass"
state_chr[state_chr %in% "FALSE"] <- ""
df$state <- state_chr
Take a look:
# Show the final data frame.
print(df)
> df
id date hour min value state
1 A 2018-11-12 8 47 70
2 A 2018-11-12 8 59 70 pass
3 A 2018-11-12 9 6 86 pass
4 A 2018-11-12 9 18 86 pass
5 A 2018-11-12 13 22 86 pass
6 A 2018-11-12 13 36 74 pass
7 A 2018-11-12 16 12 81
8 A 2018-11-12 7 47 91
9 A 2018-11-14 6 32 77 pass
10 A 2018-11-14 7 12 79 pass
11 A 2018-11-14 19 21 83
I used this post: extract hours and seconds from POSIXct for plotting purposes in R to extract decimal hours and this one: Check to see if a value is within a range? to see whether or not your sample time was within your state time.
来源:https://stackoverflow.com/questions/56281178/merge-2-dataframes-using-conditions-on-hour-and-min-of-df1-in-datetimes-of-d