问题
I am struggling with downloading (ideally csv, but I could also deal with html format) data from the Alberta Electric System Operator site (AESO Site). The data are accessed by completing the form and then clicking the OK radio button.
I've worked through trying to access this using both rvest and curl, but have run up against a wall. The issue appears to be that the servlet is housed inside a frame.
I think this is as close as I've gotten using getForm:
# Asker's attempt: submit the historical-reports form with a POST request.
# Address of the page the form fields are posted to.
url <- "http://ets.aeso.ca/ets_web/docroot/Market/Reports/HistoricalReportsStart.html"
# The POST below only runs when url.exists(url) is TRUE. Its return value is
# not assigned to anything, so the response is not kept.
if(url.exists(url))
postForm(url,
SelectFormat = "html",
SelectReport = "--- Metered Volumes (All)",
BeginMonth = 12,
BeginDay = 12,
BeginYear =2016,
EndMonth = 12,
EndDay =13,
EndYear =2016,
radiobutton = "OK",submit = "OK", style = "POST")
# NOTE(review): `fd` is not defined anywhere in this snippet, so this call
# cannot run as shown -- confirm what parameter list was intended.
test<-getForm(url, .params = fd)
and, I've also tried using rvest:
# Asker's second attempt: open a browsing session on the same `url` and
# extract the forms found on that page.
s <- html_session(url)
# Per the asker, f0 comes back without content because the form is housed
# inside a frame rather than on this page itself.
f0 <- html_form(s)
However, this seems to be where I am getting the error, since the form is housed within a frame, so I get no content in f0.
Any help would be much appreciated.
回答1:
Just hit the report directly:
library(httr)
library(rvest)
library(stringi)
library(tidyverse)
#' Download the AESO metered-volumes report for a date range
#'
#' Requests the report straight from the `PublicSummaryAllReportServlet`
#' endpoint (bypassing the framed HTML form), asks for CSV output, and
#' reshapes the response text into a data frame.
#'
#' @param start_date,end_date Report bounds; each is passed through
#'   `as.Date()`, so a `Date` or an ISO "YYYY-MM-DD" string works. They are
#'   sent to the server formatted as `%m%d%Y`.
#' @return A tibble with cleaned (snake_case) column names. Columns are
#'   whatever `read.csv()` infers from the response; no type conversion is
#'   done here.
#' @details Stops with an HTTP error if the request fails, and with a plain
#'   error if the response has fewer than 11 lines.
get_metered_volumes_report <- function(start_date, end_date) {
  start_date <- as.Date(start_date)
  end_date <- as.Date(end_date)

  res <- GET(
    url = "http://ets.aeso.ca/ets_web/ip/Market/Reports/PublicSummaryAllReportServlet",
    query = list(
      beginDate = format(start_date, "%m%d%Y"),
      endDate = format(end_date, "%m%d%Y"),
      contentType = "csv"
    )
  )
  stop_for_status(res)

  # Neither the CSV nor HTML output is all that great but the CSV
  # can be made to work with (IMO) less effort than the HTML. You may
  # need to do some extra checks for data format (for either CSV or
  # HTML), though, in "production" mode.
  # From what I saw in the output, you likely need to modify
  # this attempt at munging since the "hours" seem off, but you
  # at least now have the data.

  # Split the response body into a plain character vector of lines.
  l <- content(res, as = "text") %>%
    stri_split_lines() %>%
    flatten_chr()

  # The munging below needs lines 8, 9 and 11 onward; on a shorter response
  # `11:length(l)` would count backwards and silently build a bogus table.
  if (length(l) < 11) {
    stop(
      "Unexpected report format: expected at least 11 lines, got ",
      length(l),
      call. = FALSE
    )
  }

  # Lines 8 and 9 are glued together with a comma to form the header row,
  # line 10 is dropped, and everything from line 11 on is treated as data.
  csv_text <- paste0(
    c(paste0(l[8:9], collapse = ","), l[11:length(l)]),
    collapse = "\n"
  )

  read.csv(text = csv_text, header = TRUE, stringsAsFactors = FALSE) %>%
    janitor::clean_names() %>%
    as_tibble()
}
Example:
# Fetch the report for 12-13 December 2016 and print the result.
xdf <- get_metered_volumes_report("2016-12-12", "2016-12-13")
xdf
## # A tibble: 2,877 x 30
## pool_participant_id asset_type asset_id x x_1 x_2 hour_1 hour_2 hour_3 hour_4 hour_5 hour_6 hour_7 hour_8 hour_9 hour_10 hour_11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 - - - 28.40 23.07 21.41 22.22 23.78 37.37 38.94 39.97 46.00 47.26 38.49 42.51 41.15 43.91
## 2 4285 IPP 42G1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 3 9496 RETAILER 941A 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 4 9496 RETAILER 941C 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 5 9496 RETAILER 941E 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 6 9496 RETAILER 941F 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 7 9496 RETAILER 941L 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 8 9496 RETAILER 941P 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 9 9496 RETAILER 941R 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 10 9496 RETAILER 941U 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## # ... with 2,867 more rows, and 13 more variables: hour_12 <chr>, hour_13 <chr>, hour_14 <chr>, hour_15 <chr>, hour_16 <chr>, hour_17 <chr>,
## # hour_18 <chr>, hour_19 <chr>, hour_20 <chr>, hour_21 <chr>, hour_22 <lgl>, hour_23 <lgl>, hour_24 <lgl>
and:
# Column-by-column overview of the same result.
glimpse(xdf)
## Observations: 2,877
## Variables: 30
## $ pool_participant_id <chr> "-", "4285", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9558", "9558", "9558", "95...
## $ asset_type <chr> "-", "IPP", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RET...
## $ asset_id <chr> "-", "42G1", "941A", "941C", "941E", "941F", "941L", "941P", "941R", "941U", "941X", "G035", "G036", "951A", "95...
## $ x <chr> "28.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_1 <chr> "23.07", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_2 <chr> "21.41", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_1 <chr> "22.22", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_2 <chr> "23.78", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_3 <chr> "37.37", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_4 <chr> "38.94", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_5 <chr> "39.97", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_6 <chr> "46.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_7 <chr> "47.26", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_8 <chr> "38.49", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_9 <chr> "42.51", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_10 <chr> "41.15", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_11 <chr> "43.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_12 <chr> "46.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_13 <chr> "45.73", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_14 <chr> "49.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_15 <chr> "34.90", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_16 <chr> "25.82", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_17 <chr> "24.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_18 <chr> "25.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_19 <chr> "27.99", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_20 <chr> "29.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_21 <chr> "24.27", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_22 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_23 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_24 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
You can target other report URLs as well:
来源:https://stackoverflow.com/questions/47964334/downloading-file-via-html-form-in-a-frame