Downloading file via HTML form in a frame

纵饮孤独 提交于 2019-12-11 01:44:59

问题


I am struggling with downloading data (ideally as CSV, but I could also deal with HTML format) from the Alberta Electric System Operator site (AESO Site). The data are accessed by completing the form and then clicking the OK radio button.

I've worked through trying to access this using both rvest and curl, but have run up against a wall. The issue appears to be that the servlet is housed inside a frame.

I think this is as close as I've gotten using getForm:

 # Address of the page hosting the report form (per the surrounding text,
 # the form itself sits inside a frame on this page).
 url <- "http://ets.aeso.ca/ets_web/docroot/Market/Reports/HistoricalReportsStart.html"

         # Only submit the form when the URL is reachable.
         # NOTE(review): the value returned by postForm() is not assigned to
         # anything here, so the response is not kept for later use.
         if(url.exists(url)) 
       postForm(url,
                SelectFormat  = "html",
                SelectReport = "--- Metered Volumes (All)",
                BeginMonth = 12,
                BeginDay = 12,
                BeginYear =2016,
                EndMonth = 12,
                EndDay =13,
                EndYear =2016,
                radiobutton = "OK",submit = "OK", style = "POST")

     # NOTE(review): `fd` is not defined anywhere in this snippet, so this
     # call cannot run as shown.
     test<-getForm(url, .params = fd)

and, I've also tried using rvest:

 # Open a browsing session on `url` (defined in the earlier snippet) and
 # collect the forms rvest finds in the returned page.
 s <- html_session(url)
 f0 <- html_form(s)

However, this seems to be where I am getting the error, since the form is housed within a frame, so I get no content in f0.

Any help would be much appreciated.


回答1:


Just hit the report directly:

library(httr)
library(rvest)
library(stringi)
library(tidyverse)

#' Download the AESO metered-volumes summary report for a date range
#'
#' Requests the report servlet directly (instead of going through the framed
#' HTML form) and parses the CSV response.
#'
#' @param start_date,end_date First and last day of the report. Anything
#'   `as.Date()` can convert, e.g. `"2016-12-12"`.
#' @return A tibble of the report rows with column names normalised by
#'   `janitor::clean_names()`. Column types are whatever `read.csv()` infers.
#' @details Signals an error if either date converts to `NA`, if the HTTP
#'   request fails (via `stop_for_status()`), or if the response has fewer
#'   than 11 lines.
get_metered_volumes_report <- function(start_date, end_date) {

  start_date <- as.Date(start_date)
  end_date <- as.Date(end_date)

  # format(NA) would put the literal string "NA" into the query, so fail early.
  if (anyNA(start_date) || anyNA(end_date)) {
    stop("`start_date` and `end_date` must be valid dates.", call. = FALSE)
  }

  res <- GET(
    url = "http://ets.aeso.ca/ets_web/ip/Market/Reports/PublicSummaryAllReportServlet",
    query = list(
      beginDate = format(start_date, "%m%d%Y"),
      endDate = format(end_date, "%m%d%Y"),
      contentType = "csv"
    )
  )

  stop_for_status(res)

  # Neither the CSV nor HTML output is all that great but the CSV
  # can be made to work with (IMO) less effort than the HTML. You may
  # need to do some extra checks for data format (for either CSV or
  # HTML), though, in "production" mode.

  # From what I saw in the output, you likely need to modify
  # this attempt at munging since the "hours" seem off, but you
  # at least now have the data.

  # One element per line of the response body.
  l <- content(res, as = "text") %>%
    stri_split_lines() %>%
    flatten_chr()

  # The munging below indexes lines 8, 9 and 11 onward; with a shorter
  # response `11:length(l)` would count backwards and produce garbage.
  if (length(l) < 11) {
    stop(
      "Unexpected report format: expected at least 11 lines, got ",
      length(l), ".",
      call. = FALSE
    )
  }

  # Lines 8 and 9 are joined to form the header row; line 10 is skipped and
  # the remaining lines are treated as data.
  header_line <- paste0(l[8:9], collapse = ",")
  csv_text <- paste0(c(header_line, l[11:length(l)]), collapse = "\n")

  read.csv(text = csv_text, header = TRUE, stringsAsFactors = FALSE) %>%
    janitor::clean_names() %>%
    as_tibble()

}

Example:

xdf <- get_metered_volumes_report("2016-12-12", "2016-12-13")

xdf
## # A tibble: 2,877 x 30
##    pool_participant_id asset_type asset_id      x    x_1    x_2 hour_1 hour_2 hour_3 hour_4 hour_5 hour_6 hour_7 hour_8 hour_9 hour_10 hour_11
##                  <chr>      <chr>    <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>   <chr>   <chr>
##  1                   -          -        -  28.40  23.07  21.41  22.22  23.78  37.37  38.94  39.97  46.00  47.26  38.49  42.51   41.15   43.91
##  2                4285        IPP     42G1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  3                9496   RETAILER     941A 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  4                9496   RETAILER     941C 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  5                9496   RETAILER     941E 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  6                9496   RETAILER     941F 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  7                9496   RETAILER     941L 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  8                9496   RETAILER     941P 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  9                9496   RETAILER     941R 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
## 10                9496   RETAILER     941U 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
## # ... with 2,867 more rows, and 13 more variables: hour_12 <chr>, hour_13 <chr>, hour_14 <chr>, hour_15 <chr>, hour_16 <chr>, hour_17 <chr>,
## #   hour_18 <chr>, hour_19 <chr>, hour_20 <chr>, hour_21 <chr>, hour_22 <lgl>, hour_23 <lgl>, hour_24 <lgl>

and:

glimpse(xdf)
## Observations: 2,877
## Variables: 30
## $ pool_participant_id <chr> "-", "4285", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9558", "9558", "9558", "95...
## $ asset_type          <chr> "-", "IPP", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RET...
## $ asset_id            <chr> "-", "42G1", "941A", "941C", "941E", "941F", "941L", "941P", "941R", "941U", "941X", "G035", "G036", "951A", "95...
## $ x                   <chr> "28.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_1                 <chr> "23.07", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_2                 <chr> "21.41", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_1              <chr> "22.22", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_2              <chr> "23.78", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_3              <chr> "37.37", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_4              <chr> "38.94", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_5              <chr> "39.97", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_6              <chr> "46.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_7              <chr> "47.26", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_8              <chr> "38.49", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_9              <chr> "42.51", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_10             <chr> "41.15", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_11             <chr> "43.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_12             <chr> "46.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_13             <chr> "45.73", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_14             <chr> "49.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_15             <chr> "34.90", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_16             <chr> "25.82", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_17             <chr> "24.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_18             <chr> "25.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_19             <chr> "27.99", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_20             <chr> "29.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_21             <chr> "24.27", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_22             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_23             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_24             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

You can target other report URLs as well.



来源:https://stackoverflow.com/questions/47964334/downloading-file-via-html-form-in-a-frame

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!