问题
I downloaded a file from a website with `rvest`. How can I save the response as a csv file?
Step 1: Monkey patch the `rvest` package as described in this thread: "How to submit login form in Rvest package w/o button argument"
library(tidyverse)
library(rvest)
library(R.utils)
# Monkey patch for rvest's internal submit_request().
#
#' Build the request description for submitting an HTML form.
#'
#' Meant to be installed over rvest's internal `submit_request()` (see the
#' `reassignInPackage()` call that follows). Fields that have no `type` are
#' treated as non-submit fields instead of being passed to `tolower()`.
#'
#' The function uses base R only, so it behaves the same regardless of which
#' environment it is evaluated in (in particular it does not depend on which
#' `pluck()` happens to be on the search path).
#'
#' @param form A form object: a list with elements `method`, `enctype`, `url`
#'   and `fields`, where `fields` is a named list of lists that may carry
#'   `type` and `value` elements.
#' @param submit Name of the submit field to use. If `NULL`, the first field
#'   whose type is "submit", "image" or "button" is used and a message is
#'   emitted.
#' @return A list with elements `method`, `encode`, `url` and `values`
#'   (a named list of field values to send).
custom.submit_request <- function(form, submit = NULL) {
  submit_types <- c("submit", "image", "button")

  # A field is a submission target only when it has a non-NULL `type`.
  # `[[` is used for an exact-name lookup (no partial matching).
  is_submit <- function(x) {
    type <- x[["type"]]
    if (is.null(type)) {
      return(FALSE)
    }
    tolower(type) %in% submit_types
  }

  submits <- Filter(is_submit, form$fields)
  if (length(submits) == 0) {
    stop("Could not find possible submission target.", call. = FALSE)
  }

  if (is.null(submit)) {
    submit <- names(submits)[[1]]
    message("Submitting with '", submit, "'")
  }
  if (!(submit %in% names(submits))) {
    stop(
      "Unknown submission name '", submit, "'.\n", "Possible values: ",
      paste0(names(submits), collapse = ", "),
      call. = FALSE
    )
  }

  # Submit fields other than the chosen one must not be sent.
  other_submits <- setdiff(names(submits), submit)

  method <- form$method
  if (!(method %in% c("POST", "GET"))) {
    warning("Invalid method (", method, "), defaulting to GET", call. = FALSE)
    method <- "GET"
  }

  # Keep only fields that have a value, then drop the unused submit fields.
  fields <- Filter(function(x) length(x$value) > 0, form$fields)
  fields <- fields[setdiff(names(fields), other_submits)]

  # Extract each field's value with base R rather than an unqualified pluck().
  values <- lapply(fields, "[[", "value")
  names(values) <- names(fields)

  list(method = method, encode = form$enctype, url = form$url, values = values)
}
# Replace the function named "submit_request" inside the rvest package with
# custom.submit_request (reassignInPackage() is provided by R.utils).
# NOTE(review): presumably submit_form() calls submit_request() internally,
# which is why patching it affects form submission -- confirm against the
# installed rvest version.
reassignInPackage("submit_request", "rvest", custom.submit_request)
Step 2: Download file
# Start scraping: open a session on the page given by `url`.
url <- "https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDExpenditures.aspx"
session_1 <- html_session(url)
# List the `value` attribute of every ".BlueButton" node; per the output
# below there are two such buttons.
session_1 %>%
html_nodes(".BlueButton") %>%
html_attr(name = "value")
#> [1] "Search" "Export"
# "Click" the Export button: submit the first form on the page, naming the
# submit field to use (presumably the Export button's field name).
form <- html_form(session_1)[[1]]
session_2 <- submit_form(session = session_1, form = form,
submit = "M$C$sCDTransactions$csfFilter$btnExport")
# The resulting page has multiple ".BlueButton" hyperlinks.
# Take the href of the 4th one as the csv link and resolve it against the
# session URL.
# NOTE(review): the index 4 is position-based, so a change in the page layout
# would silently select a different link.
url_csv <- session_2 %>%
html_nodes(".BlueButton") %>%
html_attr(name = "href") %>%
magrittr::extract2(4) %>%
url_absolute(base = session_2$url)
# Request the csv link within the same session and print the response.
file <- jump_to(session_2, url_csv)
file$response
#> Response [https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDExpenditures.aspx?exportAll=False&exportFormat=CSV&isExport=True]
#> Date: 2018-09-22 17:49
#> Status: 200
#> Content-Type: text/comma-separated-values; charset=utf-8
#> Size: 6.34 kB
#> "Result","Date","Transaction Type","Payment Type","Payment Detail","Amou...
#> 1,5/8/2017,Expenditure,Future Campaign Account,,$200.00,US Postal Servic...
#> 2,11/29/2017,Expenditure,Bank Fee,,$12.00,Denali FCU,,440 E 36th Ave,Anc...
#> 3,1/1/2018,Expenditure,Electronic Funds Transfer,,$3.54,Google,,1600 Amp...
#> 4,12/31/2017,Expenditure,Electronic Funds Transfer,,$107.89,PayPal,,1840...
#> 5,1/31/2018,Expenditure,Electronic Funds Transfer,,$16.42,Paypal,,1840 E...
#> 6,2/1/2018,Expenditure,Check,197,$300.00,Corbett,Joshua,2448 Sprucewood ...
#> 7,2/1/2018,Expenditure,Electronic Funds Transfer,,$5.00,Google,,1600 Amp...
#> 8,2/28/2017,Expenditure,Bank Fee,,$4.10,First National Bank Alaska,,646 ...
#> 9,3/31/2017,Expenditure,Bank Fee,,$4.10,First National Bank Alaska,,646 ...
#> ...
Created on 2018-09-22 by the reprex package (v0.2.1)
The response looks promising. How can I save that response directly as a csv file?
回答1:
# Extract the body of the response as text and write it to "file.csv".
# NOTE(review): write_lines() is presumably readr's, attached through
# library(tidyverse) in the question's script.
httr::content(file$response, as="text") %>% write_lines("file.csv")
I'm answering this so the question can be marked as solved. All credit goes to @hrbrmstr.
来源:https://stackoverflow.com/questions/52459436/save-response-from-web-scraping-as-csv-file