I'm able to scrape data off of basic HTML pages, but I'm having trouble scraping the site below. It looks like the data is presented via JavaScript, and I'm not sure how to scrape it.
Using relenium:
# Scrape listing rows from machinerytrader.com, one result page at a time.
# Each page is loaded in a browser driven by relenium and the page source the
# browser reports is parsed with XML, because the listing data does not appear
# to be available in the plain HTML response.
#
# library() is used instead of require(): require() only returns FALSE when a
# package is missing, so the script would fail later with a confusing error.
library(relenium) # More info: https://github.com/LluisRamon/relenium
library(XML)

firefox <- firefoxClass$new() # init browser

pages <- 1:2
url_template <- "http://www.machinerytrader.com/list/list.aspx?pg=%d&bcatid=4&DidSearch=1&EID=1&LP=MAT&ETID=5&catid=1015&mdlx=Contains&Cond=All&SO=26&btnSearch=Search&units=imperial"

# Text of the `column`-th cell of every row in each table whose id ENDS WITH
# `id_suffix`. The "ends with" test is spelled out with substring() and
# string-length(); the offset is derived from the suffix itself so the two
# cannot drift apart.
cell_text <- function(doc, id_suffix, column) {
  xpath <- sprintf(
    '//table[substring(@id, string-length(@id)-%d) = "%s"]/tbody/tr/td[%d]',
    nchar(id_suffix) - 1L, id_suffix, column
  )
  xpathSApply(doc, xpath, xmlValue)
}

# Collect one character matrix per page and bind them once at the end rather
# than growing `res` with rbind() on every iteration.
page_rows <- vector("list", length(pages))
for (i in seq_along(pages)) {
  page <- pages[[i]]
  url <- sprintf(url_template, page)
  firefox$get(url)
  doc <- htmlParse(firefox$getPageSource())
  page_rows[[i]] <- cbind(
    year_manu_model = cell_text(doc, "tblListingHeader", 1L),
    sn = cell_text(doc, "tblListingHeader", 2L),
    price = cell_text(doc, "tblListingHeader", 3L),
    loc = cell_text(doc, "tblListingHeader", 4L),
    auc = cell_text(doc, "tblContent", 2L)
  )
}
res <- do.call(rbind, page_rows)

# Show at most the first 30 characters of every field (R strings are 1-based).
sapply(as.data.frame(res), substr, 1, 30)
# year_manu_model sn price loc auc
# [1,] " 1972 AMERICAN 5530" "GS14745W" "US $50,100" "MI " "\n\t\t\t\t\tAuction: 1/9/2013; 4,796"
# [2,] " AUSTIN-WESTERN 307" "307" "US $3,400" "MT " "\n\t\t\t\t\tDetails & Photo(s)Video("
# ...
library(XML)
library(relenium)
## Load the first result page in a browser session and parse the page source
website <- firefoxClass$new()
website$get("http://www.machinerytrader.com/list/list.aspx?pg=1&bcatid=4&DidSearch=1&EID=1&LP=MAT&ETID=5&catid=1015&mdlx=Contains&Cond=All&SO=26&btnSearch=Search&units=imperial")
doc <- htmlParse(website$getPageSource())

## Read every <table> on the page; text columns stay character, not factor
tables <- readHTMLTable(doc, stringsAsFactors = FALSE)

## Tables 8, 10, ..., 56 are stacked to form the first four columns; from each
## table that directly follows one of them (9, 11, ..., 57) the first entry of
## its second column is taken as the fifth column.
header_idx <- seq(from = 8, to = 56, by = 2)
detail_idx <- header_idx + 1

data <- do.call("rbind", tables[header_idx])
auction <- sapply(tables[detail_idx], function(tbl) tbl[[2]][1])
data <- cbind(data, auction)

## Drop the row names inherited from the table list and label the columns
rownames(data) <- NULL
names(data) <- c("year.man.model", "s.n", "price", "location", "auction")
This will give you what you want for the first page (showing just the first two lines here):
# Preview the first two rows of the scraped data
head(data, n = 2)
year.man.model s.n price location auction
1 1972 AMERICAN 5530 GS14745W US $50,100 MI Auction: 1/9/2013; 4,796 Hours; ..
2 AUSTIN-WESTERN 307 307 US $3,400 MT Auction: 12/18/2013; AUSTIN-WESTERN track excavator.
To get all pages, just loop over the page numbers, substituting each page number i into the pg=i parameter of the address.