问题
I am using Rselenium to scrap following website: http://plovila.pomorstvo.hr/
Every time I have to enter 'NIB' field, execute and scrap all data. I am using Sys.time() function several time so my code is slow (cca 12 seconds for one NIB). I need to scrap around 200.000 NIB numbers which gives 30 days of scraping.
I am interested if I can open multiple browsers locally or somehow in the cloud and make my scraping script faster.
Is it possible to use parallel computing to overcome this issue? Do you have any suggestions?
EDIT: I am adding the code:
library(XML)
library(RCurl)
library(RSelenium)
library(png)
library(imager)
library(RMySQL)
library(htmltab)
library(jsonlite)
library(rvest)
# function for waiting instead Sys.sleep()
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]",
iterations = 5){
counter <- 0
chk <- FALSE
while(!chk & counter <= iterations){
wait <- tryCatch(
remDr$findElement(using = "xpath",
xpath_check)$getElementText(),
# remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(),
error = function(e) print(paste0("Trazi dalje"))
)
if(wait == "Trazi dalje" ){
Sys.sleep(1L)
counter <- sum(counter, 1)
}else{
chk <- TRUE
}
}
}
# Start Selenium Server
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()
# Simulate browser session and fill out form
remDr$navigate("http://plovila.pomorstvo.hr/")
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement()
remDr$screenshot(display = TRUE)
# Scrap !
df <- list()
Porivni_uredjaji <- list()
Clanovi_posade <- list()
Vlasnici <- list()
Korisnici <- list()
df_2 <- list()
Tereti <- list()
pocetak <- 100000
kraj <- 100003
system.time(
for (i in pocetak:kraj){
remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement()
Sys.sleep(1L)
remDr$findElement(using = "xpath",
"//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i),
key = "enter"))
waitLoad()
remDr$screenshot(display = TRUE)
doc <- htmlParse(remDr$getPageSource()[[1]])
Sys.sleep(1L)
Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue)
Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue)
NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue)
Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue)
LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue)
br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue)
br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue)
x <- i-pocetak + 1
if (length(NIB)==0){
Pozivni_znak <- NA
df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak)
df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
}else{
remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement()
waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5)
doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8")
Sys.sleep(1L)
list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue)
if (length(list_a) >= 1){
Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value")
json <- paste0("[", '"', Namjena, '"', "]")
Namjena <- fromJSON(json)
Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE)
colnames(Namjena) <- "Namjena"
Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value")
json <- paste0("[", '"', Vrsta_plovila, '"', "]")
Vrsta_plovila <- fromJSON(json)
Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE)
colnames(Vrsta_plovila) <- "Vrsta_plovila"
Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value")
json <- paste0("[", '"', Model_plovila, '"', "]")
Model_plovila <- fromJSON(json)
Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE)
colnames(Model_plovila) <- "Model_plovila"
Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value")
json <- paste0("[", '"', Duljina_trupa, '"', "]")
Duljina_trupa <- fromJSON(json)
Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE)
colnames(Duljina_trupa) <- "Duljina_trupa"
Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value")
json <- paste0("[", '"', Sirina_trupa, '"', "]")
Sirina_trupa <- fromJSON(json)
Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE)
colnames(Sirina_trupa) <- "Sirina_trupa"
Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value")
json <- paste0("[", '"', Visina_trupa, '"', "]")
Visina_trupa <- fromJSON(json)
Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE)
colnames(Visina_trupa) <- "Visina_trupa"
Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value")
json <- paste0("[", '"', Gaz, '"', "]")
Gaz <- fromJSON(json)
Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE)
colnames(Gaz) <- "Gaz"
Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value")
json <- paste0("[", '"', Nosivost, '"', "]")
Nosivost <- fromJSON(json)
Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE)
colnames(Nosivost) <- "Nosivost"
GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value")
json <- paste0("[", '"', GT, '"', "]")
GT <- fromJSON(json)
GT <- as.data.frame(GT, stringsAsFactors = FALSE)
colnames(GT) <- "GT"
Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value")
json <- paste0("[", '"', Snaga_motora, '"', "]")
Snaga_motora <- fromJSON(json)
Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE)
colnames(Snaga_motora) <- "Snaga_motora"
Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value")
Brodogradiliste <- gsub("\"", "'", Brodogradiliste)
json <- paste0("[", '"', Brodogradiliste, '"', "]")
Brodogradiliste <- fromJSON(json)
Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE)
Encoding(Brodogradiliste[,c(1)]) <- "UTF-8"
colnames(Brodogradiliste) <- "Brodogradiliste"
Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value")
json <- paste0("[", '"', Godina_gradnje, '"', "]")
Godina_gradnje <- fromJSON(json)
Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE)
colnames(Godina_gradnje) <- "Godina_gradnje"
Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value")
json <- paste0("[", '"', Materijal, '"', "]")
Materijal <- fromJSON(json)
Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE)
colnames(Materijal) <- "Materijal"
Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value")
json <- paste0("[", '"', Najveci_broj_osoba, '"', "]")
Najveci_broj_osoba <- fromJSON(json)
Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE)
colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba"
Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value")
json <- paste0("[", '"', Najveci_broj_putnika, '"', "]")
Najveci_broj_putnika <- fromJSON(json)
Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE)
colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika"
Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value")
json <- paste0("[", '"', Najmanji_broj_posade, '"', "]")
Najmanji_broj_posade <- fromJSON(json)
Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE)
colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade"
Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value")
json <- paste0("[", '"', Prethodna_oznaka, '"', "]")
Prethodna_oznaka <- fromJSON(json)
Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE)
colnames(Prethodna_oznaka) <- "Prethodna_oznaka"
Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value")
Prethodna_luka <- gsub("\"", "'", Prethodna_luka)
json <- paste0("[", '"', Prethodna_luka, '"', "]")
Prethodna_luka <- fromJSON(json)
Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE)
colnames(Prethodna_luka) <- "Prethodna_luka"
Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value")
json <- paste0("[", '"', Prethodna_drĹľava, '"', "]")
Prethodna_drĹľava <- fromJSON(json)
Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE)
colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava"
df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila,
Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT,
Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba,
Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka,
Prethodna_luka, Prethodna_drĹľava)
df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
df_2 <- readHTMLTable(doc)
Sys.sleep(2L)
Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB)
}}
}
)
# manipulate data after scraping
for (i in 1:length(df)){
if (length(df[[i]]) < 13){
df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26)
df[[i]] <- as.data.frame(df[[i]])
colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena",
"Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa",
"Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje",
"Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade",
"Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava")
}
}
df_final <- do.call(rbind, df)
df_final_1 <- df_final[!is.na(df_final$NIB), ]
EDIT 2 : I have a problem with above code you posted. If I run:
(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
clusterEvalQ(cl, {
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()
})
myTitles <- c()
ws <- foreach(x = 1:length(urls),
.packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl")) %dopar% {
remDr$navigate(urls[x])
Sys.sleep(3L)
remDr$getTitle()[[1]]
}
it returns an error
Error in { : task 1 failed - " Summary: UnknownError
Detail: An unknown server-side error occurred while processing the command.
Further Details: run errorDetails method"
回答1:
Maybe an issue with chrome:3.5.0 docker image. The following runs for me on win 10 with docker toolbox:
library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)
# using docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
# in windows
URLsPar <- c("https://stackoverflow.com/", "https://github.com/",
"http://www.bbc.com/", "http://www.google.com",
"https://www.r-project.org/", "https://cran.r-project.org",
"https://twitter.com/", "https://www.facebook.com/")
appHTML <- c()
(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
clusterEvalQ(cl, {
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L,
browserName = "chrome")
remDr$open()
})
ws <- foreach(x = 1:length(URLsPar),
.packages = c("rvest", "magrittr", "RSelenium")) %dopar% {
print(URLsPar[x])
remDr$navigate(URLsPar[x])
remDr$getTitle()[[1]]
}
> ws
[[1]]
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers"
[[2]]
[1] "The world's leading software development platform · GitHub"
[[3]]
[1] "BBC - Homepage"
[[4]]
[1] "Google"
[[5]]
[1] "R: The R Project for Statistical Computing"
[[6]]
[1] "The Comprehensive R Archive Network"
[[7]]
[1] "Twitter. It's what's happening."
[[8]]
[1] "Facebook - Log In or Sign Up"
# close browser on each node
clusterEvalQ(cl, {
remDr$close()
})
stopImplicitCluster()
来源:https://stackoverflow.com/questions/46339646/speed-up-web-scraping-using-multiplie-rselenium-browsers