Looping in RSelenium and Scraping

社会主义新天地 提交于 2019-12-25 01:34:54

问题


I'm trying to scrape data from a website using RSelenium. I am able to navigate through the drop-downs individually, but when I run them in a loop I get an error.

Also, after selecting all the values in the drop-downs, I want to store the name of the facility and its contact details in a table, which I have not been able to do so far.

# Session preamble: clear the global environment, switch to the data
# directory, and print the working directory that is now in effect.
# NOTE(review): rm(list = ls()) only removes objects from the current
# environment; loaded packages and options are left as they were, so it does
# not give a truly fresh session.
rm(list=ls())
# NOTE(review): machine-specific absolute path; the script fails here on any
# machine where this directory does not exist.
setwd("D:\\work_codes\\kvk\\data")
getwd()

library(RSelenium)
library(rvest)
library(XML)
library(RCurl)
library(magrittr)
library(stringr)

# Launch a Selenium server with a browser and keep the browser client handle.
rd <- rsDriver()
remDr <- rd$client

# Open the facilities page and reload it once.
remDr$navigate("https://kvk.icar.gov.in/facilities_list.aspx")
remDr$refresh()

# Read the state drop-down: its visible text holds one option per line.
# The ' --Select--' placeholder line is dropped, then leading whitespace is
# removed from the remaining state names.
stateEle <- remDr$findElement(using = "id", value = "ContentPlaceHolder1_ddlState")
states <- str_trim(
  setdiff(
    unlist(strsplit(stateEle$getElementText()[[1]], '\\n')),
    ' --Select--'
  ),
  'left'
)
stateEle$clickElement()

# Walk every state -> district -> KVK combination on the facilities page and
# collect the facility name and contact details of each into `facility_df`.
#
# Changes from the earlier version of this loop:
#   * Every drop-down is looked up again right before it is used. A handle
#     obtained before remDr$refresh() points at the old DOM, and using it
#     afterwards fails with a stale-element error.
#   * The innermost loop selects the KVK of the current pass instead of
#     always selecting the first one.
#   * "Contact details:" is visible text, not a CSS class, so it cannot be
#     used with the "class name" locator.
#   * The scraped values are accumulated and bound into a data frame.
#   * Loops iterate over the values themselves, so an empty drop-down is
#     skipped rather than visited with indices 1 and 0.

state_id <- "ContentPlaceHolder1_ddlState"
district_id <- "ContentPlaceHolder1_ddlDistrict"
kvk_id <- "ContentPlaceHolder1_ddlKvk"

# Return the option labels currently shown by the drop-down with this id,
# without the "--Select--" placeholder. The element is located on every call
# so the handle always belongs to the page as it is now.
read_options <- function(dropdown_id) {
  dropdown <- remDr$findElement("id", dropdown_id)
  labels <- unlist(strsplit(dropdown$getElementText()[[1]], "\\n"))
  # Trim before removing the placeholder so it is dropped with or without
  # its leading space.
  labels <- str_trim(labels, "left")
  labels <- setdiff(labels, "--Select--")
  labels[nzchar(labels)]
}

# Choose `label` in the drop-down with this id by typing it into the control.
select_option <- function(dropdown_id, label) {
  dropdown <- remDr$findElement("id", dropdown_id)
  dropdown$clickElement()
  dropdown$sendKeysToElement(list(label))
  dropdown$clickElement()
}

results <- list()

for (state_label in states) {
  remDr$refresh()
  select_option(state_id, state_label)

  for (district_label in read_options(district_id)) {
    select_option(district_id, district_label)

    for (kvk_label in read_options(kvk_id)) {
      # NOTE(review): assumes the state and district selections are still in
      # place after the previous submit - confirm on the live page.
      select_option(kvk_id, kvk_label)
      remDr$findElement("id", "ContentPlaceHolder1_btnSubmit")$clickElement()

      # findElements() returns an empty list when nothing matches, so a KVK
      # without a facility entry does not abort the whole run.
      name_nodes <- remDr$findElements(
        "id", "ContentPlaceHolder1_rptfacility_f_name_1"
      )
      facility_name <- NA_character_
      contact_details <- NA_character_
      if (length(name_nodes) > 0) {
        facility_name <- name_nodes[[1]]$getElementText()[[1]]
        name_nodes[[1]]$clickElement()
        # NOTE(review): ".panel-body" is assumed to be the container that
        # holds the contact details - confirm against the live page.
        panels <- remDr$findElements("css selector", ".panel-body")
        panel_text <- vapply(
          panels,
          function(panel) panel$getElementText()[[1]],
          character(1)
        )
        contact_details <- paste(panel_text, collapse = "\n")
      }

      results[[length(results) + 1L]] <- data.frame(
        STATE = state_label,
        DISTRICT = district_label,
        KVK = kvk_label,
        FACILITY = facility_name,
        CONTACT_DETAILS = contact_details,
        stringsAsFactors = FALSE
      )
    }
  }
}

# One row per state/district/KVK combination (NULL when nothing was scraped).
facility_df <- do.call(rbind, results)

回答1:


library(rvest)
# Open a browsing session on the facilities page and grab its first form.
url <- "https://kvk.icar.gov.in/facilities_list.aspx"
page <- html_session(url)
form <- html_form(page)[[1]]

# State drop-down options: `states` holds the submitted values and
# `states_name` the displayed labels. The first option is dropped from both.
states <- html_attr(
  html_nodes(page, css = "#ContentPlaceHolder1_ddlState > option"),
  "value"
)[-1]
states_name <- html_text(
  html_nodes(page, css = "#ContentPlaceHolder1_ddlState > option")
)[-1]

# Submit the form for every state -> district -> KVK combination and collect
# the contact text of each result page.
#
# `final_df` is a list of data frames, one appended per combination, and
# `output_df` is all of them bound together.
#
# Changes from the earlier version of this loop:
#   * Results are appended to a list. The old index i * j * k is not unique
#     (for example 1 * 2 * 1 and 2 * 1 * 1 are both 2), so later combinations
#     overwrote earlier ones.
#   * The pause uses Sys.sleep(); R has no sleep() function.
#   * seq_along() replaces 1:length(), so an empty drop-down is skipped
#     instead of being visited with indices 1 and 0.
#   * The KVK column stores the displayed label (kvk_name), matching how
#     STATE and DISTRICT are filled; kvk_name was computed but never used.
final_df <- list()

#### STATES LOOP ####
for (i in seq_along(states)) {
  filled_form <- set_values(
    form,
    "ctl00$ContentPlaceHolder1$ddlState" = states[i]
  )
  page1 <- submit_form(page, filled_form)

  # Values and labels of the district options; the first option is dropped.
  district_options <- html_nodes(
    page1, css = "#ContentPlaceHolder1_ddlDistrict > option"
  )
  district <- html_attr(district_options, "value")[-1]
  district_name <- html_text(district_options)[-1]

  #### DISTRICT LOOP ####
  for (j in seq_along(district)) {
    filled_form1 <- set_values(
      html_form(page1)[[1]],
      "ctl00$ContentPlaceHolder1$ddlState" = states[i],
      "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j]
    )
    page2 <- submit_form(page1, filled_form1)

    # Values and labels of the KVK options; the first option is dropped.
    kvk_options <- html_nodes(
      page2, css = "#ContentPlaceHolder1_ddlKvk > option"
    )
    kvk <- html_attr(kvk_options, "value")[-1]
    kvk_name <- html_text(kvk_options)[-1]

    #### KVK LOOP ####
    for (k in seq_along(kvk)) {
      filled_form2 <- set_values(
        html_form(page2)[[1]],
        "ctl00$ContentPlaceHolder1$ddlState" = states[i],
        "ctl00$ContentPlaceHolder1$ddlDistrict" = district[j],
        "ctl00$ContentPlaceHolder1$ddlKvk" = kvk[k]
      )
      page3 <- submit_form(page2, filled_form2)

      # Text of every .panel-body node with line breaks stripped; fall back
      # to an empty string so the combination still yields a row.
      contact_text <- gsub(
        "[\r\n]", "",
        html_text(html_nodes(page3, css = ".panel-body"))
      )
      if (length(contact_text) == 0) {
        contact_text <- ""
      }

      df <- data.frame(
        STATE = states_name[i],
        DISTRICT = district_name[j],
        KVK = kvk_name[k],
        CONTACT_TEXT = contact_text,
        stringsAsFactors = FALSE
      )
      final_df[[length(final_df) + 1L]] <- df

      ### WAITTIME TO AVOID HTTP 500 error - So the server is not overloaded
      Sys.sleep(5)
    }
  }
}


output_df <- data.table::rbindlist(final_df, fill = TRUE)

# After this perform some string operations to extract the exact information required from the CONTACT_TEXT variable

The above answer does not use the RSelenium package at all, and I think this approach is more reliable than RSelenium.



来源:https://stackoverflow.com/questions/53943923/looping-in-rselenium-and-scraping

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!