Executing JavaScript submit form functions using Scrapy in Python

栀梦 2020-12-02 07:13

I am scraping a site using the Scrapy framework and having trouble clicking a JavaScript link to open another page.

I can identify the code on the page as:

3 Answers
  • 2020-12-02 07:29

    Check out the snippet below on how to use Scrapy with Selenium. Crawling will be slower because you aren't just downloading the HTML, but you will get full access to the DOM.

    Note: I have copy-pasted this snippet as the links previously provided no longer work.

    # Snippet imported from snippets.scrapy.org (which no longer works).
    # Note: this is Python 2-era code; scrapy.contrib and the Selenium RC
    # API ("from selenium import selenium") have both been deprecated for years.
    
    import time
    
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scrapy.item import Item
    
    from selenium import selenium
    
    class SeleniumSpider(CrawlSpider):
        name = "SeleniumSpider"
        start_urls = ["http://www.domain.com"]
    
        rules = (
            Rule(SgmlLinkExtractor(allow=(r'\.html', )),
                 callback='parse_page', follow=True),
        )
    
        def __init__(self):
            CrawlSpider.__init__(self)
            self.verificationErrors = []
            # Connect to a Selenium RC server running locally on port 4444
            self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
            self.selenium.start()
    
        def __del__(self):
            self.selenium.stop()
            print self.verificationErrors
    
        def parse_page(self, response):
            item = Item()
    
            hxs = HtmlXPathSelector(response)
            # Do some XPath selection with Scrapy
            hxs.select('//div').extract()
    
            sel = self.selenium
            sel.open(response.url)
    
            # Wait for JavaScript to load in Selenium
            time.sleep(2.5)
    
            # Do some crawling of JavaScript-created content with Selenium
            sel.get_text("//div")
            yield item
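
    The snippet above targets the long-gone Selenium RC API. As a rough modern equivalent (a sketch only, assuming Selenium 4.x with chromedriver on the PATH and a recent Scrapy; the spider name and URL are placeholders), the same idea looks like this:

    import scrapy
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    class JsSpider(scrapy.Spider):
        name = "js_spider"
        start_urls = ["http://www.domain.com"]

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.driver = webdriver.Chrome()  # assumes chromedriver is on PATH

        def parse(self, response):
            # Scrapy has already downloaded the raw HTML; Selenium re-opens
            # the page and executes its JavaScript so the full DOM is available.
            self.driver.get(response.url)
            self.driver.implicitly_wait(5)  # crude wait for JS to settle
            for div in self.driver.find_elements(By.XPATH, "//div"):
                yield {"text": div.text}

        def closed(self, reason):
            # Scrapy calls this when the spider finishes; shut the browser down.
            self.driver.quit()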
    
  • 2020-12-02 07:42

    If you want to see a fairly large, functional code base that uses Scrapy and Selenium together, check out https://github.com/nicodjimenez/bus_catchers. Here is a simpler example.

    # stripped-down BoltBus script (Python 2-era Scrapy and Selenium APIs)
    from selenium import webdriver
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import TextResponse
    import time
    
    # set origin, destination, and the calendar day indices to scrape
    cityOrigin = "Baltimore"
    cityDeparture = "New York"
    day_array = [0]
    browser = webdriver.Firefox()
    
    # loop through the days of the month, e.g. 15, 16, ..., 25
    # note: there is a discrepancy between the index of a calendar day and the
    # day itself; for example, day[10] may correspond to Feb 7th
    for day in day_array:
    
        # load the BoltBus home page
        browser.get("http://www.boltbus.com")
    
        # click on "region" tab
        elem_0=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstRegion_textBox")
        elem_0.click()
        time.sleep(5) 
    
        # select Northeast
        elem_1=browser.find_element_by_partial_link_text("Northeast")
        elem_1.click()
        time.sleep(5)
    
        # click on origin city
        elem_2=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstOrigin_textBox")
        elem_2.click()
        time.sleep(5)
    
        # select origin city
        elem_3=browser.find_element_by_partial_link_text(cityOrigin)
        elem_3.click()
        time.sleep(5)
    
        # click on destination city 
        elem_4=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_lstDestination_textBox")
        elem_4.click()
        time.sleep(5)
    
        # select destination city 
        elem_5=browser.find_element_by_partial_link_text(cityDeparture)
        elem_5.click()
        time.sleep(5)
    
        # click on travel date
        travel_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_imageE")
        travel_date_elem.click()    
    
        # gets day rows of table
        date_rows=browser.find_elements_by_class_name("daysrow") 
    
        # select actual day (use variable day)
        # NOTE: you must make sure these day elements are "clickable"
        days=date_rows[0].find_elements_by_xpath("..//td")
        days[day].click()
        time.sleep(3) 
    
        # retrieve actual departure date from browser
        depart_date_elem=browser.find_element_by_id("ctl00_cphM_forwardRouteUC_txtDepartureDate")
        depart_date=str(depart_date_elem.get_attribute("value"))
    
        # PARSE TABLE
    
        # grab the JavaScript-rendered page source from the browser
        text_html = browser.page_source.encode('utf-8')
        html_str = str(text_html)
    
        # this is a hack: instantiate a "TextResponse" object (from the Scrapy module) around the raw HTML
        resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)
    
        # feed the "TextResponse" to a Scrapy selector, which parses the raw HTML into an XPath document tree
        hxs = HtmlXPathSelector(resp_for_scrapy)
    
        # the | sign means "or"
        table_rows=hxs.select('//tr[@class="fareviewrow"] | //tr[@class="fareviewaltrow"]')
        row_ct=len(table_rows)
    
        for x in xrange(row_ct):
    
            cur_node_elements=table_rows[x]
            travel_price=cur_node_elements.select('.//td[@class="faresColumn0"]/text()').re("\d{1,3}\.\d\d")
    
            # I use a mixture of xpath selectors to get me to the right location in the document, and regular expressions to get the exact data
    
            # actual digits of time 
            depart_time_num=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("\d{1,2}\:\d\d")
    
            # AM or PM (time signature)
            depart_time_sig=cur_node_elements.select('.//td[@class="faresColumn1"]/text()').re("[AP][M]")
    
            # actual digits of time 
            arrive_time_num=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("\d{1,2}\:\d\d")
    
            # AM or PM (time signature)
            arrive_time_sig=cur_node_elements.select('.//td[@class="faresColumn2"]/text()').re("[AP][M]")
    
            print "Depart date: " + depart_date
            print "Depart time: " + depart_time_num[0] + " " + depart_time_sig[0]   
            print "Arrive time: " + arrive_time_num[0] + " " + arrive_time_sig[0]
            print "Cost: " + "$" + travel_price[0] 
            print "\n"
    
  • 2020-12-02 07:42

    As far as I know, the Scrapy crawler does not execute JavaScript: its downloader only fetches the raw HTML, so JS-driven links obviously don't work. For working with JS you can use Qt WebKit or Selenium, for example. Or you can find all the AJAX calls the page makes, see how the data exchange with the server is implemented, and send requests to the server API directly (see the sketch below).
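
    For that last approach, the usual workflow is to watch the browser's network tab, find the XHR/JSON endpoint the page calls, and request it straight from Scrapy, with no browser needed. A minimal sketch (the endpoint URL and JSON field names here are hypothetical):

    import json
    import scrapy

    class ApiSpider(scrapy.Spider):
        name = "api_spider"
        # hypothetical XHR endpoint discovered in the browser's network tab
        start_urls = ["http://www.domain.com/api/items?page=1"]

        def parse(self, response):
            data = json.loads(response.text)
            for item in data["items"]:  # hypothetical JSON structure
                yield {"title": item["title"]}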
