using scrapy to scrape asp.net website with javascript buttons and ajax requests

后端 未结 2 1143
情话喂你
情话喂你 2021-02-04 21:01

I've been trying to scrape some data from an ASP.NET website; the start page should be the following one: http://www.e3050.com/Items.aspx?cat=SON

First, I want to displa

2条回答
  •  你的背包
    2021-02-04 21:43

    Check this out; here is an exact solution.

    The parse method selects 50 products per page.

    The page_rs_50 method handles the pagination.

    # URL(s) the crawl starts from.
    start_urls = [
        'http://www.e3050.com/Items.aspx?cat=SON',
    ]
    # All product URLs collected so far.
    # NOTE(review): declared at class level, so the list is shared by every
    # instance of the class — confirm only one instance runs per process.
    pro_urls = list()
    
    def parse(self, response):
        """Re-submit the form found in ``response`` asking for 50 items per page.

        Yields a single ``FormRequest`` built from the response's form, with
        the ``pagesddl`` field set to ``'50'`` and the ``sortddl`` field set
        to ``'Price(ASC)'``. The result is handled by ``page_rs_50``; the
        request meta seeds its pagination state (``curr``, ``total``, ``flag``).
        """
        form_fields = {
            'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pagesddl': '50',
            'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$sortddl': 'Price(ASC)',
        }
        # flag=True tells page_rs_50 that the page count still has to be read
        # from the response; total=0 is only a placeholder until then.
        pagination_state = {'curr': 1, 'total': 0, 'flag': True}
        first_page_request = FormRequest.from_response(
            response,
            formdata=form_fields,
            meta=pagination_state,
            dont_click=True,  # submit the form without simulating a button click
            callback=self.page_rs_50,
        )
        yield first_page_request
    
    def page_rs_50(self, response): # paginate the pages
        """Collect product links from one listing page, then move on.

        Pagination state is read from the request meta: ``curr`` (number of
        the page just received), ``total`` (page count) and ``flag`` (true
        when the page count still has to be read from this response).

        While ``curr`` is below ``total`` this yields a ``FormRequest`` that
        posts the pager form back, with this method as callback again. Once
        the last page is reached it yields one ``Request`` per collected
        product URL, handled by ``parse_product``.

        Raises:
            IndexError: if the ``hfVSFileName`` hidden input is missing from
                a page that still has a successor.
        """
        hxs = HtmlXPathSelector(response)
        meta = response.request.meta
        curr = int(meta['curr'])
        total = int(meta['total'])
        self.pro_urls.extend(hxs.select(
            "//td[@class='name']//a[contains(@id,'ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_itemslv_ctrl')]/@href"
        ).extract())
        if meta['flag']:
            # .re() returns strings, so the value must be converted before it
            # is compared with the integer ``curr``: int-vs-str ordering is
            # always True on Python 2 and a TypeError on Python 3.
            page_counts = hxs.select(
                "//span[@id='ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_lbl_pagesizeBtm']/text()").re(r'\d+')
            if page_counts:
                total = int(page_counts[0])
            # If the label holds no number, ``total`` keeps the value from
            # meta, so the listing is treated as having no further pages.
        if curr < total:
            curr += 1
            # Value of the hidden hfVSFileName input, echoed back in the post.
            vs_file_name = hxs.select(
                ".//input[@id='ctl00_ctl00_ContentPlaceHolder1_ItemListPlaceHolder_hfVSFileName']/@value"
            ).extract()[0]
            # NOTE(review): pager1$ctl00$ctl01 is presumably the pager's
            # "next page" control — confirm against the live page markup.
            yield FormRequest.from_response(response,
                formdata={'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pagesddl': '50',
                          'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$sortddl': 'Price(ASC)',
                          'ctl00$ctl00$ScriptManager1': 'ctl00$ctl00$ScriptManager1|ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pager1$ctl00$ctl01',
                          '__EVENTTARGET': 'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$pager1$ctl00$ctl01',
                          'ctl00$ctl00$ContentPlaceHolder1$ItemListPlaceHolder$hfVSFileName': vs_file_name},
                meta={'curr': curr, 'total': total, 'flag': False},
                dont_click=True,
                callback=self.page_rs_50
            )
        else:
            # Last page reached: visit every product collected along the way.
            for pro in self.pro_urls:
                yield Request("http://www.e3050.com/%s" % pro,
                    callback=self.parse_product)
    
    
    def parse_product(self, response):
        """Callback for a single product page; currently a stub.

        Extracts nothing and returns ``None``.
        """
        # TODO: implementation required for parsing.
        return None
    

提交回复
热议问题