How to navigate through JS/AJAX-based pagination while scraping a website?

Submitted by 我的未来我决定 on 2020-04-17 21:53:57

Question


My code works fine for the first page of each category, but I want to scrape all of the pages in each category. I'm not able to navigate to the next pages: the website uses AJAX to populate the data when I click the Next button.

I have also looked at the AJAX request the website makes to populate the data dynamically (this is the URL that shows up in the network tab when I click the next-page button: https://www.couponcodesme.com/ae/category/searchfilter), but I didn't find a way to mock that request manually with Scrapy. If it's possible to mock that AJAX request, please let me know how to do it for this particular problem. You're also welcome to suggest solutions other than Scrapy-Splash!
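From what I understand, replaying that endpoint directly with Scrapy would look roughly like the sketch below, but I don't know the exact payload the endpoint expects; the field names here ('page', 'category') are only guesses and would need to be replaced with the real request body from the network tab:

import scrapy

class SearchFilterSketchSpider(scrapy.Spider):
    # Untested sketch: replay the pagination endpoint directly instead of
    # rendering the page in a browser. The formdata keys are hypothetical.
    name = 'searchfilter_sketch'

    def start_requests(self):
        yield scrapy.FormRequest(
            url='https://www.couponcodesme.com/ae/category/searchfilter',
            formdata={'page': '2', 'category': 'fashion'},  # guessed payload
            callback=self.parse_fragment,
        )

    def parse_fragment(self, response):
        # The endpoint may return an HTML fragment or JSON; inspect the raw
        # response first, then parse with response.css() or json.loads().
        self.logger.info('received %d bytes', len(response.body))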

I have searched all over Stack Overflow but didn't find a proper solution to this problem. Please look into it and help me.

Thank You

import scrapy
from scrapy import Request
from ..items import CouponcollectItem
from scrapy_splash import SplashRequest

class Couponsite5SpiderSpider(scrapy.Spider):
    name = 'couponSite5_spider'
    allowed_domains = ['www.couponcodesme.com']

    script = '''
        -- Load the page, let it render, click the "Next" pagination button
        -- (this relies on jQuery being loaded on the page), wait again, and
        -- return the resulting HTML.
        function main(splash, args)
            local url = splash.args.url
            assert(splash:go(url))
            assert(splash:wait(5))
            assert(splash:runjs("$('a.category_pagination_btn.next_btn.top-page-button').click()"))
            assert(splash:wait(5))
            return {
                html = splash:html()
            }
        end
    '''

    def start_requests(self):
        yield Request(
            url="https://www.couponcodesme.com/ae/categories",
            callback=self.parse
        )

    def parse(self, response):
        # Render each category page through Splash, running the Lua
        # pagination script defined above.
        urls = response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()
        for url in urls:
            yield SplashRequest(
                url=url, 
                callback=self.parse_params,
                endpoint="execute",
                args={
                    'wait': 1,
                    'lua_source': self.script
                }
            )

    def parse_params(self, response):   
        items = CouponcollectItem()
        coupon_category = response.xpath('//div[@class="head_main"]/h1[@class="h2_title"]/text()').extract()
        coupon_lists = response.css('#temp1')

        # Each coupon card is a direct child <div> of the #temp1 container.
        for coupon in coupon_lists.xpath('div'):
            coupon_title = coupon.xpath('normalize-space(.//h3/a/text())').extract()
            coupon_store_name = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/@href)').extract()
            store_img_src = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/img/@data-src)').extract()
            coupon_code_txt = coupon.xpath('normalize-space(.//span[@class="offer_code"]/span/text())').extract()
            coupon_store_out = coupon.xpath('.//button/@data-alt').extract()

            items['coupon_title'] = [self.deEmojify(coupon_title[0]) if len(coupon_title) != 0 else '']
            items['coupon_code_txt'] = [coupon_code_txt[0] if len(coupon_code_txt) != 0 else '']

            items['coupon_store_out'] = [coupon_store_out[0] if len(coupon_store_out) != 0 else '']
            items['store_img_src'] = [store_img_src[0] if len(store_img_src) != 0 else '']
            items['website_link'] = [response.request.url]

            if len(coupon_category) != 0:
                if coupon_category[0].endswith(' Coupons'):
                    items['coupon_category'] = [self.deEmojify(coupon_category[0][:-8])]
                else:
                    items['coupon_category'] = [self.deEmojify(coupon_category[0])]
            else:
                items['coupon_category'] = ['']

            if len(coupon_store_name) != 0:
                if coupon_store_name[0].endswith(' Coupons'):
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0][:-8])]
                elif coupon_store_name[0].startswith('https://'):
                    items['coupon_store_name'] = [coupon_store_name[0].split('/')[-1]]
                else:
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0])]
            else:
                items['coupon_store_name'] = [''] 

            yield items

    def deEmojify(self, inputString):
        # Drop emoji and any other non-ASCII characters from the string.
        return inputString.encode('ascii', 'ignore').decode('ascii')
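Another idea I've been considering, if I stay with Splash, is to loop inside the Lua script and return a snapshot of every page from a single render. This is an untested sketch, and it assumes the Next button keeps matching the same jQuery selector until the last page:

script_all_pages = '''
    -- Sketch: walk through up to args.max_pages pages and collect the
    -- HTML after each click of the "Next" button.
    function main(splash, args)
        assert(splash:go(splash.args.url))
        assert(splash:wait(5))
        local pages = {splash:html()}
        for i = 2, args.max_pages do
            local ok = splash:runjs(
                "$('a.category_pagination_btn.next_btn.top-page-button').click()")
            if not ok then
                break  -- button gone or JS error: assume last page reached
            end
            assert(splash:wait(5))
            table.insert(pages, splash:html())
        end
        return {pages = pages}
    end
'''

The SplashRequest would then pass args={'lua_source': script_all_pages, 'max_pages': 5}, and parse_params could build a scrapy.Selector(text=...) from each entry of response.data['pages']. But I'm not sure this is the right approach either.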

Source: https://stackoverflow.com/questions/60375046/how-to-navigate-through-js-ajax-based-pagination-while-scraping-a-website
