How to extract data from dynamic websites like Flipkart using Selenium and Scrapy?


Question


Flipkart.com shows only 15 to 20 results on the first page, and more results load when you scroll down. Scrapy extracts the first page's results successfully, but not the ones loaded after it. I tried using Selenium for this but couldn't get it to work. Here is my code:

from scrapy.spider import Spider
from scrapy.selector import Selector
from flipkart.items import FlipkartItem
from scrapy.spider import BaseSpider
from selenium import webdriver

class FlipkartSpider(BaseSpider):
    name = "flip1"
    allowed_domains = ["flipkart.com"]
    start_urls = [
        "http://www.flipkart.com/beauty-and-personal-care/personal-care-appliances/hair-dryers/pr?sid=t06,79s,mh8&otracker=nmenu_sub_electronics_0_Hair%20Dryers"
]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        sel = Selector(response)
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
            try:

                sites = sel.select('//div[@class="gd-col gu12 browse-product fk-inf-scroll-item"] | //div[@class="pu-details lastUnit"]')
                for site in sites:
                    item = FlipkartItem()
                    item['title'] = site.select('div//a[@class="lu-title"]/text() | div[1]/a/text()').extract()
                    item['price'] = site.select('div//div[@class="pu-price"]/div/text() | div//div[@class="pu-final"]/span/text()').extract()
                    yield item
                next.wait_for_page_to_load("30")
            except:
                break
            self.driver.close()

My items.py is:

import scrapy
class FlipkartItem(scrapy.Item):
    title=scrapy.Field()
    price=scrapy.Field()

The output I get contains only the first 15 items:

[{"price": ["Rs. 599"], "title": ["\n Citron Elegant 1400 W HD001 Hair Dryer (Pink)\n "]},

{"price": ["Rs. 799"], "title": ["\n Citron Vogue 1800 W HD002 Hair Dryer (White)\n "]},

{"price": ["Rs. 645"], "title": ["\n Philips HP8100/00 Hair Dryer (Blue)\n "]},

{"price": ["Rs. 944"], "title": ["\n Philips HP8111/00 Hair Dryer\n "]},

{"price": ["Rs. 171"], "title": ["\n Nova Professional With 2 Speed NV-1290 Hair Dryer (Pink...\n "]},

{"price": ["Rs. 175"], "title": ["\n Nova NHD 2840 Hair Dryer\n "]},

{"price": ["Rs. 775"], "title": ["\n Philips HP 8112 Hair Dryer\n "]},

{"price": ["Rs. 1,925"], "title": ["\n Philips HP8643/00 Miss Fresher's Pack Hair Straightener...\n "]},

{"price": ["Rs. 144"], "title": ["\n Nova Foldable N-658 Hair Dryer (White, Pink)\n "]},

{"price": ["Rs. 1,055"], "title": ["\n Philips HP8100/46 Hair Dryer\n "]},

{"price": ["Rs. 849"], "title": ["\n Panasonic EH-ND12-P62B Hair Dryer (Pink)\n "]},

{"price": ["Rs. 760"], "title": ["\n Panasonic EH-ND11 Hair Dryer (White)\n "]},

{"price": ["Rs. 1,049"], "title": ["\n Panasonic EH-ND13-V Hair Dryer (Violet)\n "]},

{"price": ["Rs. 1,554"], "title": ["\n Philips 1600 W HP4940 Hair Dryer (White & Light Pink)\n "]},

{"price": ["Rs. 2,008"], "title": ["\n Philips Kerashine HP8216/00 Hair Dryer\n "]}]

Answer 1:


I managed it differently. See my code below for reference; it works fine for the complete site.

import time

from scrapy.spider import BaseSpider
from scrapy.http import TextResponse
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

from flipkart.items import FlipkartItem


class FlipkartSpider(BaseSpider):
    name = "flip1"
    allowed_domains = ["flipkart.com"]
    start_urls = [
        "http://www.flipkart.com/tablets/pr?sid=tyy%2Chry&q=mobile&ref=b8b64676-065a-445c-a6a1-bc964d5ff938"
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        block = "block"
        # Keep scrolling to the bottom and clicking the "show more results"
        # button until the "no more results" marker becomes visible.
        while True:
            self.driver.execute_script("window.scrollTo(10000000, 10000000)")
            self.driver.set_page_load_timeout(10000)
            try:
                show = self.driver.find_element_by_xpath('//div[@id="show-more-results"]').value_of_css_property('display')
                if show == block:
                    self.driver.find_element_by_xpath('//div[@id="show-more-results"]').click()
                no_more = self.driver.find_element_by_xpath('//*[@id="no-more-results" and @class="dont-show"]').value_of_css_property('display')
                if no_more == block:
                    break
                time.sleep(5)  # give the AJAX request time to load more items
                self.driver.execute_script("window.scrollTo(10000000, 10000000)")
                self.driver.set_page_load_timeout(10000)
            except NoSuchElementException:
                break
        # Wrap the fully loaded page source in a Scrapy response so the usual
        # XPath selectors can be applied to it.
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        sites = response.xpath('//div[@class="gd-col gu12 browse-product fk-inf-scroll-item"] | //div[@class="pu-details lastUnit"] | //div[@class="pu-visual-section"]')
        for site in sites:
            item = FlipkartItem()
            item['title'] = site.xpath('div//a[@class="lu-title"]/text() | div[1]/a/text()').extract()
            item['price'] = site.xpath('div//div[@class="pu-price"]/div/text() | div//div[@class="pu-final"]/span/text()').extract()
            item['rating'] = site.xpath('div[@class="pu-rating"]/div/@title').extract()
            item['image'] = site.xpath('a/img/@src').extract()
            item['link'] = site.xpath('a/@href').extract()
            yield item
        self.driver.close()
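
With the spider in place, it can be run like any other Scrapy spider and the items exported in one step, for example (assuming the project is named flipkart, as in the imports above; note that FlipkartItem must also declare the extra rating, image, and link fields used here):

scrapy crawl flip1 -o items.json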



Answer 2:


You have to force the webdriver to load more results. In order to interact with the other results, the webdriver needs to scroll the page until those elements appear.

The code for scrolling to a given vertical position (using the location obtained below) is:

driver.execute_script("window.scrollTo(0, %d);" % location['y'])

To decide where to scroll, you can find an element in the lower part of the page (for example the footer) and keep scrolling to it. To get the coordinates of the element you can use the WebElement property location:

driver = webdriver.Firefox()
down = driver.find_element_by_xpath("//someXpath")
location = down.location
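
Putting these pieces together, a minimal sketch of the scroll loop might look like this (the //footer XPath and the fixed iteration count are assumptions for illustration; substitute whatever bottom-of-page element the site actually has):

import time
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.flipkart.com/tablets/pr?sid=tyy%2Chry&q=mobile")  # example listing page

for _ in range(10):  # scroll a bounded number of times
    # Hypothetical XPath: any element that sits near the bottom of the page works.
    down = driver.find_element_by_xpath('//footer')
    location = down.location  # dict with the element's 'x' and 'y' pixel coordinates
    # Scroll the window to the element's vertical position to trigger lazy loading.
    driver.execute_script("window.scrollTo(0, %d);" % location['y'])
    time.sleep(2)  # wait for the AJAX-loaded results to render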



Answer 3:


You can use JavaScript to scroll down the page.

The following code scrolls the page by 10000 pixels in both the x and y directions. Since 10000 is a large number, it takes you to the bottom of the page; once you are there, Flipkart fires an AJAX request to load more items.

window.scrollBy(10000,10000);

I am not sure how to do that in Scrapy alone, but with Selenium it is easy.

Here is the code (using the Selenium Java bindings):

((JavascriptExecutor) driver).executeScript("window.scrollBy(10000,10000);");
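
For the Python bindings used elsewhere in this question, the equivalent call is simply:

# Python equivalent of the Java snippet above; execute_script runs the
# given JavaScript in the context of the current page.
driver.execute_script("window.scrollBy(10000, 10000);")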


Source: https://stackoverflow.com/questions/28122882/how-to-extract-data-from-dynamic-websites-like-flipkart-using-selenium-and-scrap
