Multiple nested requests with Scrapy

后端 未结 1 1269
爱一瞬间的悲伤
爱一瞬间的悲伤 2021-01-14 19:03

I am trying to scrape some airplane schedule information from the www.flightradar24.com website for a research project.

The hierarchy of the JSON file I want to obtain is something like

1条回答
  •  隐瞒了意图╮
    2021-01-14 19:33

    The issue is that you fork your item: according to your logic you only want one item per country, so you can't yield multiple items at any point after parsing the country. What you want to do instead is stack all of them into one item.
    To do that you need to create a parsing loop:

    def parse_airports(self, response):
        """Collect every airport on the page into the country item.

        The item is read from ``response.meta['my_country_item']`` and gets
        an ``'airports'`` list holding one dict per ``<a data-iata>`` element
        found in the response.  A schedule URL is then built for each airport
        and the requests are chained one after another through
        ``parse_schedule`` so that a single item is produced at the end.

        Returns:
            The item itself when the page lists no airports, otherwise the
            first ``Request`` of the schedule chain.  Its meta carries the
            item (``'airport_item'``), the URLs still to be fetched
            (``'airport_urls'``) and the index of the airport the response
            belongs to (``'i'``).
        """
        item = response.meta['my_country_item']
        item['airports'] = []

        for airport in response.xpath('//a[@data-iata]'):
            # extract_first() gives None instead of raising IndexError when
            # an attribute or the <small> node is missing on an anchor.
            item['airports'].append({
                'name': ''.join(airport.xpath('./text()').extract()).strip(),
                'link': airport.xpath('./@href').extract_first(),
                'lat': airport.xpath('./@data-lat').extract_first(),
                'lon': airport.xpath('./@data-lon').extract_first(),
                'code_little': airport.xpath('./@data-iata').extract_first(),
                'code_total': airport.xpath('./small/text()').extract_first(),
            })

        # No backslashes before the brackets: those are shell escapes and
        # would end up as literal characters inside the query string.
        url_template = (
            'https://api.flightradar24.com/common/v1/airport.json'
            '?code={code}&plugin[]=&plugin-setting[schedule][mode]='
            '&plugin-setting[schedule][timestamp]={timestamp}'
            '&page=1&limit=50&token='
        )
        urls = [
            url_template.format(code=airport['code_little'],
                                timestamp="1484150483")
            for airport in item['airports']
        ]
        if not urls:
            return item

        # The chain takes URLs with pop() (from the end) while the airport
        # index counts up from 0, so store the URLs in reverse: the n-th pop
        # then yields the URL of item['airports'][n].
        urls.reverse()
        next_url = urls.pop()
        return Request(next_url, self.parse_schedule,
                       meta={'airport_item': item, 'airport_urls': urls, 'i': 0})
    
    def parse_schedule(self, response):
        """Record the schedule of one airport, then move on to the next one.

        Reads from ``response.meta``: the shared item (``'airport_item'``),
        the index of the airport this response belongs to (``'i'``) and the
        list of schedule URLs still to fetch (``'airport_urls'``).

        Yields:
            The completed item once no URLs remain; otherwise a ``Request``
            for the next URL (taken from the end of the list) with the index
            advanced by one.

        Raises:
            ValueError: if the response body is not valid JSON.
        """
        item = response.meta['airport_item']
        i = response.meta['i']
        urls = response.meta['airport_urls']

        # response.text replaces the deprecated body_as_unicode().  The
        # decoded payload is not stored yet ('foobar' is a placeholder), but
        # decoding still makes a malformed body fail here, on the airport it
        # belongs to.
        json.loads(response.text)
        item['airports'][i]['schedule'] = 'foobar'

        if not urls:
            # Last airport handled: emit the single, fully populated item.
            yield item
            return

        url = urls.pop()
        yield Request(url, self.parse_schedule,
                      meta={'airport_item': item, 'airport_urls': urls, 'i': i + 1})
    

    0 讨论(0)
提交回复
热议问题