I am trying to scrape some airplane schedule information from the www.flightradar24.com website for a research project.
The hierarchy of the JSON file I want to obtain is something like this:
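(a rough sketch — one item per country, each country carrying its airports and each airport its schedule; the field names follow the code below)

{
    "...country fields...": "...",
    "airports": [
        {
            "name": "...",
            "link": "...",
            "lat": "...",
            "lon": "...",
            "code_little": "...",
            "code_total": "...",
            "schedule": {...}
        },
        ...
    ]
}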
The issue is that you fork your item: according to your logic you only want one item per country, so you can't yield multiple items at any point after parsing the country. What you want to do instead is stack all of the airports into that one item.
To do that you need to create a parsing loop:
# at the top of your spider file you will need:
#   import json
#   from scrapy import Request

def parse_airports(self, response):
    item = response.meta['my_country_item']
    item['airports'] = []

    # collect every airport of this country into the same item
    for airport in response.xpath('//a[@data-iata]'):
        url = airport.xpath('./@href').extract()
        iata = airport.xpath('./@data-iata').extract()
        iatabis = airport.xpath('./small/text()').extract()
        name = ''.join(airport.xpath('./text()').extract()).strip()
        lat = airport.xpath('./@data-lat').extract()
        lon = airport.xpath('./@data-lon').extract()

        iAirport = dict()
        iAirport['name'] = name
        iAirport['link'] = url[0]
        iAirport['lat'] = lat[0]
        iAirport['lon'] = lon[0]
        iAirport['code_little'] = iata[0]
        iAirport['code_total'] = iatabis[0]
        item['airports'].append(iAirport)

    # build one schedule API url per airport
    urls = []
    for airport in item['airports']:
        json_url = ('https://api.flightradar24.com/common/v1/airport.json'
                    '?code={code}&plugin[]=&plugin-setting[schedule][mode]='
                    '&plugin-setting[schedule][timestamp]={timestamp}'
                    '&page=1&limit=50&token=').format(
                        code=airport['code_little'], timestamp="1484150483")
        urls.append(json_url)
    if not urls:
        return item

    # start with the first url; the rest travel along in meta
    next_url = urls.pop()
    return Request(next_url, self.parse_schedule,
                   meta={'airport_item': item, 'airport_urls': urls, 'i': 0})
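parse_schedule then pops one url at a time, so the schedule requests run as a chain and the same item keeps growing until the last response comes back: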
def parse_schedule(self, response):
    """Loop over every schedule url, attaching each response to its airport."""
    item = response.meta['airport_item']
    i = response.meta['i']
    urls = response.meta['airport_urls']

    jsonload = json.loads(response.body_as_unicode())
    # attach the parsed schedule json to the matching airport
    # (keep only the fields you actually need here)
    item['airports'][i]['schedule'] = jsonload

    # no urls left -> the item is complete, yield it
    if not urls:
        yield item
        return

    # otherwise request the next schedule and keep passing the item along
    url = urls.pop()
    yield Request(url, self.parse_schedule,
                  meta={'airport_item': item, 'airport_urls': urls, 'i': i + 1})
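For completeness: parse_airports expects the country item to already be in response.meta['my_country_item'], so the callback before it has to create the item and pass it along. A minimal sketch, assuming a hypothetical parse_countries callback, a plain dict as the item and placeholder selectors (adapt both to your spider):

def parse_countries(self, response):
    # hypothetical earlier step: one item (and one request) per country
    for country in response.xpath('//a[@data-country]'):  # placeholder selector
        item = {}  # or your scrapy.Item subclass
        item['country'] = country.xpath('./text()').extract_first()
        url = response.urljoin(country.xpath('./@href').extract_first())
        yield Request(url, self.parse_airports,
                      meta={'my_country_item': item})

The key point is that the same item object travels through every request via meta and is only yielded once, fully assembled, at the end of parse_schedule.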