I am trying to scrape some data from https://www.officialcharts.com/ by parallelising web requests using asyncio/aiohttp. I implemented the code given at the link here.
Try to use the latest version.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads
# Maximum number of requests allowed to run at once; scrape() passes this
# value to the Semaphore that scrape_bounded() acquires around each request.
limit = 10
# HTTP status codes that scrape_one() treats as a successful response.
http_ok = [200]
async def scrape(url_list):
    """Request every URL in ``url_list`` concurrently and collect the results.

    A single ``ClientSession`` is shared by all requests, and a semaphore
    sized by the module-level ``limit`` caps how many are in flight at once.

    Args:
        url_list: Iterable of URLs to fetch.

    Returns:
        The list produced by ``gather``: one entry per URL, each being
        whatever ``scrape_bounded`` returned for it.
    """
    gate = Semaphore(limit)
    async with ClientSession() as session:
        # Schedule every request up front; the semaphore inside
        # scrape_bounded() is what actually throttles them.
        pending = [
            ensure_future(scrape_bounded(url, gate, session))
            for url in url_list
        ]
        # Await inside the context so the session stays open until all
        # requests have finished.
        return await gather(*pending)
async def scrape_bounded(url, sem, session):
    """Run ``scrape_one`` for ``url`` while holding ``sem``.

    Args:
        url: Address forwarded to ``scrape_one``.
        sem: Async context manager (the semaphore created in ``scrape``)
            that limits how many requests run simultaneously.
        session: HTTP session forwarded to ``scrape_one``.

    Returns:
        Whatever ``scrape_one`` returns for this URL.
    """
    async with sem:
        outcome = await scrape_one(url, session)
    return outcome
async def scrape_one(url, session):
    """Fetch ``url`` through ``session`` and return its body parsed as JSON.

    Args:
        url: Address to request with HTTP GET.
        session: Object whose ``get(url)`` yields, as an async context
            manager, a response exposing ``read()`` and ``status`` (the
            aiohttp ``ClientSession`` created in ``scrape``).

    Returns:
        The decoded JSON document on success, or ``False`` when the
        connection fails, the status code is not listed in ``http_ok``,
        or the body is not valid UTF-8 encoded JSON. A message describing
        the failure is printed in each of those cases.
    """
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        # print() does not interpolate %-style templates, so the message
        # is built explicitly.
        print(f'Scraping {url} failed due to the connection problem')
        return False

    # The body was already read above; only the status is inspected here.
    if response.status not in http_ok:
        print(
            f'Scraping {url} failed due to the return code {response.status}'
        )
        return False

    try:
        return loads(content.decode('UTF-8'))
    except ValueError:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError
        # subclasses; report them like the other failures instead of letting
        # one bad body abort the whole gather() in scrape().
        print(f'Scraping {url} failed because the body is not valid JSON')
        return False
if __name__ == '__main__':
    # Demo entry point: fetch the two echo endpoints concurrently and
    # pretty-print the combined result as JSON.
    urls = [
        'http://demin.co/echo1/',
        'http://demin.co/echo2/',
    ]
    res = run(scrape(urls))
    pretty = dumps(res, indent=4)
    print(pretty)
This is a template taken from a real project, and it works as expected.
You can find this source code here