asyncio官网
https://docs.python.org/zh-cn/3/library/asyncio-task.html
下面为伪代码:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd
# Scraped rows are stored in the list ``li`` (a database could be used instead).
# Fetch the page (this heading describes fetch(), defined right after ``li``).
li=[]
async def fetch(url, session):
    """Request one page and return its body as text.

    Args:
        url: Address of the page to request.
        session: An open client session whose ``get(url)`` returns an async
            context manager yielding the response (e.g. an
            ``aiohttp.ClientSession``).

    Returns:
        The response body decoded to ``str``.
    """
    async with session.get(url) as response:
        # ``text`` is a coroutine method: it has to be called and the call
        # awaited. Awaiting the bare attribute raises TypeError.
        return await response.text()
# Parse the page
async def parse(html):
    """Extract the bestseller entries from one page and append them to ``li``.

    Every entry found adds one row to the module-level list ``li`` in the
    form ``[rank, name, comments, author, publisher]``.

    Args:
        html: Markup of a ranking page.

    Returns:
        None. Results are accumulated in ``li``.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Locate the bestseller list in the page.
    book_list = soup.find('ul', class_='book_list')
    if book_list is None:
        # Empty or unexpected page: there is nothing to collect, and calling
        # into ``None`` would raise TypeError.
        return
    for book in book_list.find_all('li'):
        info = book.find_all('div')
        if len(info) < 6:
            # The fields below are read positionally from the first six
            # <div>s; skip entries that do not have that layout.
            continue
        # Rank, name, comment count, author and publisher of each book.
        # The rank cell's last character is dropped (presumably a trailing
        # punctuation mark - confirm against the live page).
        rank = info[0].text[0:-1]
        name = info[2].text
        # Keep only the text in front of the first '条'.
        comments = info[3].text.split('条')[0]
        author = info[4].text
        # The cell is split on whitespace; the second token is taken as the
        # publisher, or '' when there is no second token.
        date_and_publisher = info[5].text.split()
        publisher = date_and_publisher[1] if len(date_and_publisher) >= 2 else ''
        # Add this book's row to the shared result table.
        li.append([rank, name, comments, author, publisher])
# Process one page
async def download(url):
    """Fetch a single page and hand its HTML to ``parse``.

    Args:
        url: Address of the page to crawl.

    Returns:
        None. ``parse`` stores the extracted rows.
    """
    # ClientSession has to be instantiated: the class object itself is not
    # an async context manager.
    async with aiohttp.ClientSession() as session:
        # Fetch the page. fetch() expects (url, session), in that order.
        html = await fetch(url, session)
        # Parse the page.
        await parse(html)
# All page URLs to crawl. NOTE: url1/url2/url3 are placeholders in this
# pseudo-code and must be replaced with real URL strings before running.
urls=[url1,url2,url3]


async def _download_all(page_urls):
    """Run ``download`` for every URL concurrently and wait for all of them."""
    await asyncio.gather(*(download(url) for url in page_urls))


# Use asyncio for asynchronous I/O. asyncio.run() creates the event loop,
# runs the coroutine to completion and closes the loop afterwards; calling
# get_event_loop()/ensure_future() with no running loop is deprecated.
asyncio.run(_download_all(urls))

# Convert the collected table into a pandas DataFrame and save it as CSV.
df = pd.DataFrame(li, columns=['rank','name','comments','author','publisher'])
df.to_csv('E://douban/dangdang.csv',index=False)
来源:oschina
链接:https://my.oschina.net/u/4274625/blog/3543559