"""Concurrent same-site link crawler built on Tornado queues and pyquery."""

from urllib.parse import urldefrag, urljoin

from pyquery import PyQuery as pq
from tornado import gen, httpclient, ioloop, queues

# Root page of the crawl; only URLs starting with this prefix are followed.
base_url = "http://www.baidu.com"
# Number of worker coroutines that fetch pages at the same time.
concurrency = 8


async def get_url_links(url):
    """Fetch *url* and return the absolute URLs of the links found on it.

    Args:
        url: Address of the page to download.

    Returns:
        A list of absolute URLs, one per ``<a>`` tag that carries a
        non-empty ``href``. Fragments (``#...``) are removed because they
        point into the same document.

    Raises:
        Whatever ``AsyncHTTPClient.fetch`` raises for a failed request, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    response = await httpclient.AsyncHTTPClient().fetch(url)
    html = response.body.decode("utf-8")
    links = []
    for tag_a in pq(html)("a").items():
        href = tag_a.attr("href")
        if not href:
            # Anchors without a target would otherwise resolve to the page
            # itself and be queued again for nothing.
            continue
        # Relative links are relative to the page they appear on, not to the
        # site root, so resolve them against the requested URL.
        absolute, _fragment = urldefrag(urljoin(url, href))
        links.append(absolute)
    return links


async def main():
    """Crawl every page reachable from ``base_url`` that stays under it.

    A shared queue feeds ``concurrency`` worker coroutines. Each URL is
    fetched at most once; fetch errors are printed and the crawl continues.
    """
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        """Fetch one page and enqueue its in-scope links."""
        if current_url in seen_set:
            return

        print(f"获取:{current_url}")
        seen_set.add(current_url)

        for next_url in await get_url_links(current_url):
            # Stay on the target site and avoid queueing pages already done.
            if next_url.startswith(base_url) and next_url not in seen_set:
                await q.put(next_url)

    async def worker():
        """Consume URLs from the queue until a ``None`` sentinel arrives."""
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                # Best effort: one broken page must not stop the crawl.
                print(f"exception:{e}")
            finally:
                # Every item taken from the queue must be marked done,
                # otherwise q.join() below would never return.
                q.task_done()

    # Seed the queue with the start page.
    await q.put(base_url)

    # Start the consumers; their number matches the sentinels sent below.
    workers = gen.multi([worker() for _ in range(concurrency)])

    # Blocks until every queued URL has been processed.
    await q.join()

    # One sentinel per worker tells each of them to exit.
    for _ in range(concurrency):
        await q.put(None)

    # Wait for all workers to finish.
    await workers


if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(main)


# Sample output, apparently recorded from an earlier version of this script
# (it still lists "#fragment" URLs, which are now stripped). The exact list
# depends on the live site.
"""
获取:http://www.baidu.com
获取:http://www.baidu.com/gaoji/preferences.html
获取:http://www.baidu.com/
获取:http://www.baidu.com/more/
获取:http://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=
获取:http://www.baidu.com/cache/sethelp/help.html
获取:http://www.baidu.com/duty/
获取:http://www.baidu.com/search/jiqiao.html
获取:http://www.baidu.com#iec
获取:http://www.baidu.com#circle
获取:http://www.baidu.com#aoyouc
获取:http://www.baidu.com#sougouc
获取:http://www.baidu.com#qqtc
获取:http://www.baidu.com#ttc
获取:http://www.baidu.com#ffc
获取:http://www.baidu.com#chromec
获取:http://www.baidu.com#jishu360c
获取:http://www.baidu.com#world_jishuc
获取:http://www.baidu.com#operac
获取:http://www.baidu.com#worldc
获取:http://www.baidu.com#safaric
获取:http://www.baidu.com#greenc
获取:http://www.baidu.com#krc
获取:http://www.baidu.com#bdbrowserc
获取:http://www.baidu.com/duty/index.html
获取:http://www.baidu.com/copyright.html
获取:http://www.baidu.com/mianze-shengming.html
获取:http://www.baidu.com/right.html
获取:http://www.baidu.com/yinsiquan-policy.html
获取:http://www.baidu.com/yinsiquan-sub.html
获取:http://www.baidu.com/baozhang.html
获取:http://www.baidu.com/index
获取:http://www.baidu.com/search?keywords=%E7%99%BE%E5%BA%A6%E5%B8%90%E5%8F%B7%E8%A2%AB%E5%B0%81%E7%A6%81
获取:http://www.baidu.com/search?keywords=%E5%A6%82%E4%BD%95%E4%B8%BE%E6%8A%A5%E7%BD%91%E7%AB%99
获取:http://www.baidu.com/search?keywords=%E6%8A%95%E8%AF%89%E4%BE%B5%E6%9D%83%E4%BF%A1%E6%81%AF
获取:http://www.baidu.com/search?keywords=%E7%99%BE%E5%BA%A6%E5%B8%90%E5%8F%B7%E8%A2%AB%E7%9B%97
获取:http://www.baidu.com/search?keywords=%E5%B8%90%E5%8F%B7%E7%94%B3%E8%AF%89%E6%9C%AA%E9%80%9A%E8%BF%87
获取:http://www.baidu.com/search?keywords=%E8%B4%B4%E5%90%A7%E8%B4%B4%E5%AD%90%E8%A2%AB%E5%88%A0
获取:http://www.baidu.com/search?keywords=%E5%88%A0%E9%99%A4%2F%E6%9B%B4%E6%96%B0%E5%BF%AB%E7%85%A7
获取:http://www.baidu.com/zhifu
获取:http://www.baidu.com/jubao
获取:http://www.baidu.com/statement
获取:http://www.baidu.com/personalinformation
获取:http://www.baidu.com/more/index.html
获取:http://www.baidu.com/search/jubao.html
"""
# Source: https://www.cnblogs.com/traditional/p/11326710.html