import asyncio
import aiohttp
import async_timeout
from lxml import html
from timeit import default_timer as timer

from db import DBData


class Crawler:
    def __init__(self, **kwargs):
        self.domains = kwargs["domains"]
        self.max_depth = kwargs["max_depth"]
        self.max_retries = 3
        self.max_workers = 10
        self.Q = asyncio.Queue()      # frontier: (url, depth, retries)
        self.db_Q = asyncio.Queue()   # URLs waiting to be written to the DB
        self.cache = set()            # URLs already visited
        self.count = 0                # number of successful requests
        self.loop = asyncio.get_event_loop()
        self.db_data = DBData()

        # Clear any results left over from a previous run
        self.db_data.clear_crawler()

    async def get(self, url, timeout):
        # async_timeout is used as an async context manager so the request
        # is cancelled if it takes longer than `timeout` seconds
        async with async_timeout.timeout(timeout):
            async with self.session.get(url) as response:
                return await response.text()

    async def extract_urls(self, url, timeout=10):
        tree = html.fromstring(await self.get(url, timeout))
        # Search only in domains
        return {p for p in tree.xpath("//a/@href")}
        # if any(domain in p for domain in self.domains)}

    async def worker(self):
        while True:
            url, depth, retries = await self.Q.get()
            if url in self.cache:
                self.db_Q.put_nowait(url)
                self.Q.task_done()
                continue
            try:
                new_urls = await self.extract_urls(url)
            except Exception as e:
                if retries <= self.max_retries:
                    # Re-queue the URL with an increased retry counter
                    self.Q.put_nowait((url, depth, retries + 1))
                else:
                    print("Error in %s: %s" % (url, repr(e)))
            else:
                self.cache.add(url)
                self.count += 1
                self.db_Q.put_nowait(url)
                print("Depth: %s Retry: %s Visited: %s" % (depth, retries, url))
                if depth + 1 <= self.max_depth:
                    for x in new_urls:
                        # Newly discovered URLs start with a fresh retry count
                        self.Q.put_nowait((x, depth + 1, 0))
            self.Q.task_done()

    async def run(self):
        # The session is created inside the running loop, so no explicit
        # loop argument is needed
        async with aiohttp.ClientSession() as session:
            self.session = session
            workers = [self.worker() for _ in range(self.max_workers)]
            workers += [self.write_to_db() for _ in range(self.max_workers)]
            tasks = [self.loop.create_task(x) for x in workers]
            # Give the workers a head start before waiting on the queues
            await asyncio.sleep(5)
            await self.Q.join()
            await self.db_Q.join()
            # All queued work is done; stop the endlessly looping workers
            for task in tasks:
                task.cancel()

    def start(self):
        # Seed the queue with every start domain at depth 0
        for domain in self.domains:
            print("Crawling %s start..." % domain)
            self.Q.put_nowait((domain, 0, 0))

        start_time = timer()
        self.loop.run_until_complete(self.run())
        self.loop.close()
        runtime = timer() - start_time

        print("Crawling %s end. Exec time: %s. Requests: %s" % (
            ", ".join(self.domains), runtime, self.count))

    async def write_to_db(self):
        while True:
            address = await self.db_Q.get()
            # Only store URLs that are not already in the database
            if await self.db_data.check_url(address) is None:
                self.db_data.add_url(address)
                print("Write to DB: %s" % address)
            self.db_Q.task_done()


if __name__ == "__main__":
    options = {
        "domains": ["https://www.yahoo.com/news/"],
        "max_depth": 1
    }
    c = Crawler(**options)
    c.start()
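
The db module imported at the top is not part of the listing. A minimal in-memory sketch of the interface this crawler expects could look like the following; the method names (clear_crawler, check_url, add_url) come from the calls above, while the storage itself is an assumption, since a real DBData would presumably wrap an actual database:

# db.py (hypothetical sketch, not from the original post)
class DBData:
    def __init__(self):
        # Stand-in for a real database connection
        self._urls = set()

    def clear_crawler(self):
        # Forget everything collected by a previous crawl
        self._urls.clear()

    async def check_url(self, url):
        # The crawler awaits this call and treats None as "not stored yet"
        return url if url in self._urls else None

    def add_url(self, url):
        self._urls.add(url)

check_url is the only coroutine here because it is the only method the crawler awaits; clear_crawler and add_url are called synchronously. Besides db.py, the script needs aiohttp, async_timeout, and lxml installed.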
Source: oschina
Link: https://my.oschina.net/u/4399738/blog/3700618