import asyncio
import aiohttp
import logging
from aiohttp import ContentTypeError
from motor.motor_asyncio import AsyncIOMotorClient

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://dynamic5.scrape.cuiqingcai.com/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://dynamic5.scrape.cuiqingcai.com/api/book/{id}'
PAGE_SIZE = 18
PAGE_NUMBER = 1
CONCURRENCY = 5

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'books'
MONGO_COLLECTION_NAME = 'books'

client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

loop = asyncio.get_event_loop()


class Spider(object):

    def __init__(self):
        # cap the number of requests in flight at CONCURRENCY
        self.semaphore = asyncio.Semaphore(CONCURRENCY)

    async def scrape_api(self, url):
        # fetch a JSON API endpoint, throttled by the semaphore
        async with self.semaphore:
            try:
                logging.info('scraping %s', url)
                async with self.session.get(url) as response:
                    # small delay to be polite to the server
                    await asyncio.sleep(1)
                    return await response.json()
            except ContentTypeError:
                # response body was not JSON; exc_info=True logs the traceback
                logging.error('error occurred while scraping %s', url, exc_info=True)

    async def scrape_index(self, page):
        # each index page lists PAGE_SIZE books
        url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
        return await self.scrape_api(url)

    async def scrape_detail(self, id):
        url = DETAIL_URL.format(id=id)
        data = await self.scrape_api(url)
        await self.save_data(data)

    async def save_data(self, data):
        logging.info('saving data %s', data)
        if data:
            # upsert keyed on the book id so reruns do not duplicate documents
            return await collection.update_one(
                {'id': data.get('id')},
                {'$set': data},
                upsert=True)

    async def main(self):
        self.session = aiohttp.ClientSession()
        # index tasks
        scrape_index_tasks = [asyncio.ensure_future(self.scrape_index(page))
                              for page in range(1, PAGE_NUMBER + 1)]
        results = await asyncio.gather(*scrape_index_tasks)
        # detail tasks: collect the book ids from every index page
        ids = []
        for index_data in results:
            if not index_data:
                continue
            for item in index_data.get('results'):
                ids.append(item.get('id'))
        scrape_detail_tasks = [asyncio.ensure_future(self.scrape_detail(id))
                               for id in ids]
        await asyncio.wait(scrape_detail_tasks)
        await self.session.close()


if __name__ == '__main__':
    spider = Spider()
    loop.run_until_complete(spider.main())
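
After a run, the upserted documents can be checked directly in MongoDB. A minimal sketch of such a check, assuming a local MongoDB instance and the synchronous pymongo driver (this helper is not part of the original script):

from pymongo import MongoClient

# hypothetical verification snippet; connection values mirror the constants above
client = MongoClient('mongodb://localhost:27017')
collection = client['books']['books']

print('saved documents:', collection.count_documents({}))
# 'id' is the field the spider upserts on, so it is safe to project
print('sample:', collection.find_one({}, {'_id': 0, 'id': 1}))

Because save_data uses update_one with upsert=True, the document count should stay stable across repeated runs instead of growing.
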
Source: oschina
Link: https://my.oschina.net/u/3892643/blog/3219553