福利,三俗,喜闻乐见
第三天:实现了无阻塞版本。在使用线程池的基础上添加了一些控制代码,为将来实现控制台做准备;另外禁掉了控制台的进度报告,速度一下子就上去了。
接下来要转到文章的另一部分——websocket 上面去。
# -*- coding: utf8 -*-
import concurrent.futures
import urllib.request
import re
import json
import ast
import os
# Maps every URL registered so far (page or picture) to the ID generated
# for it; createPageID and createPicID both write into it.
visitedURL = dict()
# One-element list used as a mutable counter: holds the last page number
# handed out by createPageID.
maxpageID = [0]
# Page records keyed by page ID; load_page stores {'url': ..., 'links': ...}.
pageDict = dict()
def createPageID(url):
    """Allocate the next page ID for ``url`` and remember the URL as visited.

    The counter lives in the one-element list ``maxpageID``; the new ID is
    the string ``'page_<n>'`` and is also stored in ``visitedURL[url]``.

    Returns:
        The newly generated page ID string.
    """
    next_number = maxpageID.pop() + 1
    maxpageID.append(next_number)
    page_id = 'page_' + str(next_number)
    visitedURL[url] = page_id
    return page_id
# One-element list used as a mutable counter: holds the last picture number
# handed out by createPicID.
maxpicID = [0]
# Picture records keyed by picture ID; load_pic creates the entries.
picDict = dict()
def createPicID(url):
    """Allocate the next picture ID for ``url`` and remember the URL as visited.

    The counter lives in the one-element list ``maxpicID``; the new ID is
    the string ``'pic_<n>'`` and is also stored in ``visitedURL[url]``.

    Returns:
        The newly generated picture ID string.
    """
    next_number = maxpicID.pop() + 1
    maxpicID.append(next_number)
    pic_id = 'pic_' + str(next_number)
    visitedURL[url] = pic_id
    return pic_id
# Lists of picture IDs describing where each download currently stands.
# processload consults stoppedQueue/waitingQueue to decide whether a running
# download should be suspended; runMachine appends failed IDs to errorQueue.
stoppedQueue, waitingQueue, downloadingQueue, errorQueue = [], [], [], []
# picID -> True for every picture that processload finished writing.
savedDict = {}
# Thread pool intended for page downloading (not used by the code shown here).
pageTpe = concurrent.futures.ThreadPoolExecutor(8)
# Thread pool used by runMachine for picture downloading.
picTpe = concurrent.futures.ThreadPoolExecutor(4)
def runMachine():
    """Download every picture queued in ``waitingQueue`` and print a summary.

    Picture IDs are taken from the front of ``waitingQueue`` and submitted to
    the ``picTpe`` pool running ``processload``, keeping at most four
    downloads in flight. Each finished future is reaped exactly once. A
    failed download is reported, added to ``errorQueue``, and its partially
    written file (if any) is closed and removed.

    The loop runs until the queue is empty AND every submitted download has
    finished, so failures of the last in-flight downloads are collected too.
    The shared pool is deliberately not shut down, so this function can be
    called again for another page.
    """
    totalpics = len(waitingQueue)
    # future -> picID for downloads that were submitted but not yet reaped.
    pending = {}
    while waitingQueue or pending:
        # Throttle on our own bookkeeping: downloadingQueue is filled
        # asynchronously by the workers and would lag behind submissions.
        while waitingQueue and len(pending) < 4:
            picID = waitingQueue.pop(0)
            pending[picTpe.submit(processload, picID)] = picID
        done, _ = concurrent.futures.wait(
            pending, return_when=concurrent.futures.FIRST_COMPLETED)
        for future in done:
            # Forget the future so it is never processed a second time.
            picID = pending.pop(future)
            try:
                future.result()
            except Exception as e:
                if picID in downloadingQueue:
                    downloadingQueue.remove(picID)
                print('error happens', e)
                errorQueue.append(picID)
                # The record may already be gone, or may lack 'filepath' if
                # the worker failed early.
                picInfo = picDict.get(picID)
                destructPicInfo(picID)
                filepath = picInfo.get('filepath') if picInfo else None
                if filepath:
                    try:
                        os.remove(filepath)
                    except OSError:
                        pass
    print('total pic(s):{} loaded pic(s):{}'.format(totalpics, len(savedDict)))
    if errorQueue:
        print('files below are not downloaded properly:')
        for picID in errorQueue:
            picInfo = picDict.get(picID)
            # Fall back to the ID when the record was dropped.
            print(picInfo['url'] if picInfo else picID)
def destructPicInfo(picID):
    """Close the output file recorded for ``picID`` and drop it from the record.

    Does nothing when ``picID`` has no entry in ``picDict`` or the entry has
    no ``'outputfile'`` key.
    """
    if picID not in picDict:
        return
    record = picDict[picID]
    if 'outputfile' not in record:
        return
    # Close first, then forget the handle.
    record['outputfile'].close()
    del record['outputfile']
def processload(picID):
    """Download the picture registered under ``picID`` into ``pics/``.

    The target file name is the last path segment of the picture URL. While
    running, the ID is listed in ``downloadingQueue`` and the record in
    ``picDict`` is updated with ``filepath``, ``total``, ``progress`` and,
    while the file is open, ``outputfile``.

    Outcomes:
      * finished: ``state`` becomes 2, the ID is recorded in ``savedDict``,
        and the file and connection are closed;
      * response is not an image: the record is removed from ``picDict`` and
        nothing is written;
      * suspended: if the ID was taken out of ``downloadingQueue`` and placed
        in ``stoppedQueue`` or ``waitingQueue``, the loop stops and the open
        connection is kept in ``picInfo['conn']``;
      * error: the file and connection are closed and the exception is
        re-raised (the ID is left in ``downloadingQueue`` for the caller).

    Raises:
        KeyError: if ``picID`` is not in ``picDict``.
        Exception: whatever the network or file operations raise.
    """
    downloadingQueue.append(picID)
    picInfo = picDict[picID]
    url = picInfo['url']
    directory = 'pics/'
    filepath = directory + url.split('/')[-1]
    picInfo['filepath'] = filepath
    picInfo['progress'] = 0
    # exist_ok avoids the check-then-create race between worker threads.
    os.makedirs(directory, exist_ok=True)

    conn = urllib.request.urlopen(url, timeout=10)
    try:
        headers = conn.info()
        # Either header may be absent, in which case get() returns None.
        content_type = headers.get('Content-Type') or ''
        if 'image' not in content_type:
            # Not a picture: drop the record and stop before creating a file.
            conn.close()
            destructPicInfo(picID)
            picDict.pop(picID, None)
            if picID in downloadingQueue:
                downloadingQueue.remove(picID)
            return
        content_length = headers.get('Content-Length')
        picInfo['total'] = int(content_length.strip()) if content_length else 0

        outputfile = open(filepath, 'wb')
        picInfo['outputfile'] = outputfile
        updateStatus(picInfo)
        while True:
            chunk = conn.read(4096)
            if not chunk:
                # End of stream: mark as done and release resources.
                picInfo['state'] = 2
                if picID in downloadingQueue:
                    downloadingQueue.remove(picID)
                savedDict[picID] = True
                updateStatus(picInfo)
                destructPicInfo(picID)
                conn.close()
                return
            outputfile.write(chunk)
            picInfo['progress'] += len(chunk)
            updateStatus(picInfo)
            if picID not in downloadingQueue and (
                    picID in stoppedQueue or picID in waitingQueue):
                # Suspended from outside: keep the connection for later use.
                picInfo['conn'] = conn
                return
    except Exception:
        # Do not leak the file handle or the socket on failure.
        destructPicInfo(picID)
        conn.close()
        raise
def updateStatus(picInfo, verbose=False):
    """Optionally print the download status of one picture record.

    Progress reporting is disabled by default, so a plain
    ``updateStatus(picInfo)`` call does nothing. Pass ``verbose=True`` to
    print the report.

    Args:
        picInfo: picture record; ``'url'``, ``'state'``, ``'total'`` and
            ``'progress'`` are read when ``verbose`` is true.
        verbose: when true, print ``'<url> finished!'`` for state 2, or the
            completed percentage when both ``total`` and ``progress`` are
            non-zero.
    """
    if not verbose:
        return
    url = picInfo['url']
    if picInfo['state'] == 2:
        print(url, 'finished!')
    elif picInfo['total'] and picInfo['progress']:
        print('{} progress: {:.2%}'.format(
            url, picInfo['progress'] / picInfo['total']))
def log(*args):
    """Append one comma-separated line built from ``args`` to ``t.txt``.

    Each argument is converted with ``str``; the line is written UTF-8
    encoded and terminated with a newline. The file is opened in binary
    append mode and is closed even if the write fails.
    """
    line = ','.join(map(str, args)) + '\n'
    with open('t.txt', 'ab') as f:
        f.write(line.encode('utf-8'))
def load_pic(url, pageID):
    """Register the picture at ``url`` for download unless it was seen before.

    A new picture ID is created, a record is stored in ``picDict`` and the
    ID is appended to ``waitingQueue``. URLs already present in
    ``visitedURL`` are ignored.

    Args:
        url: address of the picture.
        pageID: ID of the page the picture was found on.
    """
    if url not in visitedURL:
        picID = createPicID(url)
        # state: 0 = not started, 1 = queued for download, 2 = downloaded
        picDict[picID] = dict(url=url, pageID=pageID, total=0, progress=0, state=1)
        waitingQueue.append(picID)
def _gallery_pic_urls(text):
    """Return the picture URLs found in the text of a ``.hdBigPic.js`` file.

    Everything before the ``'/* |xGv00|'`` marker is parsed as a Python
    literal with ``ast.literal_eval`` (no code is executed), and the
    ``Content`` of each picture node is collected.
    """
    obj = ast.literal_eval(text[:text.index('/* |xGv00|')])
    picsobj = obj['Children'][0]['Children'][1]['Children']
    return [x['Children'][2]['Children'][0]['Content'] for x in picsobj]


def load_page(url):
    """Crawl the list page at ``url``, queue its pictures and download them.

    The page is decoded as GBK. Every ``*.htm`` link followed by an ``<img``
    inside the news-list section is mapped to its ``.hdBigPic.js`` companion.
    Each companion not seen before is fetched and parsed, and its picture
    URLs are registered through ``load_pic``. Finally ``runMachine`` is
    called to download everything queued.

    Does nothing if ``url`` is already in ``visitedURL``. A gallery whose
    connection cannot be opened is skipped. Any other gallery failure is
    reported and re-raised; a ``ValueError`` (for example a missing section
    marker) is reported by the outer handler, after which the download step
    still runs.

    All fetches use a 10 second timeout, matching ``processload``.
    """
    if url in visitedURL:
        return
    pID = createPageID(url)
    pageDict[pID] = {'url': url, 'links': None}
    # read() replaces readall(), which HTTPResponse no longer provides.
    with urllib.request.urlopen(url, timeout=10) as conn:
        text = conn.read().decode('GBK')
    try:
        startIndex = text.index('<div class="mod newslist clear">')
        endIndex = text.index('<div class="mod curPosition clear">', startIndex)
        text = text[startIndex:endIndex]
        # Escaped dot: match the literal ".htm" suffix only.
        patt = re.compile(r'href="([^"]+?)\.htm"><img', re.DOTALL | re.IGNORECASE)
        jsurls = [x + '.hdBigPic.js' for x in patt.findall(text)]
        pageurllist = []
        for jsurl in jsurls:
            if jsurl in visitedURL:
                continue
            jsID = createPageID(jsurl)
            pageDict[jsID] = {'url': jsurl, 'links': None}
            try:
                conn = urllib.request.urlopen(jsurl, timeout=10)
            except Exception:
                # Unreachable or malformed gallery URL: skip this gallery.
                print('failed')
                continue
            try:
                # The with-block closes the response even if reading fails.
                with conn:
                    jstext = conn.read().decode('GBK')
                jslinks = _gallery_pic_urls(jstext)
                if jslinks:
                    pageDict[jsID]['links'] = jslinks
                    print(jsurl, '{} pics'.format(len(jslinks)))
                    pageurllist.append(jsurl)
                    for picurl in jslinks:
                        load_pic(picurl, jsID)
            except Exception:
                print(jsurl, 'failed')
                raise
        pageDict[pID]['links'] = pageurllist
    except ValueError as e:
        # A section marker was not found, or a gallery raised ValueError.
        print('error', e)
    runMachine()
# Script entry: list of start pages; only the first one is crawled.
urls = [
    'http://games.qq.com/l/photo/gmcos/yxcos.htm',
]
load_page(urls[0])
来源:oschina
链接:https://my.oschina.net/u/1167335/blog/155422