Eye candy, lowbrow fun, the stuff everyone loves to see.
To hook up a web interface later, I need a structure that can serve queries and push status information at any time. For now the logic doesn't need to be complicated, and performance isn't a concern; as long as this structure is in place, it can always be revised later.
Implementing that structure means picking an approach to asynchronous I/O, which gave me quite a headache. I haven't been programming in Python for long and don't know many tricks: multiprocessing wasn't workable for me on Windows, and installing gevent is too much hassle (it pulls in a pile of third-party dependencies and needs a compiler), so I gritted my teeth and went with threads. I have an architecture in mind that would keep the front end responsive, but it's fairly complex to implement, so it definitely won't be used for this example. As I said above, once the structure is in place, if performance ever becomes a requirement I can move it to Linux and revise it then.
Newer Python 3 releases ship the concurrent.futures module, which wraps the async-related plumbing quite conveniently, and its documentation even includes a ready-made multithreaded download example.
Talk about anticipating what people want; they know you all like this kind of thing ;-)
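For reference, the pattern in that documentation example looks roughly like this. This is a sketch of mine: the URLS list and the fetch helper are placeholders, not from the original post.

import concurrent.futures
import urllib.request

URLS = ['http://example.com/a.jpg', 'http://example.com/b.jpg']  # placeholder URLs

def fetch(url, timeout=10):
    # blocking download of a single resource
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_url = {executor.submit(fetch, url): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            print(url, len(future.result()), 'bytes')
        except Exception as e:
            print(url, 'failed:', e)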
Each page gets a task ID: page_x.
Each picture gets a task ID: pic_x.
In addition, each page carries a list of its pictures, plus the page's original address.
In addition, each picture carries these data items:
state: 0 = not started, 1 = queued, 2 = finished
ID of the page it belongs to
bytes downloaded so far
total bytes
If both byte counts are 0, the picture hasn't been downloaded yet;
if the total is set, the picture is downloading, and the downloaded bytes show its progress.
the picture's original address
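Concretely, in the script below these become two dicts keyed by task ID. A sketch with placeholder URLs:

pageDict = {
    'page_1': {
        'url': 'http://example.com/gallery.htm',     # the page's original address (placeholder)
        'links': ['http://example.com/pics/1.jpg'],  # the page's picture list (None until parsed)
    },
}
picDict = {
    'pic_1': {
        'url': 'http://example.com/pics/1.jpg',  # the picture's original address (placeholder)
        'pageID': 'page_1',                      # ID of the page it belongs to
        'state': 1,                              # 0 = not started, 1 = queued, 2 = finished
        'progress': 0,                           # bytes downloaded so far
        'total': 0,                              # total bytes; both 0 means not yet downloaded
    },
}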
After some experimentation I found a way to download; the code is below. This version is blocking: get the whole flow working first, then convert it to non-blocking in the next step. In fact, this version is already usable.
# -*- coding: utf8 -*-
import concurrent.futures
import urllib.request
import re
import os
import ast

visitedURL = {}

# single-element lists double as mutable counters
maxpageID = [0]
pageDict = {}
def createPageID(url):
    pID = maxpageID.pop()+1
    maxpageID.append(pID)
    pID = 'page_'+str(pID)
    visitedURL[url] = pID
    return pID

maxpicID = [0]
picDict = {}
def createPicID(url):
    pID = maxpicID.pop()+1
    maxpicID.append(pID)
    pID = 'pic_'+str(pID)
    visitedURL[url] = pID
    return pID

stoppedQueue = []      # reserved for pause/resume
waitingQueue = []
downloadingQueue = []
savedDict = dict()

# thread pools for the coming non-blocking version; unused in this blocking draft
#for page downloading
pageTpe = concurrent.futures.ThreadPoolExecutor(max_workers=8)
#for picture downloading
picTpe = concurrent.futures.ThreadPoolExecutor(max_workers=4)

def runMachine():
    #keep at most 4 downloads in flight
    while waitingQueue:
        if len(downloadingQueue)<4:
            picID = waitingQueue.pop(0)
            processload(picID)

def processload(picID):
    downloadingQueue.append(picID)
    #open conn, loading a picture
    picInfo = picDict[picID]
    url = picInfo['url']
    filename = url.split('/')[-1]
    conn = urllib.request.urlopen(url,timeout=10)
    picInfo['total'] = int(conn.info().get('Content-Length').strip())
    os.makedirs('pics', exist_ok=True)
    outputfile = open('pics/'+filename,'wb')
    picInfo['progress'] = 0
    updateStatus(picInfo)
    while True:
        chunk = conn.read(4096)
        picInfo['progress'] += len(chunk)
        updateStatus(picInfo)
        if not chunk:
            picInfo['state'] = 2
            downloadingQueue.remove(picID)
            savedDict[picID] = True
            updateStatus(picInfo)
            outputfile.close()
            conn.close()
            break
        outputfile.write(chunk)
    #reportProgress(url,progress,total)

#report
def updateStatus(picInfo):
    url = picInfo['url']
    if picInfo['state']==2:
        print(url,'finished!')
    elif picInfo['total'] and picInfo['progress']:
        print('{} progress: {:.2%}'.format(url,picInfo['progress']/picInfo['total']))

def log(*args):
    f = open('t.txt','ab')
    f.write((','.join(map(str,args))+'\n').encode('utf-8'))
    f.close()

def load_pic(url,pageID):
    if url in visitedURL:
        return
    picID = createPicID(url)
    #state: 0 = not started, 1 = queued, 2 = finished
    picDict[picID] = {'url':url,'pageID':pageID,'total':0,'progress':0,'state':1}
    waitingQueue.append(picID)

def load_page(url):
    if url in visitedURL:
        return
    pID = createPageID(url)
    pageDict[pID] = {'url':url,'links':None}
    conn = urllib.request.urlopen(url)
    text = conn.read().decode('GBK')
    conn.close()
    try:
        startIndex = text.index('<div class="mod newslist clear">')
        endIndex = text.index('<div class="mod curPosition clear">',startIndex)
        text = text[startIndex:endIndex]
        patt = re.compile(r'href="([^"]+?)\.htm"><img', re.DOTALL | re.IGNORECASE)
        jsurls = [x+'.hdBigPic.js' for x in patt.findall(text)]
        pageurllist = []
        for jsurl in jsurls:
            if jsurl in visitedURL:
                continue
            jsID = createPageID(jsurl)
            pageDict[jsID] = {'url':jsurl,'links':None}
            jslinks = []
            try:
                conn = urllib.request.urlopen(jsurl)
            except Exception as e:
                print(jsurl,'failed:',e)
                continue
            try:
                text = conn.read().decode('GBK')
                #strip the trailing comment so the rest parses as a literal
                text = text[:text.index('/* |xGv00|')]
                obj = ast.literal_eval(text)
                picnum = int(obj['Children'][0]['Children'][0]['Children'][0]['Content'])  # declared picture count (unused for now)
                picsobj = obj['Children'][0]['Children'][1]['Children']
                for x in picsobj:
                    picurl = x['Children'][2]['Children'][0]['Content']
                    jslinks.append(picurl)
                if jslinks:
                    pageDict[jsID]['links'] = jslinks
                    print(jsurl,'{} pics'.format(len(jslinks)))
                    try:
                        title = obj['Children'][0]['Children'][8]['Children'][0]['Content']
                    except Exception:
                        title = 'unknown'
                    pageurllist.append(jsurl)
                    for picurl in jslinks:
                        load_pic(picurl,jsID)
            except Exception as e:
                print(jsurl,'failed')
                raise e
        pageDict[pID]['links'] = pageurllist
    except ValueError as e:
        print('error',e)
        #can't find the expected markers in the page

urls = ['http://games.qq.com/l/photo/gmcos/yxcos.htm']
load_page(urls[0])
#run the queue only after load_page has filled it
runMachine()
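As a preview of the non-blocking step, the picTpe pool created above could take over processload. A rough sketch (runMachine_async is a hypothetical name of mine, and the shared queues and dicts would need locking once real threads touch them):

def runMachine_async():
    # submit every queued picture to the pool instead of downloading in-line
    futures = [picTpe.submit(processload, picID) for picID in list(waitingQueue)]
    waitingQueue.clear()
    # block until all downloads settle; surface any worker exception
    for future in concurrent.futures.as_completed(futures):
        future.result()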
Source: oschina
Link: https://my.oschina.net/u/1167335/blog/153501