Target URL: https://pvp.qq.com/web201605/wallpaper.shtml
Analyzing the page source
- First, open the target page.
- Locate the area that holds the HD wallpapers.
- Click any wallpaper, then right-click and choose "Inspect" to view the page markup; you will find the block of source that contains all the wallpapers.

Then click into that block of source to look closely at the target wallpaper's URL.
However, this is a Tencent site, and fetching the raw page HTML in bulk will not return the wallpapers. Try viewing the page source.
**Note:** the page source is not the markup we just inspected in the developer tools; in other words, the HTML is incomplete, because the wallpapers are loaded dynamically with Ajax. So we need to find the API endpoint that serves them.
Refresh the page and, in the Network tab of the developer tools, look for the request to workList_inc.cgi (the wallpaper-list API).
That request's URL is the one we need: it returns a JSON document containing the wallpaper URLs. Looking at its query string, you will find a page parameter that starts at 0, and there are 21 pages in total.
Open the response and you will see URLs for each wallpaper in several sizes (the fields sProdImgNo_1 through sProdImgNo_8); the short sketch below fetches one page to confirm this.
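A minimal sketch for checking the endpoint by hand, assuming the query string used by get_url() later and browser-like headers (the cookie from the preparation section may also be required):

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'referer': 'https://pvp.qq.com/web201605/wallpaper.shtml',
}
api = ('https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi'
       '?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=0'
       '&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2'
       '&iFlowId=267733&iActId=2735&iModuleId=2735&_=1578971884017')
resp = requests.get(api, headers=headers)
data = resp.json()                      # the API returns plain JSON (sDataType=JSON)
print(len(data['List']))                # up to 20 wallpapers per page (iListNum=20)
print(data['List'][0]['sProdImgNo_1'])  # still percent-encoded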
However, these URLs are percent-encoded, so we need to decode them:
from urllib import parse

# decode one of the sProdImgNo_* values copied from the JSON response
result = parse.unquote("http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735011316%2F1578904549%5F84828260%5F12121%5FsProdImgNo%5F1%2Ejpg%2F20")
print(result)
The decoded link still ends with a size suffix (the number after the final slash), so it points to a scaled-down image rather than the full wallpaper. Rewriting that trailing size to 0, as extract_images() does in the code below, gives the URL of the full-resolution HD wallpaper.
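Putting both steps together (a small sketch; the trailing /200 thumbnail suffix is an assumption here, matching what extract_images() in the full code expects):

from urllib import parse

# same path as the example above, with an assumed /200 thumbnail suffix
encoded = "http%3A%2F%2Fshp%2Eqpic%2Ecn%2Fishow%2F2735011316%2F1578904549%5F84828260%5F12121%5FsProdImgNo%5F1%2Ejpg%2F200"
hd_url = parse.unquote(encoded).replace('/200', '/0')  # decode, then rewrite the size suffix
print(hd_url)  # http://shp.qpic.cn/ishow/2735011316/1578904549_84828260_12121_sProdImgNo_1.jpg/0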
Preparation
from urllib import parse
import requests
import os
from urllib import request
import threading
import queue
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
'cookie': 'eas_sid=61b5K7L8g9y658H132i2q6O0t1; pgv_info=ssid=s617747450; pgv_pvid=9760685964; pvpqqcomrouteLine=wallpaper_wallpaper',
'referer': 'https://pvp.qq.com/web201605/wallpaper.shtml'
} # request headers: user-agent, cookie, and referer copied from the browser
Producer and consumer
The crawler runs with a producer-consumer pattern and uses queue.Queue to pass work between threads; it is a thread-safe queue (a minimal illustration follows below).
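A rough, self-contained sketch of that hand-off, independent of the crawler itself: queue.Queue's put() and get() are already thread-safe, so one thread can feed another with no extra locking.

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()       # blocks until an item is available
        if item is None:     # sentinel value: stop
            break
        print('got', item)

t = threading.Thread(target=worker)
t.start()
for i in range(3):
    q.put(i)
q.put(None)                  # tell the worker to exit
t.join()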
class Producer(threading.Thread):
    def __init__(self, page_url_list, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_url_list = page_url_list
        self.image_queue = image_queue
    def run(self) -> None:
        while not self.page_url_list.empty():
            page_url = self.page_url_list.get()
            resp = requests.get(page_url, headers=headers)
            dates = resp.json()  # parses the body into a dict; this works because the API returns a JSON string
            for date in dates['List']:
                image_urls = extract_images(date)
                # the wallpaper name is also percent-encoded; strip the "1:1" suffix
                name = parse.unquote(date['sProdName']).replace("1:1", "").strip()
                dirpath = os.path.join('images', name)
                os.makedirs(dirpath, exist_ok=True)  # create images/<name>/, including the parent directory
                for index, image_url in enumerate(image_urls):
                    self.image_queue.put({"image_url": image_url, "dirpath": os.path.join(dirpath, "%d.jpg" % (index + 1))})
class Consumer(threading.Thread):
    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
    def run(self) -> None:
        while True:
            try:
                # exit once nothing new arrives for 10 seconds (i.e. the producers are done)
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                break
            image_url = image_obj.get('image_url')
            dirpath = image_obj.get('dirpath')
            try:
                request.urlretrieve(image_url, dirpath)
                print(dirpath + " downloaded")
            except Exception:
                print(image_url, '\n', dirpath)
                print(dirpath + " download failed")
In the producer, each thread takes a page URL from the page_url_list queue, requests it, parses the returned JSON, extracts and decodes the HD wallpaper URLs, creates a directory for the wallpaper with os, and pushes one download task per image onto image_queue.
In the consumer, each thread takes a task from image_queue, downloads the image, and saves it into the directory the producer created. A consumer exits once image_queue has stayed empty for 10 seconds, which serves as a simple shutdown signal after the producers finish.
Extracting the HD wallpaper links
def extract_images(date):  # build the real (full-resolution) URL for each wallpaper image
    image_urls = []
    for x in range(1, 9):  # the JSON carries eight encoded fields: sProdImgNo_1 ... sProdImgNo_8
        # decode the URL, then rewrite the trailing /200 size suffix to /0 for the original image
        image_url = parse.unquote(date['sProdImgNo_%d' % x]).replace('/200', '/0')
        image_urls.append(image_url)
    return image_urls
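A quick way to try extract_images() on its own (the dict below is a made-up item that only carries the eight fields the function reads):

from urllib import parse

sample = {
    'sProdImgNo_%d' % i: parse.quote(
        'http://shp.qpic.cn/ishow/sample/sProdImgNo_%d.jpg/200' % i, safe='')
    for i in range(1, 9)
}
print(extract_images(sample))  # eight decoded URLs, each ending in /0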
Dividing up the work
def get_url():
    page_url_list = queue.Queue(21)
    image_queue = queue.Queue(10000)
    for x in range(21):  # 21 pages of wallpaper metadata
        page_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1578971884017'.format(x)
        page_url_list.put(page_url)
    for x in range(3):   # 3 producers fetch and parse the JSON pages
        th = Producer(page_url_list, image_queue, name='Producer-%d' % x)
        th.start()
    for x in range(8):   # 8 consumers download the images
        th = Consumer(image_queue, name='Consumer-%d' % x)
        th.start()
First we build the URLs of the 21 JSON pages and put them on a queue, then start 3 producer threads and 8 consumer threads. The function returns right after starting them; a variant that blocks until everything is downloaded is sketched below.
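If you would rather have the script wait until every download has finished, one option (a sketch, not part of the original code; it assumes the Producer and Consumer classes and the imports above) is to keep the thread objects and join() them:

def get_url_blocking(page_urls):  # hypothetical variant of get_url()
    page_url_list = queue.Queue(len(page_urls))
    image_queue = queue.Queue(10000)
    for url in page_urls:
        page_url_list.put(url)
    threads = []
    for x in range(3):
        t = Producer(page_url_list, image_queue, name='Producer-%d' % x)
        t.start()
        threads.append(t)
    for x in range(8):
        t = Consumer(image_queue, name='Consumer-%d' % x)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait until every producer and consumer has exited
    print('all downloads finished')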
Complete code
from urllib import parse
import requests
import os
from urllib import request
import threading
import queue
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
'cookie': 'eas_sid=61b5K7L8g9y658H132i2q6O0t1; pgv_info=ssid=s617747450; pgv_pvid=9760685964; pvpqqcomrouteLine=wallpaper_wallpaper',
'referer': 'https://pvp.qq.com/web201605/wallpaper.shtml'
} # request headers: user-agent, cookie, and referer copied from the browser
class Producer(threading.Thread):
    def __init__(self, page_url_list, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_url_list = page_url_list
        self.image_queue = image_queue
    def run(self) -> None:
        while not self.page_url_list.empty():
            page_url = self.page_url_list.get()
            resp = requests.get(page_url, headers=headers)
            dates = resp.json()  # parses the body into a dict; this works because the API returns a JSON string
            for date in dates['List']:
                image_urls = extract_images(date)
                # the wallpaper name is also percent-encoded; strip the "1:1" suffix
                name = parse.unquote(date['sProdName']).replace("1:1", "").strip()
                dirpath = os.path.join('images', name)
                os.makedirs(dirpath, exist_ok=True)  # create images/<name>/, including the parent directory
                for index, image_url in enumerate(image_urls):
                    self.image_queue.put({"image_url": image_url, "dirpath": os.path.join(dirpath, "%d.jpg" % (index + 1))})
class Consumer(threading.Thread):
    def __init__(self, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
    def run(self) -> None:
        while True:
            try:
                # exit once nothing new arrives for 10 seconds (i.e. the producers are done)
                image_obj = self.image_queue.get(timeout=10)
            except queue.Empty:
                break
            image_url = image_obj.get('image_url')
            dirpath = image_obj.get('dirpath')
            try:
                request.urlretrieve(image_url, dirpath)
                print(dirpath + " downloaded")
            except Exception:
                print(image_url, '\n', dirpath)
                print(dirpath + " download failed")
def extract_images(date):  # build the real (full-resolution) URL for each wallpaper image
    image_urls = []
    for x in range(1, 9):  # the JSON carries eight encoded fields: sProdImgNo_1 ... sProdImgNo_8
        # decode the URL, then rewrite the trailing /200 size suffix to /0 for the original image
        image_url = parse.unquote(date['sProdImgNo_%d' % x]).replace('/200', '/0')
        image_urls.append(image_url)
    return image_urls
def get_url():
    page_url_list = queue.Queue(21)
    image_queue = queue.Queue(10000)
    for x in range(21):  # 21 pages of wallpaper metadata
        page_url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1578971884017'.format(x)
        page_url_list.put(page_url)
    for x in range(3):   # 3 producers fetch and parse the JSON pages
        th = Producer(page_url_list, image_queue, name='Producer-%d' % x)
        th.start()
    for x in range(8):   # 8 consumers download the images
        th = Consumer(image_queue, name='Consumer-%d' % x)
        th.start()
if __name__ == '__main__':
get_url()
Result
Running the script creates an images/ directory with one sub-directory per wallpaper, each holding up to eight numbered JPEG files.
Source: CSDN
Author: 稳在前
Link: https://blog.csdn.net/qq_44767889/article/details/103969496