1. 目标网址
http://www.xiachufang.com/
2. 源码参考
# @Time : 2020/3/5 16:04
# @Author : GKL
# FileName : spider.py
# Software : PyCharm
import os
import re

import requests
class Spider(object):
    """Download the images found on the xiachufang.com home page.

    The home page HTML is scanned for ``src``/``data-src`` attributes that
    are immediately followed by an ``alt`` attribute. Every match whose URL
    starts with ``http://i5`` is fetched and written to
    ``<save_dir>/<alt text>.jpg``.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    timeout = 10

    # Directory that receives the downloaded images; created on demand.
    save_dir = 'images'

    # Characters stripped from a title before it is used as a file name:
    # whitespace, the characters Windows forbids in file names
    # (\ / : * ? " < > |) and the emoji the original pattern removed.
    _UNSAFE_TITLE_CHARS = re.compile(r'[\\/:*?"<>|\s😊]')

    def __init__(self):
        # Page that is scanned for images.
        self.url = 'http://www.xiachufang.com/'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    @classmethod
    def _clean_title(cls, title):
        """Return *title* with characters that are unsafe in file names removed."""
        return cls._UNSAFE_TITLE_CHARS.sub('', title)

    @staticmethod
    def _find_images(html):
        """Return the unique ``(url, title)`` pairs found in *html*, in page order.

        ``src="..."`` also matches inside ``data-src="..."``, so the two
        patterns overlap; duplicates are dropped so that no image is
        downloaded twice.
        """
        candidates = re.findall(r'src="(.*?)" alt="(.*?)"', html)
        candidates.extend(re.findall(r'data-src="(.*?)" alt="(.*?)"', html))

        seen = set()
        unique = []
        for pair in candidates:
            if pair not in seen:
                seen.add(pair)
                unique.append(pair)
        return unique

    def _download(self, url, save_path):
        """Fetch *url* and write its bytes to *save_path*.

        Returns True on success. A failed request is reported and skipped
        (returns False) so that one bad image does not abort the whole run.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException as exc:
            print('skipped {}: {}'.format(url, exc))
            return False

        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True

    def get_data(self):
        """Fetch the home page and download every image it references.

        Prints the path of each file that was saved.

        :return: None
        :raises requests.RequestException: if the home page itself cannot
            be fetched or answers with an HTTP error status.
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()

        # open() does not create missing directories.
        os.makedirs(self.save_dir, exist_ok=True)

        for pic, title in self._find_images(response.text):
            # Only URLs starting with http://i5 are treated as image
            # addresses (per the original author's analysis of the page).
            if not pic.startswith('http://i5'):
                continue

            # Everything after '@' is dropped to request the large image.
            url = pic.split('@')[0]
            save_path = self.save_dir + '/' + self._clean_title(title) + '.jpg'

            if self._download(url, save_path):
                print(save_path)
if __name__ == '__main__':
    # Script entry point: build the spider and run one crawl of the home page.
    crawler = Spider()
    crawler.get_data()
来源:CSDN
作者:玩爬虫的小朋友
链接:https://blog.csdn.net/gklcsdn/article/details/104678284