爬虫的相关操作
1、爬文本内容
# Fetch the joke site's front page and print the text of every <p> element.
# NOTE: the original snippet opened with a GBK source-encoding cookie. Python
# only honours that cookie on the first two lines of a file, so it is omitted.
import re

import requests

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get('http://duanziwang.com/', timeout=10)
data = response.text  # whole page body, decoded to str
# Non-greedy capture of whatever sits between each <p> ... </p> pair.
# '.' does not match newlines here, so multi-line paragraphs are not captured.
res = re.findall('<p>(.*?)</p>', data)
print(res)
# Scrape ishuo.cn: for every <li class="list_li"> entry collect its title,
# body text and date line, then print them as a left-aligned table.
import re

import requests

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get('http://ishuo.cn/', timeout=10)
data = response.text
items = re.findall('<li class="list_li">(.*?)</li>', data)

# title -> (content, description). Deliberately not named `dict`, which would
# shadow the builtin for the rest of the module.
entries = {}
for item in items:
    content = re.findall('<div class="content">(.*?)</div>', item)
    title = re.findall('<a href="/subject/.*?">(.*?)</a>', item)
    # The pattern only matches description lines that start with '04月'.
    desc = re.findall('</a>(04月.*?)</div>', item)
    # Skip entries missing any of the three fields instead of letting [0]
    # raise IndexError and abort the whole run.
    if not (content and title and desc):
        continue
    entries[title[0]] = (content[0], desc[0])

for title, details in entries.items():
    # Title padded to 20 columns; details is printed as the (content, desc) tuple.
    print(f'{title:<20} | {details}')
2、爬图片内容
# Download every image referenced by a data-src attribute on the listing page
# and save each one in the current directory under its original file name.
import re

import requests

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(
    'http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg',
    timeout=10,
)
data = response.text
img_urls = re.findall('data-src="(.*?)"', data)

for img_url in img_urls:
    # The last path segment of the URL becomes the local file name.
    img_name = img_url.split('/')[-1]
    if not img_name:
        # URL ends with '/', so there is nothing usable as a file name.
        continue
    img_response = requests.get(img_url, timeout=10)
    img_data = img_response.content  # raw bytes of the image
    # `with` closes the file after each image; the original leaked one open
    # handle per download.
    with open(img_name, 'wb') as f:
        f.write(img_data)
3、爬视频内容
# Walk the video index page, open each linked .htm page, and download the MP4
# that page references after a "//Video " marker.
import re

import requests

# Without a timeout, requests waits indefinitely on an unresponsive server.
index_response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
index_html = index_response.text
hrefs = re.findall('<a href="(.*?)">', index_html)

for href in hrefs:
    page_paths = re.findall('(.*?htm)', href)
    if not page_paths:
        # Link does not contain an .htm path; the original raised IndexError here.
        continue
    page_url = 'http://www.mod.gov.cn/v/' + page_paths[0]
    page_html = requests.get(page_url, timeout=10).text
    # Example of a matched URL:
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    video_urls = re.findall('//Video (.*?.mp4)', page_html)
    if not video_urls:
        # Page has no "//Video ...mp4" marker; nothing to download.
        continue
    mp4_data = requests.get(video_urls[0], timeout=10).content
    # NOTE: every video is written to the same path, so each download
    # overwrites the previous one.
    with open('test.mp4', 'wb') as f:
        f.write(mp4_data)
    # break
来源:https://www.cnblogs.com/lzss/p/11227768.html