Scraping Lianjia Real Estate Listings
What is a crawler? A crawler is simply a program that mimics a browser: it sends requests to a server, then receives and processes the responses.
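To make that concrete, here is a minimal sketch of the request/response cycle with the requests library (the truncated User-Agent is a placeholder; the real headers appear in the code below):

import requests

# Pretend to be a browser by sending a User-Agent header.
header = {"User-Agent": "Mozilla/5.0"}
response = requests.get("https://cd.lianjia.com/ershoufang/", headers=header)
print(response.status_code)             # 200 means the server answered normally
print(response.content.decode()[:200])  # first 200 characters of the returned HTML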
To scrape Lianjia, you first need to understand Lianjia's URLs.
https://cd.lianjia.com/ershoufang/rs%E9%83%BD%E6%B1%9F%E5%A0%B0/
The URL above is page 1 (note that it has no pg1 segment).
https://cd.lianjia.com/ershoufang/pg2rs%E9%83%BD%E6%B1%9F%E5%A0%B0/
This is page 2.
https://cd.lianjia.com/ershoufang/pg3rs%E9%83%BD%E6%B1%9F%E5%A0%B0/
This is page 3.
You have probably spotted the pattern: the number after pg in the URL is the page number.
That means we can crawl every page in turn.
We simply leave the part after pg as a placeholder and fill it in with .format, like this:
def __init__(self):
    self.temp_url = "https://cd.lianjia.com/ershoufang/pg{}rs%E9%83%BD%E6%B1%9F%E5%A0%B0/"
    self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
def make_url(self):
    start_url = [self.temp_url.format(i) for i in range(1, 9)]  # list comprehension
    return start_url
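A quick standalone sanity check (my own snippet, reusing the same template) shows what the comprehension produces:

temp_url = "https://cd.lianjia.com/ershoufang/pg{}rs%E9%83%BD%E6%B1%9F%E5%A0%B0/"
start_url = [temp_url.format(i) for i in range(1, 9)]
print(len(start_url))  # 8 page URLs
print(start_url[0])    # ...ershoufang/pg1rs%E9%83%BD%E6%B1%9F%E5%A0%B0/
print(start_url[-1])   # ...ershoufang/pg8rs%E9%83%BD%E6%B1%9F%E5%A0%B0/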
With the URLs taken care of, the next step is sending the requests.
I put all the code in a class called dffc and define each step as a method on it.
class dffc:
Besides the make_url method shown above, the class has a send_requests method:
def send_requests(self, url):
    response = requests.get(url, headers=self.header)
    json_str = response.content.decode()  # despite the name, this is the page's HTML text
    return json_str
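Requests can time out or come back with an error status, so a slightly more defensive variant (my own sketch, not part of the original class) would look like this:

def send_requests_safe(self, url):
    # Hypothetical hardened version: abort on HTTP errors and hangs.
    response = requests.get(url, headers=self.header, timeout=10)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    return response.content.decode()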
Once the response is back, the next step is processing the data; I use lxml to extract the fields:
def draw_data(self, json_str):  # extract the data
    global img_num, house_num, add_unit_price
    element = etree.HTML(json_str)
    page_li = element.xpath("//div[@class='leftContent']/ul//li")
    lit = []
    for li in page_li:
        dit = {}
        dit['img_src'] = li.xpath(".//img[@class='lj-lazy']/@data-original")[0]
        if len(dit['img_src']) > 1:
            headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Mobile Safari/537.36"}
            response = requests.get(url=dit['img_src'], headers=headers)
            img_num += 1
            with open('E:\\tangyuantao\\爬虫\\爬虫代码\\lianjia_imgs\\name' + str(img_num) + '.jpg', "wb") as b:
                b.write(response.content)
            print("Image %d saved!" % img_num)
        else:
            print("no img")
        dit['house_url'] = li.xpath(".//div[@class='title']/a/@href")[0]
        dit['title'] = li.xpath(".//div[@class='title']/a/text()")[0]
        dit['adress'] = li.xpath(".//div[@class='houseInfo']//text()")
        dit['adress'] = dit['adress'][0] + dit['adress'][1]
        dit['flood'] = li.xpath(".//div[@class='flood']/div/text()")[0].strip("- ")
        dit['totalPrice'] = li.xpath(".//div[@class='totalPrice']//text()")
        dit['totalPrice'] = dit['totalPrice'][0] + dit['totalPrice'][1]
        dit['unitPrice'] = li.xpath(".//div[@class='unitPrice']/span/text()")[0]
        dit['unitPrice'] = re.findall(r"\d+", dit['unitPrice'])[0]
        house_num += 1
        add_unit_price += int(dit['unitPrice'])
        lit.append(dit)
    return lit
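If XPath is unfamiliar, this tiny self-contained example (made-up HTML, but the same etree calls as above) shows how the extraction works:

from lxml import etree

html = "<div class='title'><a href='/house/1'>Nice flat</a></div>"
element = etree.HTML(html)  # parse the string into an element tree
print(element.xpath("//div[@class='title']/a/@href")[0])   # /house/1
print(element.xpath("//div[@class='title']/a/text()")[0])  # Nice flat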
Then the data is saved:

def save_data(self, lit):
    for unit_house in lit:
        unit_house_str = json.dumps(unit_house, ensure_ascii=False)
        with open("lianjia.txt", "a", encoding="utf-8") as f:
            f.write(unit_house_str)
            f.write("\n")
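Because every line of lianjia.txt is a single JSON object, reading the records back later is just one json.loads per line (a hypothetical snippet, not part of the original code):

import json

with open("lianjia.txt", encoding="utf-8") as f:
    houses = [json.loads(line) for line in f if line.strip()]
print("%d records loaded" % len(houses))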
The complete code:
import re
import requests
from lxml import etree
import json

img_num = 0
page = 1
add_unit_price = 0
house_num = 0


class dffc:
    def __init__(self):
        self.temp_url = "https://cd.lianjia.com/ershoufang/pg{}rs%E9%83%BD%E6%B1%9F%E5%A0%B0/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

    def make_url(self):
        start_url = [self.temp_url.format(i) for i in range(1, 9)]  # list comprehension
        return start_url

    def draw_data(self, json_str):  # extract the data
        global img_num, house_num, add_unit_price
        element = etree.HTML(json_str)
        page_li = element.xpath("//div[@class='leftContent']/ul//li")
        lit = []
        for li in page_li:
            dit = {}
            dit['img_src'] = li.xpath(".//img[@class='lj-lazy']/@data-original")[0]
            if len(dit['img_src']) > 1:
                headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Mobile Safari/537.36"}
                response = requests.get(url=dit['img_src'], headers=headers)
                img_num += 1
                with open('E:\\tangyuantao\\爬虫\\爬虫代码\\lianjia_imgs\\name' + str(img_num) + '.jpg', "wb") as b:
                    b.write(response.content)
                print("Image %d saved!" % img_num)
            else:
                print("no img")
            dit['house_url'] = li.xpath(".//div[@class='title']/a/@href")[0]
            dit['title'] = li.xpath(".//div[@class='title']/a/text()")[0]
            dit['adress'] = li.xpath(".//div[@class='houseInfo']//text()")
            dit['adress'] = dit['adress'][0] + dit['adress'][1]
            dit['flood'] = li.xpath(".//div[@class='flood']/div/text()")[0].strip("- ")
            dit['totalPrice'] = li.xpath(".//div[@class='totalPrice']//text()")
            dit['totalPrice'] = dit['totalPrice'][0] + dit['totalPrice'][1]
            dit['unitPrice'] = li.xpath(".//div[@class='unitPrice']/span/text()")[0]
            dit['unitPrice'] = re.findall(r"\d+", dit['unitPrice'])[0]
            house_num += 1
            add_unit_price += int(dit['unitPrice'])
            lit.append(dit)
        return lit

    def send_requests(self, url):
        response = requests.get(url, headers=self.header)
        json_str = response.content.decode()  # despite the name, this is the page's HTML text
        return json_str

    def unit_price(self):
        print("The average unit price in this area is %f yuan per square meter" % (add_unit_price / house_num))
        print("A total of %d second-hand homes were counted" % house_num)

    def save_data(self, lit):
        for unit_house in lit:
            unit_house_str = json.dumps(unit_house, ensure_ascii=False)
            with open("lianjia.txt", "a", encoding="utf-8") as f:
                f.write(unit_house_str)
                f.write("\n")

    def run(self):
        start_url = self.make_url()  # build the URLs
        for url in start_url:
            global page
            print("*" * 5 + "Page %d" % page + "*" * 5)
            page += 1
            json_str = self.send_requests(url)  # send the request, receive the data
            lit = self.draw_data(json_str)  # extract the data
            self.save_data(lit)  # save the data
        self.unit_price()


Pc_test = dffc()
Pc_test.run()
If you copy-paste this code and run it as-is, it will throw an error. First install the libraries imported at the top, then change the path where the images are saved, i.e. this line:
with open('E:\\tangyuantao\\爬虫\\爬虫代码\\lianjia_imgs\\name' + str(img_num) + '.jpg', "wb") as b:
    b.write(response.content)
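A more portable alternative (my own suggestion, not from the original post) is to build the path relative to the working directory and create the folder on first use; img_num and response come from the surrounding loop:

import os

save_dir = "lianjia_imgs"             # relative folder instead of a hard-coded drive
os.makedirs(save_dir, exist_ok=True)  # create it if it does not exist yet
save_path = os.path.join(save_dir, "name" + str(img_num) + ".jpg")
with open(save_path, "wb") as b:
    b.write(response.content)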