爬取汽车之家北京二手车信息
经测试,该网站:https://www.che168.com/beijing/list/ 反爬机制较低,仅需要伪造请求头、设置爬取速率即可,但是100页之后需要登录,登录之后再爬要慎重,一不小心就会永久封号。爬取的数据可以保存为多种格式,下面展示保存到mysql数据库中:
代码解析:
程序源码自提Github:https://github.com/H-Ang/carsSpider
爬虫主程序
# 汽车之家爬虫,北京二手车
import requests
from lxml import etree
from data_save import *
import time
class Car_second():
    """Plain record for one used-car listing scraped from che168.

    Fields are filled in by the scraper after construction, so every
    argument defaults to an empty string and `Car_second()` with no
    arguments keeps working for existing callers.
    """

    def __init__(self, name='', gonglishu='', brought_year='',
                 location='', img_url='', price=''):
        self.name = name                  # listing title
        self.gonglishu = gonglishu        # mileage text (pinyin for 公里数)
        self.brought_year = brought_year  # registration year text
        self.location = location          # city/region text
        self.img_url = img_url            # thumbnail image URL
        self.price = price                # price text, unit included

    def __repr__(self):
        return "Car_second(name=%r, price=%r)" % (self.name, self.price)
def getInfors(url, i):
    """Fetch one listing page and parse it into Car_second records.

    Args:
        url: full URL of one listing page.
        i: 1-based page number, used only for the progress message.

    Returns:
        list of Car_second parsed from the page (ad entries excluded).

    Raises:
        requests.RequestException: on network failure or HTTP error status.
    """
    print("Page %d is saving." % i)
    # Forged browser headers: the site rejects the default requests UA.
    headers = {
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Referer": "https://www.che168.com/beijing/list/",
    }
    # timeout so a stalled connection cannot hang the whole crawl;
    # raise_for_status so we fail fast instead of parsing an error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    ob_xml = etree.HTML(response.text)
    # <li class="adv-img"> entries are injected ads — exclude them.
    infos = ob_xml.xpath('//*[@id="viewlist_ul"]//li[not(contains(@class,"adv-img"))]/a')
    secondCars = []
    for info in infos:
        try:
            # Lazily-loaded images keep the real URL in @src2; fall back to @src.
            src2 = info.xpath('.//img/@src2')
            img = src2[0] if src2 else info.xpath('.//img/@src')[0]
            name = info.xpath('.//h4/text()')[0]
            price = (info.xpath('.//span[@class="price"]/text()')[0]
                     + info.xpath('.//em/text()')[0])
            # Detail line looks like "mileage/year/location" joined by '/'.
            gonglishu, brought_year, location = \
                info.xpath('.//p/text()')[0].split('/')[:3]
        except (IndexError, ValueError):
            # Malformed listing markup: skip this entry rather than
            # aborting the whole page on a bare [0] lookup.
            continue
        secondCar = Car_second()
        secondCar.name = name
        secondCar.img_url = img
        secondCar.brought_year = brought_year
        secondCar.location = location
        secondCar.gonglishu = gonglishu
        secondCar.price = price
        secondCars.append(secondCar)
    return secondCars
if __name__ == '__main__':
    # Listing URL template: the page number is embedded in the path segment.
    url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1ltocsp{}exx0/'
    # Pages beyond 100 require login (risk of a permanent ban) — stop at 100.
    for page in range(1, 101):
        cars = getInfors(url.format(page), page)
        # Throttle requests to stay under the site's rate limit.
        time.sleep(0.95)
        # savdFile(cars)  # alternative: dump to a text file instead
        saveMysql(cars)
保存数据
def savdFile(datas):
    """Append scraped Car_second records to a plain-text file.

    (Name kept as-is — existing callers reference `savdFile`.)

    Each record is written as
        name/mileage/year/location
        price图片地址:<image url>
    followed by a blank line.
    """
    # Raw string: the Windows path contains backslashes that must not be
    # interpreted as escape sequences.
    path = r'J:\DATAs\北京市二手车(汽车之家)\data.txt'
    with open(path, 'a+', encoding='utf-8') as f:
        for data in datas:
            writeCont = (
                data.name + "/" + data.gonglishu + "/" + data.brought_year
                + "/" + data.location + "\n" + data.price
                + "图片地址:" + data.img_url
            )
            f.write(writeCont + '\n\n')
    print('保存完成。')
# 将数据保存到数据库中 (persist scraped data to the database)
# SQLAlchemy ORM layer backed by MySQL (via pymysql).
from sqlalchemy import Column,create_engine,Integer,String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
# Shared declarative base; every ORM model in this module inherits from it.
Base = declarative_base()
class Car(Base):
    """ORM model for one used-car listing row.

    The backing table is auto-created by saveMysql via
    Base.metadata.create_all. All scraped fields are stored as raw
    strings exactly as parsed from the page.
    """
    __tablename__ = "second_cars"
    # Surrogate auto-increment primary key.
    id = Column(Integer,primary_key=True,autoincrement=True,nullable=False)
    # Listing title.
    carName = Column(String(100))
    # Mileage text (pinyin for 公里数), e.g. "3.5万公里".
    gonglishu = Column(String(20))
    # Registration-year text.
    brought_year = Column(String(10))
    # City/region text.
    location = Column(String(10))
    # Thumbnail image URL.
    image_url = Column(String(200))
    # Price text including unit, e.g. "20万".
    price = Column(String(10))
def saveMysql(datas):
    """Persist Car_second records into the `second_cars` MySQL table.

    Creates the table if it does not exist, inserts all records in a
    single transaction, and always closes the session — the original
    leaked the session and left the transaction open if any insert or
    the commit raised.
    """
    connect = create_engine("mysql+pymysql://root:root@127.0.0.1:3306/second_cars",
                            encoding='utf-8',
                            echo=True)  # echo=True logs every SQL statement
    Base.metadata.create_all(connect)
    session = sessionmaker(bind=connect)()
    try:
        session.add_all(
            Car(
                carName=data.name,
                gonglishu=data.gonglishu,
                brought_year=data.brought_year,
                price=data.price,
                location=data.location,
                image_url=data.img_url,
            )
            for data in datas
        )
        session.commit()
    except Exception:
        # Roll back so we never leave a half-applied batch behind.
        session.rollback()
        raise
    finally:
        session.close()
反思
保存到mysql数据库时,创建新对象并传参有点复杂,我曾经记得有种很简单明了的方法,现在怎么也想不起来,望指教哈。
来源:CSDN
作者:Code_st
链接:https://blog.csdn.net/qq_42776455/article/details/84872544