爬取汽车之家北京二手车信息

我的未来我决定 提交于 2019-12-08 22:46:17

爬取汽车之家北京二手车信息

经测试,该网站:https://www.che168.com/beijing/list/ 反爬机制较低,仅需要伪造请求头设置爬取速率,但是100页之后需要登录,登录之后再爬要慎重,一不小心就会永久封号。爬取的数据以各种类型存放,下面展示保存到mysql数据库中:
在这里插入图片描述

代码解析:

程序源码自提Github:https://github.com/H-Ang/carsSpider

爬虫主程序

# 汽车之家爬虫,北京二手车
import requests
from lxml import etree
from data_save import *
import time

class Car_second:
    """Plain record for one used-car listing scraped from che168.com.

    All fields are kept as the display strings exactly as scraped;
    no numeric parsing is done here.
    """

    def __init__(self, name='', gonglishu='', brought_year='', location='',
                 img_url='', price=''):
        # Instance attributes instead of the original shared class attributes:
        # class-level fields would be visible on every instance until shadowed.
        self.name = name                  # listing title
        self.gonglishu = gonglishu        # mileage text (公里数), e.g. "3.2万公里"
        self.brought_year = brought_year  # purchase-year text
        self.location = location          # city/district text
        self.img_url = img_url            # thumbnail image URL
        self.price = price                # price text, e.g. "12.5万"

def getInfors(url, i):
    """Fetch one listing page and parse it into Car_second records.

    Parameters:
        url: fully formatted listing-page URL for page *i*.
        i:   1-based page number, used only for progress output.
    Returns:
        list of Car_second, one per non-advert listing on the page.
    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    print("Page %d is saving." % i)
    # Forged browser headers: the site rejects obvious non-browser clients.
    headers = {
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Referer": "https://www.che168.com/beijing/list/",
    }

    response = requests.get(url=url, headers=headers)
    # Fail fast on an error status instead of silently parsing an error page.
    response.raise_for_status()

    ob_xml = etree.HTML(response.text)

    # Every listing card is an <a> under the list <ul>; skip injected advert
    # cards, whose class contains "adv-img".
    infos = ob_xml.xpath('//*[@id="viewlist_ul"]//li[not(contains(@class,"adv-img"))]/a')

    def _first(node, path, default=''):
        # xpath() returns a list; take the first hit or a default so a
        # listing missing one field no longer raises IndexError.
        found = node.xpath(path)
        return found[0] if found else default

    secondCars = []
    for info in infos:
        car = Car_second()

        # Lazy-loaded images keep the real URL in @src2; fall back to @src.
        car.img_url = _first(info, './/img/@src2') or _first(info, './/img/@src')

        car.name = _first(info, './/h4/text()')
        car.price = (_first(info, './/span[@class="price"]/text()')
                     + _first(info, './/em/text()'))

        # Mileage / purchase year / location are packed into one
        # slash-separated <p> text node.
        parts = _first(info, './/p/text()').split('/')
        parts += [''] * (3 - len(parts))  # tolerate listings missing a field
        car.gonglishu, car.brought_year, car.location = parts[:3]

        secondCars.append(car)

    return secondCars

if __name__ == '__main__':
    # The {} slot in the path slug is the page number; pages beyond 100
    # require a logged-in account, so stop at 100.
    url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1ltocsp{}exx0/'
    page = 1
    while page <= 100:
        cars = getInfors(url.format(page), page)
        time.sleep(0.95)  # throttle between pages to stay under the radar
        saveMysql(cars)   # persist this page's records to MySQL
        page += 1

保存数据

def savdFile(datas, path=r'J:\DATAs\北京市二手车(汽车之家)\data.txt'):
    """Append scraped car records to a plain-text file.

    Parameters:
        datas: iterable of objects exposing name / gonglishu / brought_year /
               location / img_url / price string attributes (see Car_second).
        path:  target text file. New keyword parameter; defaults to the
               original hard-coded location (now a raw string, so the
               backslashes can never be mis-read as escapes).

    Each record is written as two lines —
    "name/mileage/year/location" then "price图片地址:url" — followed by a
    blank separator line.
    """
    # 'a+' so successive pages accumulate instead of overwriting earlier ones.
    with open(path, 'a+', encoding='utf-8') as f:
        for data in datas:
            record = (data.name + "/" + data.gonglishu + "/"
                      + data.brought_year + "/" + data.location + "\n"
                      + data.price + "图片地址:" + data.img_url)
            f.write(record + '\n\n')
    print('保存完成。')

# 将数据保存到数据库中
from  sqlalchemy import Column,create_engine,Integer,String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Shared declarative base: saveMysql() calls Base.metadata.create_all() to
# create the table on first use.
Base = declarative_base()

# ORM model for one scraped listing — one row per car in table `second_cars`.
# All scraped values are stored as short display strings, not parsed numbers.
class Car(Base):
    __tablename__ = "second_cars"
    # Surrogate primary key generated by the database.
    id = Column(Integer,primary_key=True,autoincrement=True,nullable=False)
    carName = Column(String(100))       # listing title
    gonglishu = Column(String(20))      # mileage text (公里数), e.g. "3.2万公里"
    brought_year = Column(String(10))   # purchase-year text
    location = Column(String(10))       # city/district text
    image_url = Column(String(200))     # thumbnail URL
    price = Column(String(10))          # price text, e.g. "12.5万"

def saveMysql(datas):
    """Persist scraped car records into the `second_cars` MySQL table.

    Parameters:
        datas: iterable of objects exposing name / gonglishu / brought_year /
               location / img_url / price attributes (see Car_second).
    Raises:
        Propagates any database error after rolling the session back.
    """
    connect = create_engine("mysql+pymysql://root:root@127.0.0.1:3306/second_cars",
                            encoding='utf-8',
                            echo=True)

    # Idempotent: creates the table on first run, no-op afterwards.
    Base.metadata.create_all(connect)

    DBsession = sessionmaker(bind=connect)
    session = DBsession()

    try:
        # Bug fix: the original committed AND closed the session inside the
        # loop, so every record after the first was added to a closed
        # session. Stage all rows, then commit once.
        session.add_all(
            Car(
                carName=data.name,
                gonglishu=data.gonglishu,
                brought_year=data.brought_year,
                price=data.price,
                location=data.location,
                image_url=data.img_url,
            )
            for data in datas
        )
        session.commit()
    except Exception:
        # Leave no half-written page behind, then surface the error.
        session.rollback()
        raise
    finally:
        session.close()

反思

保存到mysql数据库时,创建新对象并传参有点复杂,我曾经记得有种很简单明了的方法,现在怎么也想不起来,望指教哈。

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!