爬取安居客

冷暖自知 提交于 2019-12-08 22:21:29

爬取安居客上杭州萧山区的二手房小区列表。
对每个小区提取链接、小区名称、地址、房源信息和价格,并保存到 CSV 表格中。
代码如下:

# -*- coding: utf-8 -*-
import csv
import time

import requests

import io
import sys

from lxml import etree
# Re-wrap stdout's underlying byte buffer so that all printed text is encoded
# as GB18030. This runs at import time, before any function is called.
# NOTE(review): presumably intended for consoles using a Chinese code page;
# it requires sys.stdout to expose a .buffer attribute -- confirm for your
# environment.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

def anju(first_page=1, last_page=24, delay=12, timeout=30):
    """Download the Xiaoshan community listing pages and hand each to pares().

    Pages ``first_page`` .. ``last_page`` (inclusive) of
    ``https://hangzhou.anjuke.com/community/xiaoshan/p<N>/`` are fetched one
    at a time. The defaults reproduce the original hard-coded behaviour
    (pages 1-24, 12 seconds between requests).

    Args:
        first_page: First page number to fetch.
        last_page: Last page number to fetch (inclusive). If it is smaller
            than ``first_page`` nothing is requested.
        delay: Seconds to wait between two consecutive requests.
        timeout: Per-request timeout in seconds, so that a stalled
            connection cannot hang the crawl forever.

    A page that fails (network error or HTTP error status) is reported on
    stdout and skipped; the crawl then continues with the next page.
    """
    # Browser-like request headers. The cookie and user-agent values are
    # placeholders that must be replaced with real values before running.
    #
    # The HTTP/2 pseudo-header names (authority/method/path/scheme) that a
    # browser's dev tools display are intentionally not sent: they are not
    # real request headers, and the copied "path" did not match the pages
    # requested below.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'br' is not advertised: requests can only decode Brotli bodies when
        # an optional Brotli package is installed; without it res.text would
        # be undecoded binary.
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': '自己cookie',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': '本地请求头',
    }

    for page in range(first_page, last_page + 1):
        page_url = f'https://hangzhou.anjuke.com/community/xiaoshan/p{page}/'
        try:
            res = requests.get(page_url, headers=headers, timeout=timeout)
            # Do not feed an error page (403/404/5xx) to the parser.
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f'page {page} failed: {exc}')
        else:
            pares(res.text)
        # Pause between requests; no need to wait after the final page.
        if page != last_page:
            time.sleep(delay)
def pares(html, csv_path="杭州萧山二手房.csv"):
    """Parse one listing page and append its communities to a CSV file.

    For each listing entry the link, community name, address, listing
    information and price are extracted, written as one CSV row and echoed
    to stdout.

    Args:
        html: HTML text of one listing page. Empty or whitespace-only input
            is ignored and the CSV file is not touched.
        csv_path: Output file, opened in append mode. The header row is
            written only when the file is empty, so calling this once per
            page yields a single header.

    The file is written as UTF-8 with a BOM and with ``newline=''`` as the
    csv module requires.
    NOTE(review): a file produced earlier with a different encoding should
    be removed first, otherwise the appended rows will not match it.
    """
    if not html or not html.strip():
        return
    root = etree.HTML(html)
    # NOTE(review): guard in case the parser yields no document -- confirm
    # against the installed lxml version.
    if root is None:
        return

    # Entries are the <div> children of #list-content. As in the original
    # loop over div[2]..div[30], the first child is skipped and at most 29
    # entries are read; slots that do not exist simply produce no row.
    entries = root.xpath('//*[@id="list-content"]/div')[1:30]

    def text_of(node, path):
        # Join all matched text/attribute nodes into one trimmed string.
        return ''.join(node.xpath(path)).strip()

    with open(csv_path, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        # In append mode the position starts at the end of the file, so 0
        # means the file is new or empty and still needs its header.
        if f.tell() == 0:
            writer.writerow(['链接', '小区', '地址', '房源信息', '价格'])

        for entry in entries:
            url = text_of(entry, 'div[1]/h3/a/@href')
            name = text_of(entry, 'div[1]/h3/a/text()')
            # An entry with neither link nor name is not a listing.
            if not (url or name):
                continue
            site = text_of(entry, 'div[1]/address/text()')
            house = text_of(entry, 'div[1]/p[2]/span/text()')
            price = text_of(entry, 'div[2]/p[1]/strong/text()')
            # Only add the unit when a price was actually found.
            if price:
                price = price + '元/每平米'

            row = [url, name, site, house, price]
            writer.writerow(row)
            print(*row)
# Script entry point: start the crawl only when executed directly.
if __name__ == '__main__':
    anju()

效果如图(原文此处为运行结果截图,图片未随文本保留)。

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!