链接二手房数据可视化

梦想与她 提交于 2019-12-08 21:26:05

使用python3.7对链家网中广州二手房的交易数据进行爬取,并使用python-highcharts对爬取到的数据进行可视化分析。

首先,配置需要的环境:

打开终端cmd,进入pip所在的目录,安装python-highcharts库:
pip install python-highcharts

对链家网进行数据爬虫,得到json格式的数据:

LianJia_by_json.py

from selenium import webdriver
from bs4 import BeautifulSoup
import re
import xlwt
import urllib
import json
import os
import time
class spider:
    def __init__(self,driver):
        self.driver = driver
        self.chengjiao_info_dict = {}
        self.href_list = []
        self.classification_dict = {}
        # self.data_json_file = 'LianJia_'+time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())+'.json'
        # print(self.data_json_file)
        # exit(0)
    def spider(self):
        ele_ershoufang = self.driver.find_element_by_link_text('二手房')
        ele_ershoufang.click()
        handles = self.driver.window_handles
        self.driver.switch_to.window(handles[-1])
        self.driver.find_element_by_link_text('成交').click()
        # 获取总共的页数
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        # page_info_list = soup.find("div",class_="page-box house-lst-page-box").find_all("a")
        page_info = soup.select("div.page-box.house-lst-page-box")[0].get("page-data")  # 获取到一个字符串
        total_page = (page_info.split(':')[1]).split(',')[0]
        # print(total_page)
        self.get_chengjiao_house_href(int(total_page))
        self.get_chengjiao_base_info()
    # 获取一页中所有成交房源的信息
    def get_chengjiao_house_href(self,total_page):
        # 循环获得每一页中成交房源的连接
        for page in range(1,total_page+1):
            url = 'https://gz.lianjia.com/chengjiao/pg'+str(page)
            response = urllib.request.urlopen(url)
            html = response.read().decode("utf-8")
            # print(html)
            soup = BeautifulSoup(html, 'lxml')
            href_info_list = soup.find("ul", class_="listContent").find_all("li")
            # 在多次请求网页信息过程中,有时会出现返回的响应中房源信息为0条的情况,所以需要多次重复请求
            while len(href_info_list)==0:
                response = urllib.request.urlopen(url)
                html = response.read().decode("utf-8")
                # print(html)
                soup = BeautifulSoup(html, 'lxml')
                href_info_list = soup.find("ul", class_="listContent").find_all("li")
                print(len(href_info_list))
            for li in href_info_list:
                # print(li)
                pattern = re.compile('<li><a.*?href=(.*?)rel="nofollow".*?</li>')
                m = pattern.findall(str(li))[0]
                href = m.split('"')[1]
                self.href_list.append(href)
                # break
            print('第',page,'页:链接列表长度',len(self.href_list))
            # sleep(2)
            # break
    def get_chengjiao_base_info(self):
        # 循环操作,直到得到所有房源的信息
        columns = 1
        self.chengjiao_info_dict = {}
        for url in self.href_list:
            info_dict = {}
            # 获取到当前房源的信息
            response = urllib.request.urlopen(url)
            html = response.read().decode("utf-8")
            soup = BeautifulSoup(html,'lxml')
            # 得到小区名
            housing_name_info = (soup.find_all('div',class_='wrapper')[0]).get_text()
            housing_name = housing_name_info.split(' ')[0]
            # print(housing_name)
            info_dict['小区名']=housing_name
            price = (((soup.find_all('div',class_='info fr')[0]).find_all('div',class_='price')[0]).find_all('b')[0]).get_text()
            print(price)
            info_dict['单价:元/平'] = price
            info_dict['总价:万'] = ((soup.find_all('span',class_='dealTotalPrice')[0]).find_all('i')[0]).get_text()
            info_dict['成交日期'] = ((soup.find_all('div',class_='wrapper')[0]).find_all('span')[0]).get_text().split(' ')[0]
            # print(info_dict['成/交日期'])
            agent = (soup.find_all('div',class_="myAgent")[0]).find_all('a')
            info_dict['所属区域'] = (agent[0]).get_text()+(agent[1]).get_text()
            print(info_dict['所属区域'])
            # self.key_list.append('小区名')
            # self.chengjiao_info_dict['房源链接'] = url
            # self.key_list.append('小区名')

            base_info_list = soup.find('div',class_='content').find('ul').find_all('li')
            # base_info_list = soup.fina('div',class_='content')
            for info in base_info_list:
                # print(info)
                patten_title = re.compile('<li>.*?"label">(.*?)</span>.*?</li>', re.S)
                pattern_content = re.compile('<li>.*?</span>(.*?)</li>',re.S)
                # 在sublime中直接传入info没有报错,而在这里却需要转为字符串,否则报错?????
                key = (patten_title.findall(str(info))[0]).strip()
                # self.key_list.append(key)
                value = (pattern_content.findall(str(info))[0]).strip()
                info_dict[key] = value
            # print(info_dict)
            self.chengjiao_info_dict[url] = info_dict
            # print(self.chengjiao_info_dict)

            # 显示当前进度
            print(len(self.href_list),':',columns)
            # sleep(2)
            self.write_house_info()
            # 开始分类统计
            # self.get_classification_dict()
            columns += 1
            # break
        # self.write_classification()
        # for key in self.info_dict.keys():
        #     self.key_list.append(key)
    def write_house_info(self):
        with open("LianJia_data_json_file",'w',encoding='utf-8') as fp:
            json.dump(self.chengjiao_info_dict,fp,ensure_ascii=False)

if __name__ == "__main__":
    url = 'https://gz.lianjia.com/'
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get(url)
    sp = spider(driver)
    sp.spider()

得到的json格式的数据LianJia_data_json_file.json的部分截图如下:

将获取到的json格式的数据转换为Excel格式,以便阅读:

LianJia_json_deal.py

import json
import xlwt

# 设置表头
def set_excel_title(LianJia_data_dict,LianJia_data_all):
    LianJia_data_all.write(0,0,'href')
    raw = 1
    for k in (list(LianJia_data_dict.values())[0]).keys():
        LianJia_data_all.write(0, raw, k)
        print (k)
        raw += 1

def set_excel_content(LianJia_data_dict,LianJia_data_all):
    column = 1
    for href in LianJia_data_dict:
        LianJia_data_all.write(column, 0, href)
        raw = 1
        for value in (LianJia_data_dict[href]).values():
            LianJia_data_all.write(column, raw, value)
            raw += 1
        column += 1

def get_data_excel():
    with open('LianJia_data_json_file.json', 'r', encoding='utf-8') as fp:
        LianJia_data_dict = json.load(fp)

    wb = xlwt.Workbook(encoding='ascii')
    LianJia_data_all = wb.add_sheet('LianJia_data_all')

    set_excel_title(LianJia_data_dict,LianJia_data_all)
    set_excel_content(LianJia_data_dict,LianJia_data_all)

    wb.save('LianJia_data.xls')
if __name__ =="__main__":
    get_data_excel()

得到的excel表格LinaJia_data.xls的部分截图如下:

将得到的json数据进行分类统计,为数据可视化做准备:

LianJia_select.py

import json
import time

with open('LianJia_2019_01_08_12_42_25.json', 'r', encoding='utf-8') as fp:
    LianJia_data_dict = json.load(fp)

select_dict={}
# sub_key为每个房源的每一项信息的关键子,如:户型结构、所属区域等
for sub_key in (list(LianJia_data_dict.values()))[0]:
    # print(sub_key)
    select_sub_dict = {}
    for href in LianJia_data_dict:
        # print(LianJia_data_dict[href][sub_key])
        if LianJia_data_dict[href][sub_key] in select_sub_dict:
            select_sub_dict[LianJia_data_dict[href][sub_key]] += 1
        else:
            select_sub_dict[LianJia_data_dict[href][sub_key]] = 1
    select_dict[sub_key] = select_sub_dict
    print(sub_key,':',select_dict[sub_key])
# print(select_dict)

with open('LianJia_select_data_file.json', 'w', encoding='utf-8') as fp:
    json.dump(select_dict,fp,ensure_ascii=False)

获得的分类后的json数据的分布截图取下:

所有的数据准备好后,就开始对数据进行可视化操作。

在python-highcharts库中,不同的可视化图形所需要的数据格式是不完全相同的,这里,针对自己所接触到的情况,总结如下:

饼状图pie:
    data=[
    	{
    		'name':data name,
    		'y':data values,
    		'sliced': True,			需要突出显示的部分
    		'selected': True
    	},{
    		'name':data name,
    		'y':data values
    	},
    	……
    ]
    没有x轴的配置。
		
折线图line/曲线图spline/柱形图column/面积图area:
	data = [1,2,3,4,5,6,7,8,9,0]
	categories = []        # 横坐标中每个数据的名称
	配置X轴坐标:
	options={
		'xAxis': {'type': categories},		#以获取到的categories列表中的元素作为x轴的坐标
	}
		
柱形图下钻column_drilldown:该图可以在每项大的分类中进行更进一步的分类统计
	第一层数据:        # 即大的分类的数据
		data = [{
			'name': "Chrome",
			'y': 24.030000000000005,        # 数据为该项的百分比
			'drilldown': "Chrome"
		}, 
		……,
		{
			'name': "Proprietary or Undetectable",
			'y': 0.2,
			'drilldown': None						#该项目不设置下钻
		}]
	第二层数据:        # 大分类项的分类统计的数据
		data_sub = [
			["v11.0", 24.13],
	    	["v8.0", 17.2],
			["v9.0", 8.11],
			["v10.0", 5.33],
			["v6.0", 1.06],
			["v7.0", 0.5]
		]
		配置X轴坐标:
			options={
				'xAxis': {'type': 'category'},			#这个是固定的
			}

开始进行数据可视化操作:

LianJia_hc.py

import json
from highcharts import Highchart
def get_data_dict(word):
    with open('LianJia_select_data_2019_01_08_15_47_59.json', 'r', encoding='utf-8') as fp:
        LianJia_select_data_dict = json.load(fp)

    data_dict = LianJia_select_data_dict[word]

    return data_dict

def get_chart(word):
    data_dict = get_data_dict(word)
    print(data_dict)
    data = list(data_dict.values())
    categories = list(data_dict.keys())
    print(data)
    print(len(data))
    print(categories)
    text = '链家二手'+word+'统计'
    options = {
        'title':{'text':text},
        'xAxis': {'categories': categories},
        'plotOptions': {
            'series': {
                'dataLabels': {
                    'enabled': True,    #显示出数据点的数值
                    'shadow': True,     #数据标签边框有阴影
                    'backgroundColor': 'rgba(252, 255, 197, 0.7)'      #设置数据点标签的背景色
                    # 'borderRadius': 10,  # 圆角,默认是0,lable是方的,这里10已经比较园了
                    # 'borderWidth': 20,    #这个是啥????
                    # 'padding': 5,          #这个也不晓得是啥
                    # 'style': {'fontWeight': 'bold'},

                }
            }
        }
    }
    H.set_dict_options(options)

    # 绘制面积图
    # H.add_data_set(data,'area')
    # H.save_file(word.split(':')[0] + '_area')
    # 绘制柱形图
    H.add_data_set(data, 'bar')
    H.save_file(word.split(':')[0]+'_bar')

    # 绘制折线图
    # H.add_data_set(data, 'line')
    # H.save_file(word.split(':')[0] + '_line')

    # 绘制曲线图
    # H.add_data_set(data, 'spline')
    # H.save_file(word.split(':')[0] + '_spline')

# def get_heatmap_chart():


if __name__ == "__main__":
    # H = Highchart(width=950, height=600)
    H = Highchart()
    # get_chart('建筑结构')
    get_chart('房屋户型')
    # get_chart('房屋朝向')
    # get_chart('成交日期')
    # get_chart('单价:元/平')
    # get_chart('所属区域')

在该程序总,可以绘制出数据的不同可视化图形,这里对房屋户型的折线图,曲线图,面积图及柱形图进行展示:

 

对二手房是否配备电梯进行饼状图可视化分析:

LianJia_hc_pie.py

import json
from highcharts import Highchart
def get_data_dict(word):
    with open('LianJia_select_data_2019_01_08_15_47_59.json', 'r', encoding='utf-8') as fp:
        LianJia_select_data_dict = json.load(fp)

    data_dict = LianJia_select_data_dict[word]

    return data_dict

def get_chart(word):
    data_dict = get_data_dict(word)
    # data = list(data_dict.values())
    # categories = list(data_dict.keys())
    data=[]
    for key in data_dict:
        data_list = []
        data_list.append(key)
        data_list.append(data_dict[key])
        data.append(data_list)
    text = '链家二手'+word+'统计'
    options = {
        'title':{'text':text},
        'plotOptions': {
            'series': {
                'allowPointSelect': True,
                'dataLabels': {
                    'enabled': True,    #显示出数据点的数值
                    'shadow': True,     #数据标签边框有阴影
                    'backgroundColor': 'rgba(252, 255, 197, 0.7)'      #设置数据点标签的背景色
                    # 'borderRadius': 10,  # 圆角,默认是0,lable是方的,这里10已经比较园了
                    # 'borderWidth': 20,    #这个是啥????
                    # 'padding': 5,          #这个也不晓得是啥
                    # 'style': {'fontWeight': 'bold'},

                }
            }
        }
    }
    H.set_dict_options(options)
    H.add_data_set(data, 'pie')
    H.save_file(word.split(':')[0]+'_pie')



if __name__ == "__main__":
    # H = Highchart(width=950, height=600)
    H = Highchart()
    get_chart('配备电梯')
    # get_chart('房屋户型')
    # get_chart('成交日期')
    # get_chart('单价:元/平')
    # get_chart('所属区域')

 

链家二手房所属区域进行下钻分析:

Lianjia_hc_column_dirlldown.py

import json
from highcharts import Highchart

'''
    对链家二手房成交房所属区域进行下钻分析
'''

def get_data_dict():
    with open('LianJia_select_data_2019_01_08_15_47_59.json', 'r', encoding='utf-8') as fp:
        LianJia_select_data_dict = json.load(fp)

    area_sub_dict = LianJia_select_data_dict['所属区域']
    # area_dict为广州各个区的二手房统计情况
    area_dict = {}
    # area_sub_class_dict为各个区下属区域二手房统计情况
    area_sub_class_dict = {}
    for area_sub in area_sub_dict:
        area = area_sub[:2]
        # area_sub_class_dict[area] = []
        area_sub_temp = []
        if area in area_dict:
            area_dict[area] += int(area_sub_dict[area_sub])
            area_sub_temp.append(area_sub)
            area_sub_temp.append(area_sub_dict[area_sub])
            # area_sub_class_dict[area][area_sub] = area_sub_dict[area_sub]
            area_sub_class_dict[area].append(area_sub_temp)
        else:
            area_dict[area] = int(area_sub_dict[area_sub])
            area_sub_class_dict[area] = []
            area_sub_temp.append(area_sub)
            area_sub_temp.append(area_sub_dict[area_sub])
            area_sub_class_dict[area].append(area_sub_temp)
    get_chart(area_dict, area_sub_class_dict)
def get_chart(area_dict,area_sub_class_dict):
    data = []
    sum = 0
    for value in area_dict.values():
        sum += int(value)
    # print(sum)
    for area in area_dict:
        dict = {}
        dict['name'] = area
        dict['y'] = ((int(area_dict[area]))/sum)*100
        dict['drilldown'] = area
        data.append(dict)
    # print(data)

    options = {
        'title':{'text':'广州链家二手房地区型统计'},
        'xAxis': {'type': 'category'},
        'plotOptions': {
            'series': {
                'dataLabels': {
                    'enabled': True,    #显示出数据点的数值
                    'shadow': True,     #数据标签边框有阴影
                    'format': '{point.y:.1f}%',
                    # 'backgroundColor': 'rgba(252, 255, 197, 0.7)'      #设置数据点标签的背景色
                    # 'borderRadius': 10,  # 圆角,默认是0,lable是方的,这里10已经比较园了
                    # 'borderWidth': 20,    #这个是啥????
                    # 'padding': 5,          #这个也不晓得是啥
                    # 'style': {'fontWeight': 'bold'},

                }
            }
        }
    }
    H.set_dict_options(options)
    H.add_data_set(data, 'column', "area", colorByPoint=True)

    for area in area_sub_class_dict:
        data_sub = area_sub_class_dict[area]
        for value in data_sub:
            value[1] = (value[1]/int(area_dict[area]))*100
        # print(data_sub)
        H.add_drilldown_data_set(data_sub, 'column', area, name=area)

    H.save_file('所属区域_column_drilldown')

if __name__ == "__main__":
    # H = Highchart(width=950, height=600)
    H = Highchart()
    get_data_dict()

 大分类的数据可视化结果:

大类中进一步分析的可视化结果:

如白云区和天河区

 

 

 在操作的过程中还存在一些待解决的问题:

1.在开始的时候,遇到一个charts库,也是python中highcharts的一个库,但用起来比较摸不着头脑,还不清楚他们的关系。

2.关于这些可视化图形的细节配置还有待进一步研究,目前只能粗略实现。

3.在代码的编写过程中,对文件的处理还不能达到理想的效果,如:在下一次运行程序生成文件的时候需要手动删除已有的文件。

 

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!