Scraped attributes:
1. 第一车网
id | title | date | kilometers | location | current-price | original_price | vehicle_grade | driving_mode | Wheelbase | structure | describe_message |
2. 人人车 (Renrenche)
title | price | newcarprice | three_type_message | kilometers-price | date | location | transmission | transfer | car_describe | kilometers |
3. 二手车之家
id | title | date | kilometers | location | current-price | original_price | vehicle_grade | driving_mode | Wheelbase | structure | describe_message |
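These column lists are the target schema for each site. Since pandas is in the import list below, one natural way to collect the scraped records is a DataFrame; a minimal sketch with made-up values (the record contents here are hypothetical, not real listings):

import pandas as pd

# Hypothetical record using the 第一车网 columns listed above
records = [
    {'id': 1, 'title': '示例车源', 'date': '2016-06', 'kilometers': '3.2万公里',
     'location': '北京', 'current-price': '8.50万', 'original_price': '12.90万',
     'vehicle_grade': None, 'driving_mode': None, 'Wheelbase': None,
     'structure': None, 'describe_message': None},
]
df = pd.DataFrame(records)
df.to_csv('cars.csv', index=False, encoding='utf-8-sig')  # utf-8-sig so Excel displays Chinese correctly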
Libraries used:
import re
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from retrying import retry
Data-collection workflow for 人人车 (Renrenche):
1. Collect the link to each car from the Renrenche listing pages.
2. Follow each car's link to its detail page and extract all of the information (full code below).
Renrenche code:
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from retrying import retry
@retry(stop_max_attempt_number=10, wait_random_max=1)  # retrying waits are given in milliseconds
def getHTMLText(url):
    headers = {'Accept': '*/*',
               'Accept-Encoding': 'gzip, deflate, br',
               'Connection': 'keep-alive',
               'Referer': 'https://www.renrenche.com/',  # point the referer at the target site
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
               }
    try:
        requests.adapters.DEFAULT_RETRIES = 5
        s = requests.session()
        s.keep_alive = False
        time.sleep(np.random.random())  # random pause of up to 1 s between requests
        res = s.get(url, timeout=3, headers=headers)
        res.raise_for_status()  # raise HTTPError if the status code is not 200
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as e:
        print(e)
        return 0
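As a side note, wait_random_max in the retrying decorator is measured in milliseconds, so a value of 1 barely pauses between attempts. An alternative (a sketch, not part of the original post) is to let requests retry by itself through urllib3's Retry policy:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(retries=5):
    # Session that retries automatically on connection errors and 5xx responses
    session = requests.Session()
    retry_policy = Retry(total=retries, backoff_factor=0.5,
                         status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry_policy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session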
'''Return an element's text, or None if the element was not found'''
def isnull(s):
    if s is not None:
        return s.text
    else:
        return None
'''Detailed information from the second-level (detail) page'''
def get_car_details(url):
    details = {}  # renamed from dict to avoid shadowing the builtin
    res = getHTMLText(url)
    if res != 0:
        soup = BeautifulSoup(res, 'html.parser')
        title = soup.select(
            '#basic > div.version3-detail-header.container > div.version3-detail-header-right > div.right-container > div.title > h1')
        price = soup.findAll('p', {'class': 'price detail-title-right-tagP'})
        newcarprice = soup.findAll('div', {'class': 'new-car-price detail-title-right-tagP'})
        # registration date, mileage, location, transmission, transfer records
        basic_message = soup.find('ul', {'class': 'row-fluid list-unstyled box-list-primary-detail'})
        # "no major accident / fire damage / flood damage" information
        three_type_message = soup.find('div', {'class': 'report-danger'})
        # overall evaluation given after the platform's inspection
        car_describe = soup.find('div', {'class': 'report-result-des'})
        details['title'] = isnull(title[0]) if title else None  # guard against empty result lists
        details['price'] = isnull(price[0]) if price else None
        details['newcarprice'] = isnull(newcarprice[0]) if newcarprice else None
        details['three_type_message'] = isnull(three_type_message)
        details['basic_message'] = isnull(basic_message)
        details['car_describe'] = isnull(car_describe)
    return details
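get_car_details stores the whole <ul> text in basic_message, while the column list at the top names date, transfer and kilometers as separate fields. A sketch of splitting that blob into columns; the regular expressions are assumptions about Renrenche's label wording, not verified against the live page:

import re

def parse_basic_message(text):
    # Pull individual fields out of the raw basic_message text.
    # The patterns below assume wording like "2016年6月", "3.2万公里", "1次过户".
    fields = {'date': None, 'kilometers': None, 'transfer': None}
    if text is None:
        return fields
    m = re.search(r'(\d{4}年\d{1,2}月)', text)   # registration date
    if m:
        fields['date'] = m.group(1)
    m = re.search(r'([\d.]+)万公里', text)        # mileage, in units of 10,000 km
    if m:
        fields['kilometers'] = m.group(1)
    m = re.search(r'(\d+)次过户', text)           # number of ownership transfers
    if m:
        fields['transfer'] = m.group(1)
    return fields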
'''Collect the link to each car on a listing page'''
def get_car_box_url_list(url):
    links_list = []
    res = getHTMLText(url)
    if res != 0:
        soup = BeautifulSoup(res, 'html.parser')
        links = soup.findAll(href=re.compile('car'))
        for link in links:
            links_list.append('https://www.renrenche.com' + link.attrs['href'])
    return links_list
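findAll(href=re.compile('car')) matches every href containing the substring 'car', so the same detail page can be collected more than once and unrelated links may slip in. A deduplicating variant, reusing the getHTMLText helper defined above:

def get_car_box_url_list_unique(url):
    # Same idea as get_car_box_url_list, but skips duplicate hrefs
    links_list = []
    seen = set()
    res = getHTMLText(url)
    if res != 0:
        soup = BeautifulSoup(res, 'html.parser')
        for link in soup.findAll('a', href=re.compile('car')):
            href = link.attrs['href']
            if href not in seen:
                seen.add(href)
                links_list.append('https://www.renrenche.com' + href)
    return links_list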
url_header = 'https://www.renrenche.com/cn/ershouche/ft-dd/p'
url_tail = '/?ft=dd&plog_id=9e528d4d463a966f6a50ebb470e41527'
for page in range(1, 32):
    url = url_header + str(page) + url_tail
    links_list = get_car_box_url_list(url)
    list1 = []
    for link in links_list:
        list1.append(get_car_details(link))
    with open('renrencar.txt', 'a+') as file:  # the with-statement closes the file automatically
        file.write(str(list1))
    print('page {} finished'.format(page))
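Appending str(list1) writes the Python repr of each page's list, which is awkward to parse back later. A sketch of writing one JSON object per line instead, so the whole scrape can be reloaded with pandas (the filename renrencar.jsonl is an example, not from the original post):

import json

def save_records(records, path='renrencar.jsonl'):
    # One JSON object per line; ensure_ascii=False keeps Chinese text readable
    with open(path, 'a', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')

# Reload later:
# import pandas as pd
# df = pd.read_json('renrencar.jsonl', lines=True)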
Source: CSDN
Author: forestForQuietLive
Link: https://blog.csdn.net/forestForQuietLive/article/details/88237926