# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
import csv
import re
#dizhi_url=['hohhot103','baotou114','chifeng202','tongliao458','ulanhot484','xilinhot500','manzhouli1083','wuhai1133']
def openlink(link, maxTryNum=20):
    """Fetch *link* and return its body decoded as UTF-8, retrying on failure.

    Args:
        link: URL to open with ``urllib.request.urlopen``.
        maxTryNum: Maximum number of attempts before giving up.

    Returns:
        The decoded response text, or ``None`` if every attempt failed
        (a message is printed in that case).
    """
    for tries in range(maxTryNum):
        try:
            # The context manager closes the response even if read/decode fails.
            with urllib.request.urlopen(link) as response:
                return response.read().decode('utf-8')
        except Exception:
            # Best-effort retry on any fetch/decode error. ``Exception`` (rather
            # than a bare ``except``) lets KeyboardInterrupt/SystemExit through
            # so the script can still be interrupted.
            if tries < maxTryNum - 1:
                continue
            print("Has tried %d times to access url %s, all failed!" % (maxTryNum, link))
    return None
#cityurl='http://hotels.ctrip.com/jiudian/neimenggu'
##print(cityurl)
##cityhtml = urllib.request.urlopen(cityurl).read().decode('utf-8')
#cityhtml = openlink(cityurl)
#citysoup = BeautifulSoup(cityhtml,'html.parser')
#
#city_list = citysoup.find_all('ul',attrs={"class":"p_n_list grid_8"})
#citysoup1 = BeautifulSoup(str(city_list),'html.parser')
#city_list1 = citysoup1.find_all('a',attrs={"href":True})
##print(city_list1)
##print("city_list是。。。。。。。。。。。。。:")
#city_id=[]
#for city in city_list1:
# city_id1=city['href']
# city_id.append(city_id1)
#print(city_id)
#city_id=['huhehaote','baotou','wuhai','chifeng','tongliao','eerduosi','hulunbeier','bayannaoer','wulanchabu','xingan','xilinguole','alashan',]
# City slugs to scrape; each one is used both in the review.qunar.com listing
# URL and in the name of the CSV file that is written.
city_id = ['huhehaote']

# Compiled once because they are reused for every listing / hotel page.
_page_count_re = re.compile(r".*?([0-9]+)页")
_open_year_re = re.compile(r".*?([0-9]+年开业)")
_room_count_re = re.compile(r".*?([0-9]+间客房)")
_phone_re = re.compile(r".*?(电话.*[0-9]+)\s")

for cityid in city_id:
    # newline='' is required, otherwise a blank line appears between rows.
    # The with-block guarantees the file is closed even if scraping fails.
    with open('Hotel' + cityid + '.csv', 'w', newline='', encoding='utf-8-sig') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(['所属盟市', '所属旗县', '酒店名称', '星级', '房间数', '开业年份', '地址', '联系方式', '状态'])
        print('programbegin..................................................................')

        # The first listing page of the city carries the pager with the page count.
        pagecounturl = 'http://review.qunar.com/city_' + cityid + '_1.html'
        print(pagecounturl)
        pagecounthtml = openlink(pagecounturl)
        if pagecounthtml is None:
            # Same handling as a failed listing page below: record it and move on.
            with open('Pageerror.txt', 'a+') as f:
                f.write(pagecounturl + '\n')
            continue

        pagecountsoup = BeautifulSoup(pagecounthtml, 'html.parser')
        pagers = pagecountsoup.find_all(attrs={"class": "pager"})
        page_count = pagers[0].get_text().strip().split('\n') if pagers else []
        print(page_count)
        match_obj1 = _page_count_re.match(str(page_count))
        print('pagecount是:', match_obj1)
        if match_obj1:
            pagecount = match_obj1.group(1)
        else:
            # No pager / no "<n>页" text found: page 1 was just fetched
            # successfully, so fall back to scraping that single page.
            pagecount = '1'
        print('pagecount是:', pagecount, type(pagecount))

        for k in range(1, int(pagecount) + 1):
            print("***正在抓取第" + str(k) + "页********")
            pageurl = 'http://review.qunar.com/city_' + cityid + '_' + str(k) + '.html'
            print(pageurl)
            paghtml = openlink(pageurl)
            if paghtml is None:
                with open('Pageerror.txt', 'a+') as f:
                    f.write(pageurl + '\n')
                continue

            # Collect the href of every link found inside elements of class "pr".
            pagesoup = BeautifulSoup(paghtml, 'html.parser')
            hotel_list = pagesoup.find_all(attrs={"class": 'pr'})
            hotel_list = BeautifulSoup(str(hotel_list), 'html.parser')
            hotel_list = hotel_list.find_all('a', attrs={"href": True})
            hotel_id = [hotel['href'] for hotel in hotel_list]
            print("hotel_id是...............................................................:")
            print(hotel_id, len(hotel_id) + 1)

            result = []
            for url1 in hotel_id:
                print(url1)
                html1 = openlink(url1)
                if html1 is None:
                    with open('Hotelerror.txt', 'a+') as f1:
                        f1.write(url1 + '\n')
                    continue
                soup1 = BeautifulSoup(html1, 'html.parser')

                # 1. Hotel name; pages without a name span are skipped.
                result1 = soup1.find_all('span', attrs={"class": "name"})
                if not result1:
                    continue
                hotelName = result1[0].string
                print("酒店名称为:{}".format(hotelName))

                # Star rating.
                result17 = soup1.find_all('span', attrs={"class": "type"})
                hotelStar = result17[0].string if result17 else '无'
                print("酒店星级为:{}".format(hotelStar))

                # 2. Location: the third line of the "bread" element text is
                # written to the 所属盟市 column, the js_area link to 所属旗县.
                result12 = soup1.find_all(attrs={"class": "bread"})
                bread_lines = result12[0].get_text().split('\n') if result12 else []
                result120 = bread_lines[2] if len(bread_lines) > 2 else None
                hotelAdress_m = result120 if result120 else '无'

                result121 = soup1.find_all('a', attrs={"class": "js_area"})
                hotelAdress_q = result121[0].string if result121 else '无'

                result122 = soup1.find_all('span', attrs={"class": "position js_position"})
                if result122:
                    # Keep the first line, cut at the private-use character U+F008.
                    hotelAdress_a = result122[0].get_text().split('\n')[0].split('\uf008')[0]
                else:
                    hotelAdress_a = '无'
                print("酒店所属盟市为:{}".format(hotelAdress_m))
                print("酒店所属旗县为:{}".format(hotelAdress_q))
                print("酒店地址为:{}".format(hotelAdress_a))

                # 3. Opening year, room count and phone are pulled from the
                # flattened text of the first "dt-module" element.
                result13 = soup1.find_all(attrs={"class": "dt-module"})
                if result13:
                    hotel_inform = ' '.join(result13[0].get_text().strip().split('\n'))
                else:
                    hotel_inform = ''
                year_match = _open_year_re.match(hotel_inform)
                room_match = _room_count_re.match(hotel_inform)
                phone_match = _phone_re.match(hotel_inform)

                # [:-3] strips the trailing "年开业" / "间客房" suffix.
                hotelStartbussi = year_match.group(1)[:-3] if year_match else "无"
                print("酒店开业年份为:{}".format(hotelStartbussi))
                hotelRoomnum = room_match.group(1)[:-3] if room_match else "无"
                print("酒店房间数为:{}".format(hotelRoomnum))

                # 5. Phone: [4:] drops the first four matched characters --
                # presumably the "电话" label plus two separator characters;
                # TODO confirm against the live page text.
                hotelContacter = phone_match.group(1)[4:] if phone_match else '无'
                print("酒店联系方式为:{}".format(hotelContacter))

                # Status: a missing "text" element, or one with fewer than
                # 10 characters, marks the hotel as closed.
                result18 = soup1.find_all(attrs={"class": "text"})
                hotelStatus = result18[0].get_text() if result18 else ''
                if len(hotelStatus) < 10:
                    hotel_Status = '已停业'
                    print("酒店状态为:{}".format(hotel_Status))
                else:
                    hotel_Status = ''

                # Column order must match the header row written above.
                result.append([
                    format(value)
                    for value in (
                        hotelAdress_m,
                        hotelAdress_q,
                        hotelName,
                        hotelStar,
                        hotelRoomnum,
                        hotelStartbussi,
                        hotelAdress_a,
                        hotelContacter,
                        hotel_Status,
                    )
                ])

            # Flush this listing page's hotels to the CSV.
            writer.writerows(result)

print('programend...............................................................')
# Source: https://blog.csdn.net/sinat_36564972/article/details/99674726