import urllib.request
from bs4 import BeautifulSoup
import csv
import re
def openlink(link, max_tries=20):
    """Fetch *link* and return its body decoded as UTF-8, retrying on failure.

    Args:
        link: URL (or ``urllib.request.Request``) passed to ``urlopen``.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        The decoded response text, or ``None`` when every attempt failed.
    """
    for _ in range(max_tries):
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(link) as response:
                return response.read().decode('utf-8')
        except Exception:
            # Best-effort fetch: any network/HTTP/decoding error just triggers
            # another attempt. Catching Exception rather than using a bare
            # ``except`` keeps Ctrl-C (KeyboardInterrupt) able to stop the run.
            continue
    print("Has tried %d times to access url %s, all failed!" % (max_tries, link))
    return None
# Collect the per-city listing links from the province index page.
cityurl = 'http://hotels.ctrip.com/jiudian/neimenggu'
cityhtml = openlink(cityurl)
city_id = []
if cityhtml is None:
    # openlink returns None once all of its retries have failed; leave
    # city_id empty instead of handing None to BeautifulSoup.
    print("Could not load the city index %s; no cities to scrape." % cityurl)
else:
    citysoup = BeautifulSoup(cityhtml, 'html.parser')
    city_list = citysoup.find_all('ul', attrs={"class": "p_n_list grid_8"})
    # Every <a href=...> inside the matching <ul> elements is one city link.
    city_id = [
        anchor['href']
        for city_ul in city_list
        for anchor in city_ul.find_all('a', attrs={"href": True})
    ]
print(city_id)
# Placeholder written to the CSV for any field that cannot be found on a page.
_MISSING_FIELD = '无'
_SITE_ROOT = 'http://hotels.ctrip.com'
# Column order of every row written to the per-city CSV file.
_CSV_HEADER = ['所属盟市', '所属旗县', '酒店名称', '星级', '房间数', '开业年份', '地址', '联系方式']
# Compiled once: both are applied to the summary line of every hotel page.
_OPEN_YEAR_RE = re.compile(r"([0-9]+)年开业")
_ROOM_COUNT_RE = re.compile(r"([0-9]+)间房")


def _log_failed_url(log_path, url):
    """Append *url* as one line to the text file *log_path*."""
    with open(log_path, 'a+') as log_file:
        log_file.write(url + '\n')


def _parse_hotel_page(html1):
    """Extract one CSV row, in ``_CSV_HEADER`` order, from a hotel detail page.

    Args:
        html1: HTML text of the hotel detail page.

    Returns:
        A list of eight strings. Any field whose markup is absent from the
        page is filled with ``_MISSING_FIELD`` instead of raising.
    """
    # Parse the page once and reuse the tree for every field.
    soup = BeautifulSoup(html1, 'html.parser')

    # 1. Hotel name: the string of the first <h1>.
    print("soup1是:")
    headings = soup.find_all('h1')
    print("result1是:")
    print(headings)
    hotelName = headings[0].string if headings else None
    if hotelName is None:
        hotelName = _MISSING_FIELD
    print("酒店名称为:{}".format(hotelName))

    # 2. Location: text lines 1-3 of the first element with class "adress"
    #    (spelled that way in the page markup) are used as league/city,
    #    banner/county and street address.
    address_blocks = soup.find_all(attrs={"class": "adress"})
    address_lines = address_blocks[0].get_text().split('\n') if address_blocks else []
    hotelAdress_m, hotelAdress_q, hotelAdress_a = [
        address_lines[idx] if idx < len(address_lines) else _MISSING_FIELD
        for idx in (1, 2, 3)
    ]
    print("酒店所属盟市为:{}".format(hotelAdress_m))
    print("酒店所属旗县为:{}".format(hotelAdress_q))
    print("酒店地址为:{}".format(hotelAdress_a))

    # 3. Room count and opening year: both are read from the first line of
    #    the first <p> inside the "htl_room_txt text_3l" element(s).
    info_boxes = soup.find_all(attrs={"class": "htl_room_txt text_3l"})
    info_paragraphs = [p for box in info_boxes for p in box.find_all('p')]
    info_spans = [span for box in info_boxes for span in box.find_all('span')]
    hotel_inform = ''
    if info_paragraphs:
        hotel_inform = info_paragraphs[0].get_text().strip().split('\n')[0]
    print(hotel_inform.split('\xa0\xa0'))

    year_match = _OPEN_YEAR_RE.search(hotel_inform)
    hotelStartbussi = year_match.group(1) if year_match else _MISSING_FIELD
    print("酒店开业年份为:{}".format(hotelStartbussi))

    room_match = _ROOM_COUNT_RE.search(hotel_inform)
    hotelRoomnum = room_match.group(1) if room_match else _MISSING_FIELD
    print("酒店房间数为:{}".format(hotelRoomnum))

    # 4. Bed count: not collected.
    # 5. Phone: taken from the "data-real" attribute of the first <span> in
    #    the same info box. The first two characters are dropped --
    #    presumably a label prefix in the attribute value; TODO confirm.
    contact_raw = info_spans[0].get('data-real') if info_spans else None
    if contact_raw is not None:
        hotelContacter = contact_raw.split('<a')[0].strip().split(' ')[0][2:]
    else:
        hotelContacter = _MISSING_FIELD
    print("酒店联系方式为:{}".format(hotelContacter))

    # 6. Star rating: "title" attribute of the first <span> inside the
    #    element(s) with class "grade".
    grade_spans = [
        span
        for grade in soup.find_all(attrs={"class": "grade"})
        for span in grade.find_all('span')
    ]
    star_title = grade_spans[0].get('title') if grade_spans else None
    hotelStar = star_title if star_title is not None else _MISSING_FIELD
    print("酒店星级为:{}".format(hotelStar))

    return [
        str(hotelAdress_m), str(hotelAdress_q), str(hotelName), str(hotelStar),
        str(hotelRoomnum), str(hotelStartbussi), str(hotelAdress_a), str(hotelContacter),
    ]


for cityid in city_id[13:14]:
    # re.findall returns a list; use the matched name itself so the file is
    # called e.g. "Hotelxyz.csv" rather than "Hotel['xyz'].csv".
    name_matches = re.findall(r"/([a-z]+?)[0-9]", cityid)
    cityname = name_matches[0] if name_matches else 'unknown'
    # newline='' prevents the csv module from emitting a blank line between
    # rows; the with-block closes the file even if scraping raises.
    with open('Hotel' + cityname + '.csv', 'w', newline='', encoding='utf-8-sig') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(_CSV_HEADER)
        print('programbegin..................................................................')
        pagecounturl = _SITE_ROOT + cityid
        print(pagecounturl)
        pagecounthtml = openlink(pagecounturl)

        # Number of result pages, read from the "data-pagecount" attribute.
        pagecount = 0
        if pagecounthtml is None:
            _log_failed_url('Pageerror.txt', pagecounturl)
        else:
            pagecountsoup = BeautifulSoup(pagecounthtml, 'html.parser')
            # If several elements carry the attribute, the last one wins.
            for count in pagecountsoup.find_all(attrs={"data-pagecount": True}):
                pagecount = int(count['data-pagecount'])
                print(pagecount)
            if pagecount == 0:
                print("No page count found at %s; nothing to scrape." % pagecounturl)

        for k in range(1, pagecount + 1):
            print("***正在抓取第" + str(k) + "页********")
            pageurl = _SITE_ROOT + cityid + '/p' + str(k)
            print(pageurl)
            paghtml = openlink(pageurl)
            if paghtml is None:
                # Record the listing page so it can be retried later.
                _log_failed_url('Pageerror.txt', pageurl)
                continue

            # Each hotel on the listing page is an <h2 data-id="...">.
            pagesoup = BeautifulSoup(paghtml, 'html.parser')
            hotel_id = [
                hotel['data-id']
                for hotel in pagesoup.find_all("h2", attrs={"data-id": True})
            ]
            print("hotel_id是...............................................................:")
            print(hotel_id, len(hotel_id) + 1)

            for i, hotel_key in enumerate(hotel_id, start=1):
                url1 = _SITE_ROOT + '/hotel/' + hotel_key + '.html?isFull=F'
                print(url1)
                html1 = openlink(url1)
                if html1 is None:
                    # Record the hotel page so it can be retried later.
                    _log_failed_url('Hotelerror.txt', url1)
                    continue
                print(i)
                # Write each row as soon as it is parsed so a later failure
                # does not lose hotels already scraped from this page.
                writer.writerow(_parse_hotel_page(html1))
    print('programend...............................................................')
# Source: https://blog.csdn.net/sinat_36564972/article/details/99674794