# 去哪儿网酒店信息采集 (Qunar hotel information scraper)

# -*- coding: utf-8 -*-

import urllib.request
from bs4 import BeautifulSoup
import csv
import re

#dizhi_url=['hohhot103','baotou114','chifeng202','tongliao458','ulanhot484','xilinhot500','manzhouli1083','wuhai1133']
def openlink(link, max_tries=20):
    """Fetch *link* and return its body decoded as UTF-8, retrying on failure.

    Args:
        link: URL handed to ``urllib.request.urlopen``.
        max_tries: Maximum number of attempts before giving up (default 20).

    Returns:
        The decoded response text, or ``None`` when every attempt failed.
        A failure summary is printed to stdout in that case.
    """
    last_error = None
    for _ in range(max_tries):
        try:
            # The context manager closes the response even if read/decode fails.
            with urllib.request.urlopen(link) as response:
                return response.read().decode('utf-8')
        except Exception as err:
            # Any fetch or decode error is retried.  ``Exception`` (not a bare
            # ``except``) keeps Ctrl-C and SystemExit able to stop the loop.
            last_error = err
    print("Has tried %d times to access url %s, all failed! (last error: %r)"
          % (max_tries, link, last_error))
    return None


#cityurl='http://hotels.ctrip.com/jiudian/neimenggu'
##print(cityurl)
##cityhtml = urllib.request.urlopen(cityurl).read().decode('utf-8')
#cityhtml = openlink(cityurl)
#citysoup = BeautifulSoup(cityhtml,'html.parser')
#
#city_list = citysoup.find_all('ul',attrs={"class":"p_n_list grid_8"})
#citysoup1 = BeautifulSoup(str(city_list),'html.parser')
#city_list1 = citysoup1.find_all('a',attrs={"href":True})
##print(city_list1)
##print("city_list是。。。。。。。。。。。。。:")
#city_id=[]
#for city in city_list1:
#    city_id1=city['href']
#    city_id.append(city_id1)
#print(city_id)     

#city_id=['huhehaote','baotou','wuhai','chifeng','tongliao','eerduosi','hulunbeier','bayannaoer','wulanchabu','xingan','xilinguole','alashan',]
# Column order of every generated CSV file.
_CSV_HEADER = ['所属盟市', '所属旗县', '酒店名称', '星级', '房间数', '开业年份', '地址', '联系方式', '状态']

# Placeholder written when a field cannot be found on the page.
_MISSING = '无'

# Patterns are compiled once instead of on every hotel page.
_PAGE_TOTAL_RE = re.compile(r"([0-9]+)页")
_OPEN_YEAR_RE = re.compile(r"([0-9]+)年开业")
_ROOM_COUNT_RE = re.compile(r"([0-9]+)间客房")
_PHONE_RE = re.compile(r"(电话.*[0-9]+)\s")


def _log_failed_url(log_path, url):
    """Append *url* as one line to the plain-text failure log *log_path*."""
    with open(log_path, 'a+', encoding='utf-8') as log_file:
        log_file.write(url + '\n')


def _read_page_total(html):
    """Return the page count shown in the first ``pager`` element of *html*.

    Falls back to 1 when there is no pager or it contains no "<digits>页"
    text, so that at least the first listing page is scraped.
    """
    pagers = BeautifulSoup(html, 'html.parser').find_all(attrs={"class": "pager"})
    if not pagers:
        return 1
    found = _PAGE_TOTAL_RE.search(pagers[0].get_text())
    return int(found.group(1)) if found else 1


def _collect_hotel_links(html):
    """Return the ``href`` of every link at or below an element with class ``pr``."""
    links = []
    for node in BeautifulSoup(html, 'html.parser').find_all(attrs={"class": 'pr'}):
        if node.name == 'a' and node.has_attr('href'):
            links.append(node['href'])
        links.extend(anchor['href'] for anchor in node.find_all('a', attrs={"href": True}))
    return links


def _first_string(soup, tag, css_class):
    """Return ``.string`` of the first *tag* with *css_class*, or the placeholder."""
    found = soup.find_all(tag, attrs={"class": css_class})
    return found[0].string if found else _MISSING


def _parse_hotel(html):
    """Extract one hotel's fields from its detail-page HTML.

    Returns:
        A dict keyed by the column names in ``_CSV_HEADER``, or ``None`` when
        the page has no ``<span class="name">`` element (such pages are
        skipped by the caller).
    """
    soup = BeautifulSoup(html, 'html.parser')

    names = soup.find_all('span', attrs={"class": "name"})
    if not names:
        return None

    # League/city: third newline-separated piece of the breadcrumb text.
    crumbs = soup.find_all(attrs={"class": "bread"})
    crumb_parts = crumbs[0].get_text().split('\n') if crumbs else []
    league = crumb_parts[2] if len(crumb_parts) > 2 and crumb_parts[2] else _MISSING

    # Street address: first line of the position span, cut at U+F008.
    positions = soup.find_all('span', attrs={"class": "position js_position"})
    if positions:
        address = positions[0].get_text().split('\n')[0].split('\uf008')[0]
    else:
        address = _MISSING

    # Year, room count and phone all come from the first "dt-module" block,
    # flattened onto a single line.
    modules = soup.find_all(attrs={"class": "dt-module"})
    info = ' '.join(modules[0].get_text().strip().split('\n')) if modules else ''
    year = _OPEN_YEAR_RE.search(info)
    rooms = _ROOM_COUNT_RE.search(info)
    phone = _PHONE_RE.search(info)

    # A description shorter than 10 characters (or none at all) is treated
    # as "closed down"; this heuristic is carried over unchanged.
    texts = soup.find_all(attrs={"class": "text"})
    description = texts[0].get_text() if texts else ''

    return {
        '所属盟市': league,
        '所属旗县': _first_string(soup, 'a', 'js_area'),
        '酒店名称': names[0].string,
        '星级': _first_string(soup, 'span', 'type'),
        '房间数': rooms.group(1) if rooms else _MISSING,
        '开业年份': year.group(1) if year else _MISSING,
        '地址': address,
        # [4:] drops the first four characters of the match (the "电话" label
        # plus two more characters), exactly as before.
        '联系方式': phone.group(1)[4:] if phone else _MISSING,
        '状态': '已停业' if len(description) < 10 else '',
    }


def _report_hotel(record):
    """Print the parsed fields of one hotel to stdout."""
    print("酒店名称为:{}".format(record['酒店名称']))
    print("酒店星级为:{}".format(record['星级']))
    print("酒店所属盟市为:{}".format(record['所属盟市']))
    print("酒店所属旗县为:{}".format(record['所属旗县']))
    print("酒店地址为:{}".format(record['地址']))
    print("酒店开业年份为:{}".format(record['开业年份']))
    print("酒店房间数为:{}".format(record['房间数']))
    print("酒店联系方式为:{}".format(record['联系方式']))
    if record['状态']:
        print("酒店状态为:{}".format(record['状态']))


city_id = ['huhehaote']
for cityid in city_id:
    # newline='' is required, otherwise a blank line appears between rows.
    # 'utf-8-sig' prepends a BOM to the file.
    with open('Hotel' + cityid + '.csv', 'w', newline='', encoding='utf-8-sig') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(_CSV_HEADER)
        print('programbegin..................................................................')

        # The first listing page of the current city carries the pager.
        pagecounturl = 'http://review.qunar.com/city_' + cityid + '_1.html'
        print(pagecounturl)
        pagecounthtml = openlink(pagecounturl)
        if pagecounthtml is None:
            # Without the first page nothing can be scraped for this city.
            _log_failed_url('Pageerror.txt', pagecounturl)
            continue
        pagecount = _read_page_total(pagecounthtml)
        print('pagecount是：', pagecount)

        for k in range(1, pagecount + 1):
            print("***正在抓取第" + str(k) + "页********")
            pageurl = 'http://review.qunar.com/city_' + cityid + '_' + str(k) + '.html'
            print(pageurl)
            paghtml = openlink(pageurl)
            if paghtml is None:
                _log_failed_url('Pageerror.txt', pageurl)
                continue

            hotel_id = _collect_hotel_links(paghtml)
            print("hotel_id是...............................................................:")
            print(hotel_id, len(hotel_id))

            result = []
            for url1 in hotel_id:
                print(url1)
                html1 = openlink(url1)
                if html1 is None:
                    _log_failed_url('Hotelerror.txt', url1)
                    continue
                record = _parse_hotel(html1)
                if record is None:
                    continue
                _report_hotel(record)
                result.append([str(record[column]) for column in _CSV_HEADER])

            # Rows are flushed to the CSV once per listing page.
            writer.writerows(result)
print('programend...............................................................')
# 来源：https://blog.csdn.net/sinat_36564972/article/details/99674726
# 标签
# hotel