python的网络爬虫实践

五迷三道 提交于 2020-03-02 10:32:46

实现功能

  • 1.将网页内容通过BeautifulSoup格式化,并通过分析网页结构提取所需信息
  • 2.导出MP3文件
  • 3.将网页信息保存成docx文件

源码实现

#coding:utf-8
import sys
import os
from spiderhelp import *
from  docx import  Document
from  docx.shared import  Pt
from  docx.oxml.ns import  qn
from docx.shared import Inches #设置缩进
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# Python 2 only: force the interpreter's default string encoding to UTF-8 so
# implicit str/unicode mixing on Chinese text does not raise
# UnicodeDecodeError.  reload(sys) is needed because site.py removes
# sys.setdefaultencoding at startup.  (Both calls do not exist in Python 3.)
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

import urllib2
import urllib
import re
from bs4 import BeautifulSoup

# NOTE(review): 'global' at module top level is a no-op; these two statements
# have no effect.  The real counters live on the Count instance below.
global allcount
global eachcount

class Count(object):
    """Pair of crawl counters: an overall total and a per-page count."""

    def __init__(self):
        # Total number of items handled across the whole run.
        self.allcount = 0
        # Number of items handled on the current index page only.
        self.eachcount = 0

    def AddAllCount(self):
        """Bump the overall counter by one."""
        self.allcount += 1

    def AddEachCount(self):
        """Bump the per-page counter by one."""
        self.eachcount += 1

    def ClearEachCount(self):
        """Reset the per-page counter before processing a new page."""
        self.eachcount = 0


# Module-wide counter instance shared by the crawl loop under __main__.
cout = Count()

# Windows forbids these characters in file/directory names:
#   # % & * | \ : " < > ? /
# They must be removed or converted before a page title is used as a name.
def NewWordFile(filename):
    """Sanitize *filename* into a legal Windows file/directory name.

    Colons and question marks become their full-width (CJK) counterparts,
    the first pair of straight double quotes becomes CJK curly quotes, and
    every other forbidden character is removed.  Returns the cleaned name.
    """
    # BUG FIX: the original used r"\\" -- a raw string of TWO backslashes --
    # which never matches a single backslash.  Use "\\" (one backslash).
    filename = filename.replace("\\", "")
    filename = filename.replace("/", "")
    filename = filename.replace(":", "：")
    filename = filename.replace("*", "")
    filename = filename.replace("?", "？")
    # First straight quote -> opening CJK quote, second -> closing quote,
    # any further quotes are simply dropped.
    filename = filename.replace('"', "“", 1)
    filename = filename.replace('"', "”", 1)
    filename = filename.replace('"', "")
    filename = filename.replace("<", "")
    filename = filename.replace(">", "")
    filename = filename.replace("|", "")
    return filename


# Write the English title: centered, bold, 14pt Times New Roman.
def WdocxEnglishTitle(document,content):
    """Append *content* to *document* as a centered English title paragraph.

    No-op when content is None (BeautifulSoup yields None for empty nodes).
    """
    if content is None:
        return

    paragraph = document.add_paragraph()
    paragraph.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    paragraph_format = paragraph.paragraph_format
    paragraph_format.line_spacing = Pt(18)  # line spacing
    run = paragraph.add_run(content)
    # BUG FIX: original said 'New Times Roman' (a nonexistent font); the body
    # writer at WdocxEnglishBody uses the correct 'Times New Roman'.
    run.font.name = 'Times New Roman'
    run.bold = True
    run.font.size = Pt(14)

# Write one English body paragraph: first-line indent, 12pt Times New Roman.
def WdocxEnglishBody(document,content):
    """Append *content* to *document* as an English body paragraph.

    No-op when content is None (BeautifulSoup yields None for empty nodes).
    """
    if content is None:
        return

    paragraph = document.add_paragraph()
    paragraph_format = paragraph.paragraph_format
    paragraph_format.first_line_indent = Inches(0.3)  # first-line indent
    paragraph_format.line_spacing = Pt(22)  # line spacing
    paragraph_format.space_before = Pt(0)  # no extra space above paragraph
    run = paragraph.add_run(content)
    run.font.name = 'Times New Roman'
    run.font.size = Pt(12)
# Write the Chinese title: centered, bold, 14pt SongTi (宋体).
def WdocxChinaTitle(document,content):
    """Append *content* to *document* as a centered Chinese title paragraph.

    The East-Asian font must be set via the w:eastAsia rPr attribute;
    run.font.name alone only affects Latin characters.
    No-op when content is None.
    """
    if content is None:
        return

    try:
        # document.sections.page_height(742.5)
        # document.sections.page_width(595.35)

        paragraph = document.add_paragraph()
        paragraph.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        paragraph_format = paragraph.paragraph_format
        paragraph_format.line_spacing = Pt(18)  # line spacing
        run = paragraph.add_run(content)
        run.font.name = u'宋体'
        run.bold = True
        run.font.size = Pt(14)
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    except Exception as err:
        # Best-effort: a formatting failure should not abort the page.
        # (Also replaces the Python-2-only `print err` with the print()
        # form already used elsewhere in this file.)
        print(err)
# Write one Chinese body paragraph: first-line indent, 12pt SongTi (宋体).
def WdocxChinaBody(document,content):
    """Append *content* to *document* as a Chinese body paragraph.

    No-op when content is None.
    """
    if content is None:
        return
    # NOTE(review): these two replaces look like full-width -> ASCII
    # parenthesis normalization; confirm the source file's encoding, as both
    # arguments can render identically in some viewers.
    content = content.replace("(","(")
    content = content.replace(")",")")
    paragraph = document.add_paragraph()
    paragraph_format = paragraph.paragraph_format
    paragraph_format.first_line_indent = Inches(0.3)  # first-line indent
    paragraph_format.line_spacing = Pt(22)  # line spacing
    paragraph_format.space_before = Pt(0)  # no extra space above paragraph
    # paragraph_format.space_after = Pt(10)
    run = paragraph.add_run(content)
    run.font.name = u'宋体'
    run.font.size = Pt(12)
    r = run._element
    r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

def reporthook(block_read, block_size, total_size):
    """Progress callback for urllib.urlretrieve; intentionally a no-op.

    Kept so a progress printout can be re-enabled later without changing
    the urlretrieve call site.
    """
    pass


def UpLoadMp3File(name,mp3url):
    """Download the MP3 behind *mp3url* and save it as ``<name>.mp3``.

    mp3url is a path relative to www.kekenet.com; that page contains
    <a target="_blank"> anchors whose href points at the real sound host,
    which are fetched with urllib.urlretrieve.  Errors are logged and
    swallowed so one bad file does not abort the crawl.
    """
    fullUrl = "http://www.kekenet.com" + mp3url
    try:
        response = urllib2.urlopen(fullUrl)
        readInfo = response.read()
        soup = BeautifulSoup(readInfo, "lxml")

        mp3Html = soup.find_all(href=re.compile("http://test.test.com/Sound"), target="_blank")
        for obj in mp3Html:
            # Python-2-only `print x` statements normalized to the print()
            # form already used elsewhere in this file.
            print(obj["href"])
            fname = name + '.mp3'
            # NOTE(review): if a page has several matches they all write the
            # same fname, overwriting each other -- confirm pages only ever
            # carry one sound link.
            urllib.urlretrieve(obj["href"], fname, reporthook)

    except Exception as err:
        print(err)
        return

    print(mp3url)

# Parse one article page with BeautifulSoup and extract its data, making
# sure nothing is lost.
def UseSoupParseHtml(href):
    """Scrape a single article at *href*.

    Creates a directory named after the (sanitized) article title, writes
    the English text to <title>.docx and the Chinese text to
    <title>_C.docx, then downloads any linked MP3 files into it.  If the
    directory already exists the page is assumed processed and skipped.
    """
    try:
        response = urllib2.urlopen(href)
        readInfo = response.read()
    except Exception as err:
        # Network failure: log and skip this article.
        print(err)
        return
    docEnglish = Document()
    docChina = Document()

    soup = BeautifulSoup(readInfo,"lxml")
    # The article title lives in the element with id "nrtitle".
    title = soup.select('#nrtitle')[0].get_text()
    if title == None:
        print "此网址错误了。。。。"
        return
    print title

    # TODO: check whether this file already exists up front; if it does,
    # skip re-adding it (the actual check happens later via os.path.exists).

    WdocxChinaTitle(docChina,title)
    WdocxEnglishTitle(docEnglish,title)

    # English and Chinese sentences are marked with the qh_en / qh_zg CSS
    # classes; nodes with several children are written child by child.
    TagEnglishs = soup.select('.qh_en')
    for eTagE in TagEnglishs:
        if(eTagE.contents.__len__()==1):
            WdocxEnglishBody(docEnglish,eTagE.string)
            # print eTagE.string
        else:
            for eContent in eTagE.contents:
                WdocxEnglishBody(docEnglish, eContent.string)
                # print eContent.string


    TagChinese = soup.select('.qh_zg')
    for eTagC in TagChinese:
        if (eTagC.contents.__len__() == 1):
            WdocxChinaBody(docChina,eTagC.string)
            # print eTagC.string
        else:
            for eContent in eTagC.contents:
                WdocxChinaBody(docChina, eContent.string)
                # print eContent.string
    # Convert/strip characters Windows forbids in file names.
    title = NewWordFile(title)
    # Create a per-article directory named after the title.
    curPath = os.getcwd()
    if os.path.exists(title):
        # Already scraped on an earlier run -- skip.
        return

    os.mkdir(title)
    os.chdir(title)

    fileName =title + ".docx"
    docEnglish.save(fileName)
    fileName = title + "_C.docx"
    docChina.save(fileName)

    # Any <a target="_blank"> whose href contains "/mp3" is an audio link.
    mp3Html = soup.find_all(href=re.compile("/mp3"), target="_blank")
    for obj in mp3Html:
        UpLoadMp3File(title,obj["href"])

    # Restore the working directory for the next article.
    os.chdir(curPath)

def ParseHtml(htmlAddres):
    """Fetch an index page and scrape every article it links to.

    Article links are anchors whose href contains "menu" or "Article";
    only anchors carrying a title attribute are real article links.
    """
    response = urllib2.urlopen(htmlAddres)
    readInfo = response.read()
    soup = BeautifulSoup(readInfo,"html.parser")
    # The original duplicated the same loop for each pattern; fold both
    # passes into one, preserving the menu-then-Article order.
    for pattern in ("menu", "Article"):
        for obj in soup.find_all(href=re.compile(pattern)):
            if obj.get("title", default=None):
                UseSoupParseHtml(obj["href"])

if __name__ == "__main__":
    os.chdir("mp3")
    for i in xrange(50):
        page = 172-i
        if page== 172:
            html = "http://www.test.com/Article/media/test/"
        else:
            html = "http://www.test.com/Article/media/test/List_%d.shtml" % (page)
        print "page =%d"%(page)
        cout.ClearEachCount()
        ParseHtml(html)
        print "eachcount=%d"%(cout.eachcount)
    # os.chdir(curPath)
    # print "allcount=%d"%(cout.allcount)
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!