实现功能
- 1.将网页内容通过BeautifulSoup解析,并通过分析网页结构提取标题和中英文正文
- 2.导出MP3文件
- 3.将网页信息保存成docx文件
源码实现
#coding:utf-8
import sys
import os
from spiderhelp import *
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn
from docx.shared import Inches #设置缩进
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
# Python 2 only: make UTF-8 the interpreter-wide default codec so implicit
# str <-> unicode conversions of the scraped Chinese text do not fail.
default_encoding = 'utf-8'
if default_encoding != sys.getdefaultencoding():
    # sys has to be reloaded before setdefaultencoding() can be called.
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import urllib2
import urllib
import re
from bs4 import BeautifulSoup
# NOTE: at module scope these two declarations have no effect; they are kept
# only because the original file has them.
global allcount
global eachcount


class Count(object):
    """Two integer tallies: a running total and a resettable counter.

    ``allcount`` only ever grows; ``eachcount`` can be zeroed with
    ClearEachCount(), which the main loop does before every list page.
    """

    def __init__(self):
        self.allcount = self.eachcount = 0

    def AddAllCount(self):
        """Increase the running total by one."""
        self.allcount += 1

    def AddEachCount(self):
        """Increase the resettable counter by one."""
        self.eachcount += 1

    def ClearEachCount(self):
        """Reset the resettable counter to zero."""
        self.eachcount = 0


# Single shared instance used by the script's main loop.
cout = Count()
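# Characters the author lists as unusable in file names:
# number sign (#); percent sign (%); ampersand (&); asterisk (*); vertical bar (|);
# backslash (\); colon (:); double quote ("); less-than (<); greater-than (>);
# question mark (?); slash (/).
# They have to be converted or stripped before a title is used as a name.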
def NewWordFile(filename):
    """Return *filename* made safe for use as a file or directory name.

    The characters ``\\ / : * ? " < > |`` are reserved in Windows file names.
    They are handled as follows:

    * ``:`` and ``?`` are replaced by their full-width look-alikes
      (U+FF1A, U+FF1F) so the title still reads the same;
    * the first ``"`` becomes a left curly quote (U+201C), the second a
      right curly quote (U+201D), and any further ``"`` is dropped;
    * ``\\ / * < > |`` are dropped.

    Two defects of the earlier version are fixed here: it only removed
    *pairs* of backslashes (the raw literal ``r"\\\\"`` is two characters),
    and its colon / question-mark replacements mapped each character to
    itself, leaving reserved characters in the result.
    """
    # Pair up the first two straight quotes; any others fall through to the
    # table below and are removed.
    filename = filename.replace(u'"', u"\u201c", 1)
    filename = filename.replace(u'"', u"\u201d", 1)

    replacements = (
        (u"\\", u""),
        (u"/", u""),
        (u":", u"\uff1a"),  # full-width colon
        (u"*", u""),
        (u"?", u"\uff1f"),  # full-width question mark
        (u'"', u""),
        (u"<", u""),
        (u">", u""),
        (u"|", u""),
    )
    for reserved, substitute in replacements:
        filename = filename.replace(reserved, substitute)
    return filename
# 写入英文标题
def WdocxEnglishTitle(document, content):
    """Append *content* to *document* as a centred, bold, 14 pt title.

    document -- object providing ``add_paragraph()`` (a python-docx Document
                in this script).
    content  -- title text; when it is None nothing is added.
    """
    if content is None:
        return
    paragraph = document.add_paragraph()
    paragraph_format = paragraph.paragraph_format
    paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    paragraph_format.line_spacing = Pt(18)  # line spacing

    run = paragraph.add_run(content)
    # The original spelled this 'New Times Roman', which does not match the
    # 'Times New Roman' face used for the English body text.
    run.font.name = 'Times New Roman'
    run.bold = True
    run.font.size = Pt(14)
# 写入英文文档部分
def WdocxEnglishBody(document, content):
    """Append *content* to *document* as one English body paragraph.

    The paragraph gets a 0.3 inch first-line indent, 22 pt line spacing, no
    space before it, and 12 pt Times New Roman text.

    document -- object providing ``add_paragraph()`` (a python-docx Document
                in this script).
    content  -- paragraph text; when it is None nothing is added.
    """
    if content is None:
        return
    paragraph = document.add_paragraph()
    paragraph_format = paragraph.paragraph_format
    paragraph_format.first_line_indent = Inches(0.3)  # first-line indent
    paragraph_format.line_spacing = Pt(22)            # line spacing
    paragraph_format.space_before = Pt(0)             # no gap above the paragraph

    run = paragraph.add_run(content)
    run.font.name = 'Times New Roman'
    run.font.size = Pt(12)
#写入中文标题
def WdocxChinaTitle(document, content):
    """Append *content* to *document* as a centred, bold, 14 pt SimSun title.

    document -- object providing ``add_paragraph()`` (a python-docx Document
                in this script).
    content  -- title text; when it is None nothing is added.

    Any exception raised while formatting is printed and swallowed, so a
    failure here does not stop the caller.
    """
    if content is None:
        return
    try:
        paragraph = document.add_paragraph()
        paragraph_format = paragraph.paragraph_format
        paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        paragraph_format.line_spacing = Pt(18)  # line spacing

        run = paragraph.add_run(content)
        run.font.name = u'宋体'
        run.bold = True
        run.font.size = Pt(14)
        # Also set the East Asian font attribute on the run's XML element;
        # presumably font.name alone does not apply to CJK text.
        r = run._element
        r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    except Exception as err:
        print(err)
#写入中文文档部分
def WdocxChinaBody(document, content):
    """Append *content* to *document* as one Chinese body paragraph.

    The paragraph gets a 0.3 inch first-line indent, 22 pt line spacing, no
    space before it, and 12 pt SimSun text.

    document -- object providing ``add_paragraph()`` (a python-docx Document
                in this script).
    content  -- paragraph text; when it is None nothing is added.
    """
    if content is None:
        return
    # NOTE(review): as written, both calls replace a parenthesis with the
    # same character and therefore change nothing. Presumably one side of
    # each pair was a full-width parenthesis that was lost when the code was
    # copied -- confirm the intended direction before changing them.
    content = content.replace("(","(")
    content = content.replace(")",")")

    paragraph = document.add_paragraph()
    paragraph_format = paragraph.paragraph_format
    paragraph_format.first_line_indent = Inches(0.3)  # first-line indent
    paragraph_format.line_spacing = Pt(22)            # line spacing
    paragraph_format.space_before = Pt(0)             # no gap above the paragraph

    run = paragraph.add_run(content)
    run.font.name = u'宋体'
    run.font.size = Pt(12)
    # Also set the East Asian font attribute on the run's XML element;
    # presumably font.name alone does not apply to CJK text.
    r = run._element
    r.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
def reporthook(block_read, block_size, total_size):
    """Progress callback passed to urllib.urlretrieve; deliberately silent.

    block_read -- number of blocks received so far.
    block_size -- size of one block.
    total_size -- total size of the download.

    The arguments are accepted and ignored. An earlier, disabled version
    printed the number of blocks and bytes read on every call.
    """
    return None
def UpLoadMp3File(name, mp3url):
    """Download the MP3 linked from an audio page into ``<name>.mp3``.

    Despite its name, this function downloads rather than uploads.

    name   -- base name of the output file; ``.mp3`` is appended and the
              file is written relative to the current working directory.
    mp3url -- site-relative URL of the audio page; it is appended to
              ``http://www.kekenet.com``.

    The page is fetched and every ``target="_blank"`` link whose href
    matches the sound-server pattern is downloaded. All matches are written
    to the same file, so the last match wins when there are several.

    Any exception (network or parsing) is printed and ends the function;
    on success *mp3url* is printed last.
    """
    from contextlib import closing

    fullUrl = "http://www.kekenet.com" + mp3url
    fname = name + '.mp3'
    try:
        # closing() guarantees the HTTP response is released even when
        # read() fails; the original never closed it.
        with closing(urllib2.urlopen(fullUrl)) as response:
            readInfo = response.read()
        soup = BeautifulSoup(readInfo, "lxml")
        mp3Html = soup.find_all(href=re.compile("http://test.test.com/Sound"), target="_blank")
        for obj in mp3Html:
            print(obj["href"])
            urllib.urlretrieve(obj["href"], fname, reporthook)
    except Exception as err:
        print(err)
        return
    print(mp3url)
# 使用soup解析网页,并提取数据,保证万无一失
def _write_tag_paragraphs(document, tags, write_body):
    """Pass the text of every tag in *tags* to *write_body* as paragraphs."""
    for tag in tags:
        if len(tag.contents) == 1:
            write_body(document, tag.string)
        else:
            # Several children: each child becomes its own paragraph.
            for child in tag.contents:
                write_body(document, child.string)


def UseSoupParseHtml(href):
    """Scrape one article page into a folder of .docx files plus its MP3.

    href -- URL of the article page.

    Steps:
      1. Fetch the page; a fetch error is printed and the call returns.
      2. Read the title from the ``#nrtitle`` element. A page without that
         element is reported and skipped (the original indexed ``[0]``
         unconditionally and raised IndexError there).
      3. Sanitise the title with NewWordFile() to get the folder name. If
         that folder already exists the article is assumed to be done and
         nothing else happens.
      4. Build the English document from ``.qh_en`` elements and the
         Chinese document from ``.qh_zg`` elements.
      5. Create the folder, save ``<title>.docx`` and ``<title>_C.docx`` in
         it, and call UpLoadMp3File() for every ``target="_blank"`` link
         whose href contains ``/mp3``.

    The working directory is changed into the folder while saving and is
    always restored afterwards, also when saving or downloading raises.
    Each article saved this way is counted on the module-level ``cout``.
    """
    from contextlib import closing

    try:
        # closing() releases the HTTP response even if read() fails.
        with closing(urllib2.urlopen(href)) as response:
            readInfo = response.read()
    except Exception as err:
        print(err)
        return

    soup = BeautifulSoup(readInfo, "lxml")

    # Look the title up by id.
    title_tags = soup.select('#nrtitle')
    if not title_tags:
        print("此网址错误了。。。。")
        return
    title = title_tags[0].get_text()
    print(title)

    # Decide whether this article was already saved before doing any of the
    # document building below.
    folder = NewWordFile(title)
    if os.path.exists(folder):
        return

    docEnglish = Document()
    docChina = Document()
    WdocxChinaTitle(docChina, title)
    WdocxEnglishTitle(docEnglish, title)

    # English and Chinese text are found by CSS class.
    _write_tag_paragraphs(docEnglish, soup.select('.qh_en'), WdocxEnglishBody)
    _write_tag_paragraphs(docChina, soup.select('.qh_zg'), WdocxChinaBody)

    curPath = os.getcwd()
    os.mkdir(folder)
    os.chdir(folder)
    try:
        docEnglish.save(folder + ".docx")
        docChina.save(folder + "_C.docx")
        for obj in soup.find_all(href=re.compile("/mp3"), target="_blank"):
            UpLoadMp3File(folder, obj["href"])
    finally:
        # Without this, one failed save would leave every later article
        # nested inside this article's folder.
        os.chdir(curPath)

    # The main loop prints cout.eachcount per list page; nothing in the
    # original ever incremented it, so it always printed 0.
    cout.AddEachCount()
    cout.AddAllCount()
def ParseHtml(htmlAddres):
    """Scrape every article linked from one list page.

    htmlAddres -- URL of the list page.

    The page is fetched and parsed. Every link that has a non-empty
    ``title`` attribute and whose href matches ``menu`` is passed to
    UseSoupParseHtml(); then the same is done for hrefs matching
    ``Article``.

    A fetch error is printed and the page is skipped, which matches how the
    other download functions in this file treat network errors. The
    original let the exception propagate and abort the whole run.
    """
    from contextlib import closing

    try:
        # closing() releases the HTTP response even if read() fails.
        with closing(urllib2.urlopen(htmlAddres)) as response:
            readInfo = response.read()
    except Exception as err:
        print(err)
        return

    soup = BeautifulSoup(readInfo, "html.parser")
    # Order matters: all "menu" links are handled before any "Article" link.
    for pattern in ("menu", "Article"):
        for obj in soup.find_all(href=re.compile(pattern)):
            if obj.get("title", default=None):
                UseSoupParseHtml(obj["href"])
if __name__ == "__main__":
    # Everything is saved below ./mp3. Create it on first run instead of
    # crashing in os.chdir() when it is missing.
    if not os.path.isdir("mp3"):
        os.makedirs("mp3")
    os.chdir("mp3")

    # Visit 50 list pages, counting down from page 172. Page 172 is fetched
    # from the bare section URL; all others use the List_<n>.shtml form.
    for i in range(50):
        page = 172 - i
        if page == 172:
            html = "http://www.test.com/Article/media/test/"
        else:
            html = "http://www.test.com/Article/media/test/List_%d.shtml" % (page)
        print("page =%d" % (page))
        cout.ClearEachCount()
        ParseHtml(html)
        print("eachcount=%d" % (cout.eachcount))
来源:oschina
链接:https://my.oschina.net/gaoxepro/blog/3184908