Python的网络爬虫实践
实现功能:1. 通过BeautifulSoup将网页内容格式化,并通过分析网页信息提取所需内容;2. 导出MP3文件;3. 将网页信息保存成docx文件。源码实现: #coding:utf-8 import sys import os from spiderhelp import * from docx import Document from docx.shared import Pt from docx.oxml.ns import qn from docx.shared import Inches #设置缩进 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT default_encoding = 'utf-8' if sys.getdefaultencoding() != default_encoding: reload(sys) sys.setdefaultencoding(default_encoding) import urllib2 import urllib import re from bs4 import BeautifulSoup global allcount global eachcount class Count(object): def __init__(self): self.allcount = 0 self