该脚本可以从豆瓣抓取图片和音乐，也可以输入自己的正则表达式（正则的合法性我没有进行验证）。这只是一个简单的文件抓取脚本，只用了一些基本的模块；看到网上有很多使用 Beautiful Soup 模块的例子，之后再研究研究。代码很简单，就不多解释了。
#! /usr/bin/python2.6.6
# -- coding:utf-8 --
import urllib, urllib2, gevent, re
import time, os
from gevent import monkey
monkey.patch_all()
def worker(url):
    """Download the page at ``url`` and list the media entries found in it.

    Before fetching, a directory named after today's local date
    (``YYYY-MM-DD``) is created in the current working directory if it does
    not exist, the process changes into it, and ``images`` and ``radios``
    sub-directories are created there.  The working directory is left
    changed on return; the ``__main__`` section depends on that.

    The page body is scanned with the module-level pattern ``reg``.  Each
    match must provide three groups: (1) the entry name, (2) the cover image
    URL and (3) the audio URL.  Backslashes are removed from both URLs
    (presumably they are JSON-escaped slashes -- confirm against the page).

    Returns:
        A list of ``(name, path, cover_name, cover_path)`` tuples, where
        ``name`` / ``cover_name`` are the stripped entry name followed by
        ``.mp3`` / ``.jpg`` and ``path`` / ``cover_path`` are the audio and
        cover URLs.  If an ``Exception`` is raised it is printed and the
        entries collected so far (possibly none) are returned.
    """
    # Initialised outside the try so the final return can never hit an
    # unbound name.
    m_arr = []
    try:
        # Create the per-day directory and work from inside it.
        parent_dir = time.strftime('%Y-%m-%d', time.localtime())
        if not os.path.exists(parent_dir):
            os.mkdir(parent_dir)
        os.chdir(parent_dir)
        # 'images' holds cover pictures, 'radios' holds audio files.
        for sub_dir in ('images', 'radios'):
            if not os.path.exists(sub_dir):
                os.mkdir(sub_dir)

        response = urllib2.urlopen(url)
        try:
            text = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        for g in re.finditer(reg, text):
            title = g.group(1).strip()
            cover_path = g.group(2).replace('\\', '')
            path = g.group(3).replace('\\', '')
            m_arr.append((title + ".mp3", path, title + ".jpg", cover_path))
    except Exception as e:
        # Best effort: report the problem and fall through to return
        # whatever was gathered before it happened.
        print("<<==get regulare text raised exceptin %s==>>" % e)
    # Deliberately not inside a ``finally``: returning from ``finally`` would
    # also discard KeyboardInterrupt / SystemExit.
    return m_arr
def grun(path, name):
    """Download the resource at URL ``path`` into the local file ``name``.

    Any ``Exception`` raised by the download is reported on stdout and
    otherwise ignored, so one failed item does not stop the remaining jobs.
    """
    try:
        urllib.urlretrieve(path, name)
    except Exception as err:
        report = "<<==Fetch material %s in %s raised Exception %s==>>" % (name, path, err)
        print(report)
if __name__ == '__main__':
    # Interactive entry point: ask for a page URL and an extraction pattern,
    # then download every cover image and audio file the pattern finds.
    url = raw_input("input the url to fetch materials ==>>")
    re_pat = raw_input("input the regular expression to fetch materials ==>>")
    if not url:
        url = "http://site.douban.com/dannv/"
    if not re_pat:
        # Default pattern for the music URLs; its groups are
        # (1) name, (2) cover URL, (3) audio URL.
        re_pat = '{"name":"(.+?)",.+?"cover":"(.+?)",.+?"rawUrl":"(.+?)",.+?}'
    try:
        # worker() reads this module-level name when scanning the page.
        reg = re.compile(re_pat, re.I)
    except re.error as e:
        # The pattern is user input; fail with a readable message instead of
        # a traceback.
        raise SystemExit("invalid regular expression %r: %s" % (re_pat, e))
    musicArray = worker(url)
    jobs = []
    # worker() has already changed into the dated directory.  Each job gets
    # an explicit destination instead of relying on os.chdir(): the working
    # directory is process-wide, so a cover download still running after the
    # join timeout could otherwise end up in the wrong folder.
    for (name, path, cover_name, cover_path) in musicArray:
        jobs.append(gevent.spawn(grun, cover_path, os.path.join('images', cover_name)))
    gevent.joinall(jobs, timeout=600)
    for (name, path, cover_name, cover_path) in musicArray:
        jobs.append(gevent.spawn(grun, path, os.path.join('radios', name)))
    # ``jobs`` still contains the cover greenlets, so any that outlived the
    # first timeout are waited on here as well.
    gevent.joinall(jobs, timeout=600)
来源:https://www.cnblogs.com/willier/archive/2012/12/06/3082817.html