I am trying to crawl Wikipedia to get some data for text mining. I am using Python's urllib2 and BeautifulSoup. My question is: is there an easy way of getting rid of
for p in paragraphs(text=True):
    print p
Additionally, you could use `api.php` instead of `index.php`:
#!/usr/bin/env python
# Fetch the raw wikitext of the lead section (rvsection=0) of the Wikipedia
# article "data_mining" via the MediaWiki API, retrying politely when the
# servers report replication lag (the maxlag protocol).
# NOTE: Python 2 script (print statements, urllib/urllib2).
import sys
import time
import urllib, urllib2
import xml.etree.cElementTree as etree

# prepare request
maxattempts = 5 # how many times to try the request before giving up
maxlag = 5 # seconds http://www.mediawiki.org/wiki/Manual:Maxlag_parameter
params = dict(action="query", format="xml", maxlag=maxlag,
              prop="revisions", rvprop="content", rvsection=0,
              titles="data_mining")
# Identify the client via User-Agent/Referer, as Wikipedia's API etiquette asks.
request = urllib2.Request(
    "http://en.wikipedia.org/w/api.php?" + urllib.urlencode(params),
    headers={"User-Agent": "WikiDownloader/1.2",
             "Referer": "http://stackoverflow.com/q/8044814"})

# make request
for _ in range(maxattempts):
    response = urllib2.urlopen(request)
    # Per Manual:Maxlag_parameter the API signals lag via this response
    # header (the HTTP status is still a success), so check the header.
    if response.headers.get('MediaWiki-API-Error') == 'maxlag':
        # Wait as long as the server suggests; fall back to 5 seconds
        # if no Retry-After header is present.
        t = response.headers.get('Retry-After', 5)
        print "retrying in %s seconds" % (t,)
        time.sleep(float(t))
    else:
        break # ready to read
else: # exhausted all attempts
    sys.exit(1)

# download & parse xml
tree = etree.parse(response)

# find rev data
# The revision content lives in the text of the <rev> element.
rev_data = tree.findtext('.//rev')
if not rev_data:
    # No <rev> found: dump the API error header and the raw XML response
    # to stdout so the failure can be diagnosed, then exit non-zero.
    print 'MediaWiki-API-Error:', response.headers.get('MediaWiki-API-Error')
    tree.write(sys.stdout)
    print
    sys.exit(1)

print(rev_data)
{{Distinguish|analytics|information extraction|data analysis}}
'''Data mining''' (the analysis step of the '''knowledge discovery in databases..