from mechanize import Browser
# Fetch the page and echo its HTML to stdout, one line at a time.
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(line)
When p
The HTMLParser-based solutions can all be bypassed if they make only a single pass over the input:
html_to_text('<<b>script>alert("hacked")<</b>/script>')
results in:
<script>alert("hacked")</script>
which is exactly what you intended to prevent. If you use an HTML parser, keep re-running it until a pass finds no more tags to remove:
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """Collect the text of an HTML fragment while discarding its markup.

    Feed markup with ``feed()`` (and finish with ``close()``), then read the
    remaining text with ``get_data()``.  ``has_tags()`` reports whether any
    markup was discarded, so a caller can re-parse the result until a pass
    removes nothing more.
    """

    def __init__(self):
        # Run the complete base-class initialiser instead of only reset():
        # Python 3's parser sets up state there that feed() relies on.  The
        # explicit base-class call also works with Python 2's old-style class.
        HTMLParser.__init__(self)
        # Do not let the parser decode character references into text;
        # otherwise "&lt;b&gt;" would come back as a literal "<b>".  They are
        # re-emitted in escaped form by the handlers below.
        self.convert_charrefs = False
        self.fed = []  # text fragments, in document order
        self.containstags = False  # True once any markup has been dropped

    # Every kind of markup that gets dropped must raise the flag, not just
    # start tags: removing an end tag or a comment can also splice the
    # surrounding text into a new tag (e.g. "<</b>script>").

    def handle_starttag(self, tag, attrs):
        self.containstags = True

    def handle_endtag(self, tag):
        self.containstags = True

    def handle_comment(self, data):
        self.containstags = True

    def handle_decl(self, decl):
        self.containstags = True

    def handle_pi(self, data):
        self.containstags = True

    def unknown_decl(self, data):
        self.containstags = True

    def handle_data(self, d):
        self.fed.append(d)

    def handle_entityref(self, name):
        # Keep named references such as "&amp;" in their escaped form.
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        # Keep numeric references such as "&#62;" in their escaped form.
        self.fed.append('&#%s;' % name)

    def has_tags(self):
        """Return True if any markup was discarded while parsing."""
        return self.containstags

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return *html* with its markup removed, re-parsing until none is left.

    One pass is not enough: dropping a tag that sits inside broken markup
    (for example ``<<b>script>``) can leave text behind that itself forms a
    tag.  The input is therefore parsed repeatedly, each time with a fresh
    ``MLStripper``, until a pass reports that it found no tags.

    Text that never forms a complete tag (such as a lone ``<``) is kept as
    text, so the result should still be escaped before it is embedded in an
    HTML page.
    """
    while True:
        stripper = MLStripper()
        stripper.feed(html)
        # Flush whatever the parser is still buffering; without this,
        # trailing text that looks like the start of a construct is lost.
        stripper.close()
        html = stripper.get_data()
        if not stripper.has_tags():
            return html