from mechanize import Browser
# Fetch the page and echo its raw HTML to stdout, one line at a time.
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print(line)
When p
I needed a way to strip tags and decode HTML entities to plain text. The following solution is based on Eloff's answer (which I couldn't use because it strips entities).
from HTMLParser import HTMLParser
import htmlentitydefs
class HTMLTextExtractor(HTMLParser):
    """Collect the plain text of an HTML document (Python 2 version).

    Only text, numeric character references and named entity references
    are handled; no tag handlers are defined, so markup contributes
    nothing to the output. The decoded pieces are accumulated in
    ``self.result`` in the order the parser reports them.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []  # decoded text fragments, in document order

    def handle_data(self, d):
        """Record a run of literal text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric character reference and record the character.

        ``number`` is parsed as hexadecimal when it starts with ``x`` or
        ``X`` (the prefix is skipped), and as decimal otherwise.
        """
        if number[0] in (u'x', u'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        """Decode a named entity reference and record the character.

        Raises:
            KeyError: if ``name`` is not in ``htmlentitydefs.name2codepoint``.
        """
        codepoint = htmlentitydefs.name2codepoint[name]
        self.result.append(unichr(codepoint))

    def get_text(self):
        """Return everything collected so far as one unicode string."""
        return u''.join(self.result)
def html_to_text(html):
    """Return the text collected by ``HTMLTextExtractor`` after feeding it *html*.

    NOTE(review): the parser is fed once and never closed, so input the
    parser is still holding back (e.g. a trailing incomplete entity) may
    be missing from the result -- confirm against the HTMLParser docs.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()
A quick test:
# The input mixes tags, a named entity, and hex/decimal character references
# so that the demo actually exercises every handler of the extractor.
html = u'<html><body>Demo <em>(</em>&not; \u0394&#x03b7;&#956;&#x03CE;)</body></html>'
print(repr(html_to_text(html)))
Result:
u'Demo (\xac \u0394\u03b7\u03bc\u03ce)'
Error handling:
- Invalid named HTML entities (such as &apos;, which is valid in XML and XHTML, but not plain HTML) will cause a KeyError exception (raised by the name2codepoint lookup).
- Numeric character references whose code point is outside the range accepted by unichr will cause a ValueError exception.
Security note: Do not confuse HTML stripping (converting HTML into plain text) with HTML sanitizing (converting plain text into HTML). This answer will remove HTML and decode entities into plain text – that does not make the result safe to use in an HTML context.
Example: &lt;script&gt;alert("Hello");&lt;/script&gt; will be converted to <script>alert("Hello");</script>, which is 100% correct behavior, but obviously not sufficient if the resulting plain text is inserted as-is into an HTML page.
The rule is not hard: Any time you insert a plain-text string into HTML output, you should always HTML escape it (using cgi.escape(s, True)), even if you "know" that it doesn't contain HTML (e.g. because you stripped HTML content).
(However, the OP asked about printing the result to the console, in which case no HTML escaping is needed.)
Python 3.4+ version: (with doctest!)
import html.parser
class HTMLTextExtractor(html.parser.HTMLParser):
    """Collect the plain text of an HTML document (Python 3.4+ version).

    Only ``handle_data`` is overridden, so markup contributes nothing to
    the output; character and entity references are decoded by the base
    parser before they reach ``handle_data``.

    NOTE(review): per the ``html.parser`` documentation, ``close()``
    forces processing of any buffered data -- callers should call it
    after the final ``feed()`` if the input may end mid-construct.
    """

    def __init__(self):
        # Spelled out because Python 3.4 defaulted convert_charrefs to
        # False; with no charref/entityref handlers defined here, entities
        # would then be dropped instead of decoded.
        super().__init__(convert_charrefs=True)
        self.result = []  # text fragments, in document order

    def handle_data(self, d):
        """Record a run of (already entity-decoded) text."""
        self.result.append(d)

    def get_text(self):
        """Return everything collected so far as one string."""
        return ''.join(self.result)
def html_to_text(html):
"""Converts HTML to plain text (stripping tags and converting entities).
>>> html_to_text('Demo (¬ \u0394ημώ)')
'Demo (\xac \u0394\u03b7\u03bc\u03ce)'
"Plain text" doesn't mean result can safely be used as-is in HTML.
>>> html_to_text('<script>alert("Hello");</script>')
''
Always use html.escape to sanitize text before using in an HTML context!
HTMLParser will do its best to make sense of invalid HTML.
>>> html_to_text('x < y < z
- 热议问题