from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
print line
When p
I needed a way to strip tags and decode HTML entities to plain text. The following solution is based on Eloff's answer (which I couldn't use because it strips entities).
from HTMLParser import HTMLParser
import htmlentitydefs
class HTMLTextExtractor(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.result = [ ]
def handle_data(self, d):
self.result.append(d)
def handle_charref(self, number):
codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number)
self.result.append(unichr(codepoint))
def handle_entityref(self, name):
codepoint = htmlentitydefs.name2codepoint[name]
self.result.append(unichr(codepoint))
def get_text(self):
return u''.join(self.result)
def html_to_text(html):
s = HTMLTextExtractor()
s.feed(html)
return s.get_text()
A quick test:
html = u'<a href="#">Demo <em>(¬ \u0394ημώ)</em></a>'
print repr(html_to_text(html))
Result:
u'Demo (\xac \u0394\u03b7\u03bc\u03ce)'
Error handling:
&#apos;
, which is valid in XML and XHTML, but not plain HTML) will cause a ValueError
exception.ValueError
exception.Security note: Do not confuse HTML stripping (converting HTML into plain text) with HTML sanitizing (converting plain text into HTML). This answer will remove HTML and decode entities into plain text – that does not make the result safe to use in a HTML context.
Example: <script>alert("Hello");</script>
will be converted to <script>alert("Hello");</script>
, which is 100% correct behavior, but obviously not sufficient if the resulting plain text is inserted as-is into a HTML page.
The rule is not hard: Any time you insert a plain-text string into HTML output, you should always HTML escape it (using cgi.escape(s, True)
), even if you "know" that it doesn't contain HTML (e.g. because you stripped HTML content).
(However, the OP asked about printing the result to the console, in which case no HTML escaping is needed.)
Python 3.4+ version: (with doctest!)
import html.parser
class HTMLTextExtractor(html.parser.HTMLParser):
def __init__(self):
super(HTMLTextExtractor, self).__init__()
self.result = [ ]
def handle_data(self, d):
self.result.append(d)
def get_text(self):
return ''.join(self.result)
def html_to_text(html):
"""Converts HTML to plain text (stripping tags and converting entities).
>>> html_to_text('<a href="#">Demo<!--...--> <em>(¬ \u0394ημώ)</em></a>')
'Demo (\xac \u0394\u03b7\u03bc\u03ce)'
"Plain text" doesn't mean result can safely be used as-is in HTML.
>>> html_to_text('<script>alert("Hello");</script>')
'<script>alert("Hello");</script>'
Always use html.escape to sanitize text before using in an HTML context!
HTMLParser will do its best to make sense of invalid HTML.
>>> html_to_text('x < y < z <!--b')
'x < y < z '
Unrecognized named entities are included as-is. ''' is recognized,
despite being XML only.
>>> html_to_text('&nosuchentity; ' ')
"&nosuchentity; ' "
"""
s = HTMLTextExtractor()
s.feed(html)
return s.get_text()
Note that HTMLParser has improved in Python 3 (meaning less code and better error handling).
I'm parsing Github readmes and I find that the following really works well:
import re
import lxml.html
def strip_markdown(x):
links_sub = re.sub(r'\[(.+)\]\([^\)]+\)', r'\1', x)
bold_sub = re.sub(r'\*\*([^*]+)\*\*', r'\1', links_sub)
emph_sub = re.sub(r'\*([^*]+)\*', r'\1', bold_sub)
return emph_sub
def strip_html(x):
return lxml.html.fromstring(x).text_content() if x else ''
And then
readme = """<img src="https://raw.githubusercontent.com/kootenpv/sky/master/resources/skylogo.png" />
sky is a web scraping framework, implemented with the latest python versions in mind (3.4+).
It uses the asynchronous `asyncio` framework, as well as many popular modules
and extensions.
Most importantly, it aims for **next generation** web crawling where machine intelligence
is used to speed up the development/maintainance/reliability of crawling.
It mainly does this by considering the user to be interested in content
from *domains*, not just a collection of *single pages*
([templating approach](#templating-approach))."""
strip_markdown(strip_html(readme))
Removes all markdown and html correctly.
For one project, I needed so strip HTML, but also css and js. Thus, I made a variation of Eloffs answer:
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.strict = False
self.convert_charrefs= True
self.fed = []
self.css = False
def handle_starttag(self, tag, attrs):
if tag == "style" or tag=="script":
self.css = True
def handle_endtag(self, tag):
if tag=="style" or tag=="script":
self.css=False
def handle_data(self, d):
if not self.css:
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
2020 Update
Use the Mozilla Bleach library, it really lets you customize which tags to keep and which attributes to keep and also filter out attributes based on values
Here are 2 cases to illustrate
1) Do not allow any HTML tags or attributes
Take sample raw text
raw_text = """
<p><img width="696" height="392" src="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg" class="attachment-medium_large size-medium_large wp-post-image" alt="Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC" style="float:left; margin:0 15px 15px 0;" srcset="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg 768w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-300x169.jpg 300w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1024x576.jpg 1024w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-696x392.jpg 696w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1068x601.jpg 1068w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-747x420.jpg 747w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-190x107.jpg 190w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-380x214.jpg 380w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-760x428.jpg 760w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc.jpg 1280w" sizes="(max-width: 696px) 100vw, 696px" />Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform’s users. Also as part […]</p>
<p>The post <a rel="nofollow" href="https://news.bitcoin.com/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc/">Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC</a> appeared first on <a rel="nofollow" href="https://news.bitcoin.com">Bitcoin News</a>.</p>
"""
2) Remove all HTML tags and attributes from raw text
# DO NOT ALLOW any tags or any attributes
from bleach.sanitizer import Cleaner
cleaner = Cleaner(tags=[], attributes={}, styles=[], protocols=[], strip=True, strip_comments=True, filters=None)
print(cleaner.clean(raw_text))
Output
Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform’s users. Also as part […]
The post Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC appeared first on Bitcoin News.
3 Allow Only img tag with srcset attribute
from bleach.sanitizer import Cleaner
# ALLOW ONLY img tags with src attribute
cleaner = Cleaner(tags=['img'], attributes={'img': ['srcset']}, styles=[], protocols=[], strip=True, strip_comments=True, filters=None)
print(cleaner.clean(raw_text))
Output
<img srcset="https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-768x432.jpg 768w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-300x169.jpg 300w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1024x576.jpg 1024w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-696x392.jpg 696w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-1068x601.jpg 1068w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-747x420.jpg 747w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-190x107.jpg 190w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-380x214.jpg 380w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc-760x428.jpg 760w, https://news.bitcoin.com/wp-content/uploads/2020/08/ethereum-classic-51-attack-okex-crypto-exchange-suffers-5-6-million-loss-contemplates-delisting-etc.jpg 1280w">Cryptocurrency exchange Okex reveals it suffered the $5.6 million loss as a result of the double-spend carried out by the attacker(s) in Ethereum Classic 51% attack. Okex says it fully absorbed the loss as per its user-protection policy while insisting that the attack did not cause any loss to the platform’s users. Also as part […]
The post Ethereum Classic 51% Attack: Okex Crypto Exchange Suffers $5.6 Million Loss, Contemplates Delisting ETC appeared first on Bitcoin News.
The solutions with HTML-Parser are all breakable, if they run only once:
html_to_text('<<b>script>alert("hacked")<</b>/script>
results in:
<script>alert("hacked")</script>
what you intend to prevent. if you use a HTML-Parser, count the Tags until zero are replaced:
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
self.containstags = False
def handle_starttag(self, tag, attrs):
self.containstags = True
def handle_data(self, d):
self.fed.append(d)
def has_tags(self):
return self.containstags
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
must_filtered = True
while ( must_filtered ):
s = MLStripper()
s.feed(html)
html = s.get_data()
must_filtered = s.has_tags()
return html
Using BeautifulSoup, html2text or the code from @Eloff, most of the time, it remains some html elements, javascript code...
So you can use a combination of these libraries and delete markdown formatting (Python 3):
import re
import html2text
from bs4 import BeautifulSoup
def html2Text(html):
def removeMarkdown(text):
for current in ["^[ #*]{2,30}", "^[ ]{0,30}\d\\\.", "^[ ]{0,30}\d\."]:
markdown = re.compile(current, flags=re.MULTILINE)
text = markdown.sub(" ", text)
return text
def removeAngular(text):
angular = re.compile("[{][|].{2,40}[|][}]|[{][*].{2,40}[*][}]|[{][{].{2,40}[}][}]|\[\[.{2,40}\]\]")
text = angular.sub(" ", text)
return text
h = html2text.HTML2Text()
h.images_to_alt = True
h.ignore_links = True
h.ignore_emphasis = False
h.skip_internal_links = True
text = h.handle(html)
soup = BeautifulSoup(text, "html.parser")
text = soup.text
text = removeAngular(text)
text = removeMarkdown(text)
return text
It works well for me but it can be enhanced, of course...