from mechanize import Browser
# Download the page and print the raw response line by line.
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # The call form of print is valid on both Python 2 and Python 3.
    print(line)
When p
The Beautiful Soup package does this immediately for you.
from bs4 import BeautifulSoup
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed, so results can differ between machines.
soup = BeautifulSoup(html, "html.parser")
# get_text() returns the document's text with the markup removed.
text = soup.get_text()
print(text)
You can use either a different HTML parser (like lxml, or Beautiful Soup) -- one that offers functions to extract just text. Or, you can run a regex on your line string that strips out the tags. See Python docs for more.
I have used Eloff's answer successfully for Python 3.1 [many thanks!].
I upgraded to Python 3.2.3, and ran into errors.
The solution, provided here thanks to the responder Thomas K, is to insert super().__init__()
into the following code:
def __init__(self):
    # Version quoted as the starting point: no base-class initializer is
    # called here, only reset() followed by creating the empty buffer.
    self.reset()
    self.fed = []
... in order to make it look like this:
def __init__(self):
    # Run the base-class initializer before anything else.
    # NOTE(review): presumably the parent is HTMLParser -- confirm against
    # the class this method is pasted into.
    super().__init__()
    self.reset()
    self.fed = []  # starts out as an empty list
... and it will work for Python 3.2.3.
Again, thanks to Thomas K for the fix and for Eloff's original code provided above!
If you need to preserve HTML entities (e.g. &amp;), I added a "handle_entityref" method to Eloff's answer.
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """Collect the text of an HTML document, dropping tags but keeping
    character and entity references (e.g. ``&amp;``) in their escaped form.

    Usage: ``feed()`` the markup, ``close()`` the parser, then call
    ``get_data()`` to obtain the accumulated text.
    """

    def __init__(self):
        # Call the base initializer explicitly rather than via super() so the
        # class works on Python 3 and with Python 2's old-style HTMLParser.
        HTMLParser.__init__(self)
        self.reset()
        # Python 3.5+ decodes references before handle_data() by default,
        # which would bypass the handlers below; switch that off so the
        # references reach us untouched. Harmless on older versions.
        self.convert_charrefs = False
        self.fed = []

    def handle_data(self, d):
        """Store a run of plain text."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Re-emit a named reference such as ``&amp;`` unchanged."""
        self.fed.append('&%s;' % name)

    def handle_charref(self, name):
        """Re-emit a numeric reference such as ``&#169;`` instead of dropping it."""
        self.fed.append('&#%s;' % name)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.fed)
def html_to_text(html):
    """Feed *html* through an MLStripper and return the text it collected.

    :param html: markup to process, as a string.
    :return: the collected text as one string.
    """
    s = MLStripper()
    s.feed(html)
    # close() makes the parser process anything it is still buffering
    # (e.g. a trailing, unterminated "&..." sequence) before we read the result.
    s.close()
    return s.get_data()
A Python 3 adaptation of søren-løvborg's answer
from html.parser import HTMLParser
from html.entities import html5
class HTMLTextExtractor(HTMLParser):
    """Collect the text of an HTML document with references decoded.

    Adaptation of http://stackoverflow.com/a/7778368/196732

    :param convert_charrefs: forwarded to ``HTMLParser``. With the default
        (``True``) the parser decodes references itself before calling
        ``handle_data``; with ``False`` they are routed through
        ``handle_charref`` / ``handle_entityref`` below.
    """

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.result = []

    def handle_data(self, d):
        """Store a run of text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric reference; *number* is e.g. ``'169'`` or ``'xA9'``."""
        codepoint = int(number[1:], 16) if number[0] in ('x', 'X') else int(number)
        try:
            self.result.append(chr(codepoint))
        except (ValueError, OverflowError):
            # Outside the Unicode range: substitute U+FFFD instead of crashing.
            self.result.append('\ufffd')

    def handle_entityref(self, name):
        """Decode a named reference; unknown names are skipped."""
        # html5 is keyed by the name including its trailing ';' and maps to
        # the replacement *string* (not a code point).
        replacement = html5.get(name + ';')
        if replacement is not None:
            self.result.append(replacement)

    def get_text(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.result)
def html_to_text(html):
    """Feed *html* through an HTMLTextExtractor and return the collected text.

    :param html: markup to process, as a string.
    :return: the collected text as one string.
    """
    s = HTMLTextExtractor()
    s.feed(html)
    # close() makes the parser process anything it is still buffering
    # (e.g. a trailing, unterminated "&..." sequence) before we read the result.
    s.close()
    return s.get_text()
An lxml.html-based solution (lxml is a native library and can be more performant than a pure python solution).
from lxml import html
## Option 1: build the tree from a file-like object or URL
## (file_like_object_or_url is a placeholder; it is not defined in this snippet)
tree = html.parse(file_like_object_or_url)
## Option 2: build the tree from a string (this rebinds `tree`)
tree = html.fromstring('safe <script>unsafe</script> safe')
## text_content() includes the text inside <script>, as the output below shows
print(tree.text_content().strip())
### OUTPUT: 'safe unsafe safe'
from lxml import html
from lxml.html.clean import clean_html
## Sample markup: a <script> element followed by a <span> holding the visible text
tree = html.fromstring("""<script>dangerous</script><span class="item-summary">
Detailed answers to any questions you might have
</span>""")
## text only: clean the tree first, then extract its text; per the output
## below, the <script> content is no longer present
print(clean_html(tree).text_content().strip())
### OUTPUT: 'Detailed answers to any questions you might have'
Also see http://lxml.de/lxmlhtml.html#cleaning-up-html for exactly what the lxml.html.clean Cleaner does.
If you need more control over what exactly is sanitized before converting to text then you might want to use the lxml Cleaner explicitly by passing the options you want in the constructor, e.g:
# Cleaner lives next to clean_html in lxml.html.clean; import it here so the
# snippet does not fail with a NameError.
from lxml.html.clean import Cleaner

# Every sanitizing option is spelled out so each can be toggled individually.
cleaner = Cleaner(page_structure=True,
                  meta=True,
                  embedded=True,
                  links=True,
                  style=True,
                  processing_instructions=True,
                  inline_style=True,
                  scripts=True,
                  javascript=True,
                  comments=True,
                  frames=True,
                  forms=True,
                  annoying_tags=True,
                  remove_unknown_tags=True,
                  safe_attrs_only=True,
                  safe_attrs=frozenset(['src','color', 'href', 'title', 'class', 'name', 'id']),
                  remove_tags=('span', 'font', 'div')
                  )
# unsafe_html is a placeholder for the markup to sanitize; it is not defined
# in this snippet.
sanitized_html = cleaner.clean_html(unsafe_html)
If you need more control over how plain text is generated then instead of text_content()
you can use lxml.etree.tostring:
from lxml.etree import tostring

# Serialize with method='text'; with encoding='utf-8' the result is bytes,
# so decode it before printing.
plain_bytes = tostring(tree, method='text', encoding='utf-8')
print(plain_bytes.decode('utf-8'))