I\'d like to extract the text from an HTML file using Python. I want essentially the same output I would get if I copied the text from a browser and pasted it into notepad.
install html2text using
pip install html2text
then,
>>> import html2text
>>>
>>> h = html2text.HTML2Text()
>>> # Ignore converting links from HTML
>>> h.ignore_links = True
>>> print h.handle("<p>Hello, <a href='http://earth.google.com/'>world</a>!")
Hello, world!
Found myself facing just the same problem today. I wrote a very simple HTML parser to strip incoming content of all markups, returning the remaining text with only a minimum of formatting.
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
class _DeHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.__text = []
def handle_data(self, data):
text = data.strip()
if len(text) > 0:
text = sub('[ \t\r\n]+', ' ', text)
self.__text.append(text + ' ')
def handle_starttag(self, tag, attrs):
if tag == 'p':
self.__text.append('\n\n')
elif tag == 'br':
self.__text.append('\n')
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self.__text.append('\n\n')
def text(self):
return ''.join(self.__text).strip()
def dehtml(text):
try:
parser = _DeHTMLParser()
parser.feed(text)
parser.close()
return parser.text()
except:
print_exc(file=stderr)
return text
def main():
text = r'''
<html>
<body>
<b>Project:</b> DeHTML<br>
<b>Description</b>:<br>
This small script is intended to allow conversion from HTML markup to
plain text.
</body>
</html>
'''
print(dehtml(text))
if __name__ == '__main__':
main()
NOTE: NTLK no longer supports clean_html
function
Original answer below, and an alternative in the comments sections.
Use NLTK
I wasted my 4-5 hours fixing the issues with html2text. Luckily i could encounter NLTK.
It works magically.
import nltk
from urllib import urlopen
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
raw = nltk.clean_html(html)
print(raw)
While alot of people mentioned using regex to strip html tags, there are a lot of downsides.
for example:
<p>hello world</p>I love you
Should be parsed to:
Hello world
I love you
Here's a snippet I came up with, you can cusomize it to your specific needs, and it works like a charm
import re
import html
def html2text(htm):
ret = html.unescape(htm)
ret = ret.translate({
8209: ord('-'),
8220: ord('"'),
8221: ord('"'),
160: ord(' '),
})
ret = re.sub(r"\s", " ", ret, flags = re.MULTILINE)
ret = re.sub("<br>|<br />|</p>|</div>|</h\d>", "\n", ret, flags = re.IGNORECASE)
ret = re.sub('<.*?>', ' ', ret, flags=re.DOTALL)
ret = re.sub(r" +", " ", ret)
return ret
You can use html2text method in the stripogram library also.
from stripogram import html2text
text = html2text(your_html_string)
To install stripogram run sudo easy_install stripogram
In Python 3.x you can do it in a very easy way by importing 'imaplib' and 'email' packages. Although this is an older post but maybe my answer can help new comers on this post.
status, data = self.imap.fetch(num, '(RFC822)')
email_msg = email.message_from_bytes(data[0][1])
#email.message_from_string(data[0][1])
#If message is multi part we only want the text version of the body, this walks the message and gets the body.
if email_msg.is_multipart():
for part in email_msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True) #to control automatic email-style MIME decoding (e.g., Base64, uuencode, quoted-printable)
body = body.decode()
elif part.get_content_type() == "text/html":
continue
Now you can print body variable and it will be in plaintext format :) If it is good enough for you then it would be nice to select it as accepted answer.