from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
print line
When p
Using BeautifulSoup, html2text or the code from @Eloff, most of the time, it remains some html elements, javascript code...
So you can use a combination of these libraries and delete markdown formatting (Python 3):
import re
import html2text
from bs4 import BeautifulSoup
def html2Text(html):
def removeMarkdown(text):
for current in ["^[ #*]{2,30}", "^[ ]{0,30}\d\\\.", "^[ ]{0,30}\d\."]:
markdown = re.compile(current, flags=re.MULTILINE)
text = markdown.sub(" ", text)
return text
def removeAngular(text):
angular = re.compile("[{][|].{2,40}[|][}]|[{][*].{2,40}[*][}]|[{][{].{2,40}[}][}]|\[\[.{2,40}\]\]")
text = angular.sub(" ", text)
return text
h = html2text.HTML2Text()
h.images_to_alt = True
h.ignore_links = True
h.ignore_emphasis = False
h.skip_internal_links = True
text = h.handle(html)
soup = BeautifulSoup(text, "html.parser")
text = soup.text
text = removeAngular(text)
text = removeMarkdown(text)
return text
It works well for me but it can be enhanced, of course...