Strip HTML from strings in Python

前端 未结 26 2348
难免孤独
难免孤独 2020-11-22 02:50
# Open a page with mechanize and print the response one line at a time.
from mechanize import Browser

br = Browser()
br.open('http://somewebpage')
# The response is read as a list of lines; nothing is stripped or parsed here.
html = br.response().readlines()
for line in html:
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(line)

When p

26条回答
  •  再見小時候
    2020-11-22 03:02

    With BeautifulSoup, html2text, or the code from @Eloff, some HTML elements, JavaScript code, and similar leftovers usually remain in the output.

    So you can use a combination of these libraries and delete markdown formatting (Python 3):

    import re
    import html2text
    from bs4 import BeautifulSoup
    def html2Text(html):
        """Convert an HTML string to plain text.

        The input is first rendered with html2text, the result is passed
        through BeautifulSoup's ``html.parser`` and reduced to its text, and
        finally template-style placeholders and Markdown line prefixes are
        each replaced with a single space.

        Args:
            html: The HTML source to convert.

        Returns:
            The text that remains after all clean-up passes.
        """
        def removeMarkdown(text):
            """Replace Markdown prefixes at the start of each line with a space."""
            # Raw strings hand the backslashes to the regex engine unchanged.
            # Written as a normal string, "\d\\\." collapses to `\d\\.`, whose
            # trailing dot matches ANY character instead of a literal ".".
            prefixes = (
                r"^[ #*]{2,30}",      # run of spaces, '#' and '*' characters
                r"^[ ]{0,30}\d\\\.",  # digit + backslash-escaped dot: "1\."
                r"^[ ]{0,30}\d\.",    # digit + dot: "1."
            )
            for current in prefixes:
                # MULTILINE makes "^" match at the start of every line.
                markdown = re.compile(current, flags=re.MULTILINE)
                text = markdown.sub(" ", text)
            return text

        def removeAngular(text):
            """Replace short template placeholders with a space.

            Matches ``{|...|}``, ``{*...*}``, ``{{...}}`` and ``[[...]]``
            whose content is 2 to 40 characters long and stays on one line.
            """
            angular = re.compile(
                r"[{][|].{2,40}[|][}]"
                r"|[{][*].{2,40}[*][}]"
                r"|[{][{].{2,40}[}][}]"
                r"|\[\[.{2,40}\]\]"
            )
            return angular.sub(" ", text)

        # Converter options; the attribute names are html2text's own settings.
        h = html2text.HTML2Text()
        h.images_to_alt = True
        h.ignore_links = True
        h.ignore_emphasis = False
        h.skip_internal_links = True
        text = h.handle(html)

        # Second pass: keep only the text of whatever markup is still present.
        soup = BeautifulSoup(text, "html.parser")
        text = soup.text

        # Placeholders are removed before the line-prefix clean-up, as in the
        # original ordering.
        text = removeAngular(text)
        text = removeMarkdown(text)
        return text
    

    It works well for me but it can be enhanced, of course...

提交回复
热议问题