Strip HTML from strings in Python

前端未结

关注

 26  2356

难免孤独 2020-11-22 02:50

# Open a page with mechanize and print the response line by line.
from mechanize import Browser

br = Browser()
# The quotes were backslash-escaped outside of any string literal, which is
# a syntax error; a plain string literal is what was intended.
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # The parenthesised form is valid for the Python 2 print statement and
    # for the Python 3 print function alike.
    print(line)

When p

26条回答

隐瞒了意图╮ (楼主)

2020-11-22 03:02

The HTMLParser-based solutions can all be bypassed if they make only a single pass over the input:

html_to_text('<<b>script>alert("hacked")<</b>/script>')

results in:

<script>alert("hacked")</script>

which is exactly what you intended to prevent. If you use an HTML parser, keep stripping until a pass finds no more tags:

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    """Collect the text of an HTML fragment and record whether markup was seen.

    Feed markup with ``feed()``; ``get_data()`` then returns the
    concatenated text passed to ``handle_data`` and ``has_tags()`` reports
    whether any markup construct (start tag, end tag, comment, declaration
    or processing instruction) was encountered.

    Every kind of markup sets the flag, not just start tags.  Otherwise an
    input such as ``<</b>script>`` is reduced to ``<script>`` while
    ``has_tags()`` stays False, and a caller that repeats stripping until
    no tags remain would stop one pass too early.
    """

    def __init__(self):
        # Call the base initialiser explicitly instead of only reset(): the
        # base class may set up state beyond what reset() covers.  The
        # unbound-call form is used instead of super() so it also works
        # where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.fed = []
        self.containstags = False

    def handle_starttag(self, tag, attrs):
        self.containstags = True

    def handle_endtag(self, tag):
        # A lone end tag is removed from the text too, so the remaining
        # data may have been joined into a new tag.
        self.containstags = True

    def handle_comment(self, data):
        self.containstags = True

    def handle_decl(self, decl):
        self.containstags = True

    def handle_pi(self, data):
        self.containstags = True

    def unknown_decl(self, data):
        self.containstags = True

    def handle_data(self, d):
        self.fed.append(d)

    def has_tags(self):
        """Return True if any markup was seen since construction."""
        return self.containstags

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return *html* with markup removed, re-parsing until no tags are found.

    Each pass feeds the current text to a fresh ``MLStripper`` and keeps
    only the collected data.  The loop ends after the first pass in which
    the stripper reports that it saw no tags, so text that only becomes a
    tag once other tags are removed is stripped on a later pass.
    """
    while True:
        stripper = MLStripper()
        stripper.feed(html)
        html = stripper.get_data()
        if not stripper.has_tags():
            return html

0 讨论(0)

查看其它26个回答