Strip HTML from strings in Python

前端 未结 26 2311
难免孤独
难免孤独 2020-11-22 02:50
from mechanize import Browser

# Fetch the page and echo its raw HTML one line at a time.
br = Browser()
# Plain quotes: the backslash-escaped quotes in the scraped snippet were a
# syntax error outside of a string literal.
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # Single-argument call form prints the same text on Python 2 and Python 3.
    print(line)

When p

相关标签:
26条回答
  • 2020-11-22 03:10

    If you want to strip all HTML tags the easiest way I found is using BeautifulSoup:

    from bs4 import BeautifulSoup  # Or from BeautifulSoup import BeautifulSoup
    
    def stripHtmlTags(htmlTxt):
        """Return the text content of ``htmlTxt`` with all markup removed.

        Args:
            htmlTxt: HTML markup as a string, or ``None``.

        Returns:
            ``None`` when ``htmlTxt`` is ``None``; otherwise the concatenation
            of every text node BeautifulSoup finds in the document.
        """
        # Guard first so that None never reaches the parser.
        if htmlTxt is None:
            return None
        # findAll(text=True) yields only the text nodes; joining them drops
        # every tag while keeping the text in document order.
        return ''.join(BeautifulSoup(htmlTxt).findAll(text=True))
    

    I tried the code of the accepted answer but I was getting "RuntimeError: maximum recursion depth exceeded", which didn't happen with the above block of code.

    0 讨论(0)
  • 2020-11-22 03:10

    You can write your own function:

    def StripTags(text):
        """Remove every ``<...>`` span from ``text`` and return the result.

        The leftmost ``<`` and the first ``>`` that follows it are cut out
        (together with everything between them), and the scan restarts on the
        shortened string.  Scanning stops as soon as there is no ``<`` left or
        the leftmost ``<`` has no ``>`` after it; whatever remains is returned
        unchanged.
        """
        while True:
            before, opener, rest = text.partition("<")
            if not opener:
                # No "<" anywhere: nothing left to strip.
                return text
            _inside, closer, after = rest.partition(">")
            if not closer:
                # Unterminated "<": leave it in place and stop.
                return text
            text = before + after
    
    0 讨论(0)
  • 2020-11-22 03:14

    There's a simple way to do this:

    def remove_html_markup(s):
        """Return ``s`` with HTML tags removed, honouring quoted attribute values.

        A small state machine walks the string one character at a time:

        * ``<`` opens a tag and ``>`` closes it, unless the character sits
          inside a quoted attribute value;
        * inside a tag, ``"`` or ``'`` opens a quoted value that is closed only
          by the *same* quote character, so ``href="don't"`` no longer leaves
          the scanner stuck inside the tag;
        * every other character outside a tag is copied to the output.

        A ``>`` that appears outside any tag is dropped, as before.

        Args:
            s: the markup to clean (any iterable of single characters).

        Returns:
            The text with all tags removed.
        """
        tag = False
        quote = None  # the quote character that opened the current value, if any
        out = []      # collected pieces; joined once at the end

        for c in s:
            if quote is not None:
                # Inside a quoted attribute value: only the matching quote ends it.
                if c == quote:
                    quote = None
            elif c == '<':
                tag = True
            elif c == '>':
                tag = False
            elif tag:
                if c == '"' or c == "'":
                    quote = c
            else:
                out.append(c)

        return "".join(out)
    

    The idea is explained here: http://youtu.be/2tu9LTDujbw

    You can see it working here: http://youtu.be/HPkNPcYed9M?t=35s

    PS - If you're interested in the class (about smart debugging with Python), here is a link: http://www.udacity.com/overview/Course/cs259/CourseRev/1. It's free!

    You're welcome! :)

    0 讨论(0)
  • 2020-11-22 03:14

    Here's my solution for python 3.

    import html
    import re
    
    def html_to_txt(html_text):
        """Convert an HTML fragment to plain text.

        Tags (``<`` followed by one or more non-``>`` characters and a ``>``)
        are removed first, and only then are character references such as
        ``&lt;`` or ``&amp;`` unescaped.  Doing it in this order keeps escaped
        text intact: ``1 &lt; 2 and 3 &gt; 2`` becomes ``1 < 2 and 3 > 2``
        instead of having ``< 2 and 3 >`` mistaken for a tag and deleted.

        Args:
            html_text: the HTML markup as a string.

        Returns:
            The text with tags removed and character references decoded.
        """
        # One regex pass removes every tag; no debug output is printed.
        without_tags = re.sub(r"<[^>]+>", "", html_text)
        return html.unescape(without_tags)
    

    Not sure if it is perfect, but solved my use case and seems simple.

    0 讨论(0)
  • 2020-11-22 03:14

    This is a quick fix and could be optimized further, but it works fine. This code replaces all non-empty tags with "" and strips all HTML tags from a given input text. You can run it using ./file.py input output

        #!/usr/bin/python
    import sys
    
    def replace(strng,replaceText):
        """Remove ``replaceText`` from ``strng`` until no occurrence is left.

        Unlike a single ``str.replace`` pass, the search restarts after every
        removal, so occurrences that only appear once neighbouring text has
        been joined are removed as well (``replace("aabb", "ab")`` gives ``""``).

        Args:
            strng: the string to clean.
            replaceText: the substring to remove.

        Returns:
            ``strng`` with every occurrence of ``replaceText`` removed.  An
            empty ``replaceText`` returns ``strng`` unchanged.
        """
        # find("") always reports position 0 and removing an empty string
        # never shortens the input, so an empty needle would loop forever.
        if not replaceText:
            return strng
        rpl = strng.find(replaceText)
        while rpl != -1:
            strng = strng[:rpl] + strng[rpl + len(replaceText):]
            rpl = strng.find(replaceText)
        return strng
    
    
    # Command-line driver: read sys.argv[1], strip tags line by line, and
    # write the result to sys.argv[2].
    lessThanPos = -1
    count = 0
    listOf = []

    try:
        # The output file is opened before the input is read; both are closed
        # by the with-blocks even when an error occurs part-way through.
        with open(sys.argv[2], 'w') as writeto:
            # Read the whole input into the list.
            with open(sys.argv[1], 'r') as f:
                listOf.extend(f.readlines())

            # Remove all tags.
            for line in listOf:
                lessThanPos = -1
                lineTemp = line

                # Positions index into the untouched `line`; removals are
                # applied to `lineTemp`.
                for count, char in enumerate(line):
                    if char == "<":
                        lessThanPos = count
                    if char == ">" and lessThanPos > -1:
                        tag = line[lessThanPos:count + 1]
                        # A bare "<>" is left in the text.
                        if tag != '<>':
                            lineTemp = replace(lineTemp, tag)
                            lessThanPos = -1

                # Decode escaped angle brackets.  The ";"-terminated form is
                # handled first so that "&lt;" does not leave a stray ";".
                for entity, symbol in (("&lt;", "<"), ("&lt", "<"),
                                       ("&gt;", ">"), ("&gt", ">")):
                    lineTemp = lineTemp.replace(entity, symbol)
                writeto.write(lineTemp)
        print("Write To --- > %s" % sys.argv[2])
    except (IndexError, IOError):
        # IndexError: a command-line argument is missing.
        # IOError: a file could not be opened, read or written.
        print("Help: invalid arguments or exception")
        print("Usage :  %s  inputfile outputfile" % sys.argv[0])
    
    0 讨论(0)
  • 2020-11-22 03:15

    I always used this function to strip HTML tags, as it requires only the Python stdlib:

    For Python 3:

    from io import StringIO
    from html.parser import HTMLParser
    
    class MLStripper(HTMLParser):
        """HTML parser that keeps only the text between tags."""

        def __init__(self):
            # HTMLParser.__init__ already calls reset(), so no second call is
            # needed; convert_charrefs=True makes the parser hand character
            # references to handle_data already decoded.
            super().__init__(convert_charrefs=True)
            # Kept from the original snippet; presumably only meaningful on
            # early Python 3 releases -- TODO confirm before removing.
            self.strict = False
            self.text = StringIO()

        def handle_data(self, d):
            """Append a chunk of text reported by the parser."""
            self.text.write(d)

        def get_data(self):
            """Return all text collected so far."""
            return self.text.getvalue()

    def strip_tags(html):
        """Return ``html`` with all markup removed.

        Args:
            html: the HTML markup as a string.

        Returns:
            The text content, with character references decoded.
        """
        s = MLStripper()
        s.feed(html)
        # feed() may hold back trailing text while it waits for more input
        # (for example a tail containing a bare "&"); close() flushes it so
        # that nothing is lost.
        s.close()
        return s.get_data()
    

    For Python 2:

    from HTMLParser import HTMLParser
    from StringIO import StringIO
    
    class MLStripper(HTMLParser):
        """HTML parser (Python 2 variant) that keeps only the text between tags."""

        def __init__(self):
            self.reset()
            self.text = StringIO()

        def handle_data(self, d):
            """Append a chunk of text reported by the parser."""
            self.text.write(d)

        def get_data(self):
            """Return all text collected so far."""
            return self.text.getvalue()

    def strip_tags(html):
        """Return ``html`` with all markup removed.

        Args:
            html: the HTML markup as a string.

        Returns:
            The text chunks the parser reported, concatenated.
        """
        s = MLStripper()
        s.feed(html)
        # feed() may hold back trailing text while it waits for more input;
        # close() flushes whatever is still buffered.
        s.close()
        return s.get_data()
    
    0 讨论(0)
提交回复
热议问题