Strip HTML from strings in Python

前端 未结 26 2307
难免孤独
难免孤独 2020-11-22 02:50
from mechanize import Browser

# Open the page and print the raw HTML response one line at a time.
br = Browser()
# The quotes must be plain: the backslash-escaped form (\'...\') outside a
# string literal is a syntax error.
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
    # Single-argument call form prints the same under Python 2 and also
    # parses under Python 3.
    print(line)

When p

26条回答
  •  慢半拍i
    慢半拍i (楼主)
    2020-11-22 03:14

    This is a quick fix that could be optimized further, but it works fine. The code replaces every non-empty tag with "" and so strips all HTML tags from the given input text. You can run it with ./file.py input output

        #!/usr/bin/python
    import sys
    
    def replace(strng, replaceText):
        """Remove every occurrence of ``replaceText`` from ``strng``.

        Removal is repeated until no occurrence is left, so a match that only
        forms after an earlier match has been cut out is removed as well
        (``replace("aabb", "ab")`` returns ``""``).

        Args:
            strng: The text to clean.
            replaceText: The substring to delete.

        Returns:
            ``strng`` with all occurrences of ``replaceText`` removed. An
            empty ``replaceText`` returns ``strng`` unchanged.
        """
        # str.find("") always reports a match at index 0, so without this
        # guard the loop below would never terminate.
        if not replaceText:
            return strng

        width = len(replaceText)
        pos = strng.find(replaceText)
        while pos != -1:
            strng = strng[:pos] + strng[pos + width:]
            pos = strng.find(replaceText)
        return strng
    
    
    def _strip_tags(line):
        """Return ``line`` with every non-empty ``<...>`` tag removed.

        The scan walks the original ``line``; each time a ``>`` closes a
        previously seen ``<``, the text between them (inclusive) is removed
        from the result everywhere it occurs. The literal ``<>`` is left in
        place. Afterwards the ``&lt;`` and ``&gt;`` entities are decoded.
        """
        stripped = line
        tag_start = -1
        # Indices always refer to the untouched ``line``; only ``stripped``
        # shrinks as tags are removed.
        for index, char in enumerate(line):
            if char == "<":
                tag_start = index
            elif char == ">" and tag_start > -1:
                tag = line[tag_start:index + 1]
                if tag != "<>":
                    stripped = replace(stripped, tag)
                    # Reset only after a real removal; an empty "<>" keeps
                    # the recorded "<" position.
                    tag_start = -1
        # Decode the escaped angle brackets (replacing "<" with "<" and ">"
        # with ">" would be a no-op).
        stripped = stripped.replace("&lt;", "<")
        stripped = stripped.replace("&gt;", ">")
        return stripped


    try:
        # Read the whole input before the output is opened, so an unreadable
        # input (or input and output being the same path) does not truncate
        # the output file first.
        with open(sys.argv[1], 'r') as source:
            lines = source.readlines()

        # Remove all tags and write the result.
        with open(sys.argv[2], 'w') as writeto:
            for line in lines:
                writeto.write(_strip_tags(line))
        print("Write To --- > %s" % sys.argv[2])
    except Exception:
        # Top-level boundary: missing arguments and I/O errors end up here,
        # while KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Help: invalid arguments or exception")
        print("Usage :  %s  inputfile outputfile" % sys.argv[0])
    

提交回复
热议问题