from mechanize import Browser
br = Browser()
br.open(\'http://somewebpage\')
html = br.response().readlines()
for line in html:
print line
When p
If you want to strip all HTML tags the easiest way I found is using BeautifulSoup:
from bs4 import BeautifulSoup # Or from BeautifulSoup import BeautifulSoup
def stripHtmlTags(htmlTxt):
if htmlTxt is None:
return None
else:
return ''.join(BeautifulSoup(htmlTxt).findAll(text=True))
I tried the code of the accepted answer but I was getting "RuntimeError: maximum recursion depth exceeded", which didn't happen with the above block of code.
You can write your own function:
def StripTags(text):
finished = 0
while not finished:
finished = 1
start = text.find("<")
if start >= 0:
stop = text[start:].find(">")
if stop >= 0:
text = text[:start] + text[start+stop+1:]
finished = 0
return text
There's a simple way to this:
def remove_html_markup(s):
tag = False
quote = False
out = ""
for c in s:
if c == '<' and not quote:
tag = True
elif c == '>' and not quote:
tag = False
elif (c == '"' or c == "'") and tag:
quote = not quote
elif not tag:
out = out + c
return out
The idea is explained here: http://youtu.be/2tu9LTDujbw
You can see it working here: http://youtu.be/HPkNPcYed9M?t=35s
PS - If you're interested in the class(about smart debugging with python) I give you a link: http://www.udacity.com/overview/Course/cs259/CourseRev/1. It's free!
You're welcome! :)
Here's my solution for python 3.
import html
import re
def html_to_txt(html_text):
## unescape html
txt = html.unescape(html_text)
tags = re.findall("<[^>]+>",txt)
print("found tags: ")
print(tags)
for tag in tags:
txt=txt.replace(tag,'')
return txt
Not sure if it is perfect, but solved my use case and seems simple.
This is a quick fix and can be even more optimized but it will work fine. This code will replace all non empty tags with "" and strips all html tags form a given input text .You can run it using ./file.py input output
#!/usr/bin/python
import sys
def replace(strng,replaceText):
rpl = 0
while rpl > -1:
rpl = strng.find(replaceText)
if rpl != -1:
strng = strng[0:rpl] + strng[rpl + len(replaceText):]
return strng
lessThanPos = -1
count = 0
listOf = []
try:
#write File
writeto = open(sys.argv[2],'w')
#read file and store it in list
f = open(sys.argv[1],'r')
for readLine in f.readlines():
listOf.append(readLine)
f.close()
#remove all tags
for line in listOf:
count = 0;
lessThanPos = -1
lineTemp = line
for char in lineTemp:
if char == "<":
lessThanPos = count
if char == ">":
if lessThanPos > -1:
if line[lessThanPos:count + 1] != '<>':
lineTemp = replace(lineTemp,line[lessThanPos:count + 1])
lessThanPos = -1
count = count + 1
lineTemp = lineTemp.replace("<","<")
lineTemp = lineTemp.replace(">",">")
writeto.write(lineTemp)
writeto.close()
print "Write To --- >" , sys.argv[2]
except:
print "Help: invalid arguments or exception"
print "Usage : ",sys.argv[0]," inputfile outputfile"
I always used this function to strip HTML tags, as it requires only the Python stdlib:
For Python 3:
from io import StringIO
from html.parser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
super().__init__()
self.reset()
self.strict = False
self.convert_charrefs= True
self.text = StringIO()
def handle_data(self, d):
self.text.write(d)
def get_data(self):
return self.text.getvalue()
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
For Python 2:
from HTMLParser import HTMLParser
from StringIO import StringIO
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.text = StringIO()
def handle_data(self, d):
self.text.write(d)
def get_data(self):
return self.text.getvalue()
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()