I second waffle paradox in recommending Beautiful Soup for parsing the HTML and then getting the tags, where the feeds are referenced. The code I usually use:
from BeautifulSoup import BeautifulSoup as parser
def detect_feeds_in_HTML(input_stream):
""" examines an open text stream with HTML for referenced feeds.
This is achieved by detecting all ``link`` tags that reference a feed in HTML.
:param input_stream: an arbitrary opened input stream that has a :func:`read` method.
:type input_stream: an input stream (e.g. open file or URL)
:return: a list of tuples ``(url, feed_type)``
:rtype: ``list(tuple(str, str))``
"""
# check if really an input stream
if not hasattr(input_stream, "read"):
raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))
result = []
# get the textual data (the HTML) from the input stream
html = parser(input_stream.read())
# find all links that have an "alternate" attribute
feed_urls = html.findAll("link", rel="alternate")
# extract URL and type
for feed_link in feed_urls:
url = feed_link.get("href", None)
# if a valid URL is there
if url:
result.append(url)
return result