how to follow meta refreshes in Python

asmaier

Here is a solution using BeautifulSoup and httplib2 (with certificate-based authentication):

import BeautifulSoup   # BeautifulSoup 3; with bs4 the import is: from bs4 import BeautifulSoup
import httplib2

def meta_redirect(content):
    soup = BeautifulSoup.BeautifulSoup(content)

    result = soup.find("meta", attrs={"http-equiv": "Refresh"})
    if result:
        # the content attribute looks like "5; url=http://example.com/"
        wait, text = result["content"].split(";", 1)
        text = text.strip()
        if text.lower().startswith("url="):
            url = text[4:]
            return url
    return None

def get_content(url, key, cert):

    h = httplib2.Http(".cache")
    # certificate-based client authentication
    h.add_certificate(key, cert, "")

    resp, content = h.request(url, "GET")

    # follow the chain of meta-refresh redirects
    redirect_url = meta_redirect(content)
    while redirect_url:
        resp, content = h.request(redirect_url, "GET")
        redirect_url = meta_redirect(content)

    return content  
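A minimal usage sketch (the URL and the key/cert file paths are placeholders, not from the original answer):

content = get_content("https://example.com/protected/page",
                      key="client_key.pem", cert="client_cert.pem")
print(content[:200])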

A similar solution using the requests and lxml libraries. It also does a simple check that the document being tested is actually HTML (a requirement in my implementation), and it can capture and reuse cookies through the requests library's sessions (sometimes necessary when redirection plus cookies are used as an anti-scraping mechanism).

import magic
import mimetypes
import requests
from lxml import html 
from urlparse import urljoin   # Python 2; in Python 3: from urllib.parse import urljoin

def test_for_meta_redirections(r):
    mime = magic.from_buffer(r.content, mime=True)
    extension = mimetypes.guess_extension(mime)
    # mimetypes may return '.htm' or '.html' depending on the Python version
    if extension in ('.html', '.htm'):
        html_tree = html.fromstring(r.text)
        # case-insensitive match on http-equiv="refresh"
        attrs = html_tree.xpath("//meta[translate(@http-equiv, 'REFSH', 'refsh') = 'refresh']/@content")
        if attrs and ";" in attrs[0]:
            wait, text = attrs[0].split(";", 1)
            text = text.strip()
            if text.lower().startswith("url="):
                url = text[4:]
                if not url.startswith('http'):
                    # relative URL, resolve against the current page
                    url = urljoin(r.url, url)
                return True, url
    return False, None


def follow_redirections(r, s):
    """
    Recursive function that follows meta refresh redirections if they exist.
    """
    redirected, url = test_for_meta_redirections(r)
    if redirected:
        r = follow_redirections(s.get(url), s)
    return r

Usage:

s = requests.Session()
r = s.get(url)
# test for and follow meta redirects
r = follow_redirections(r, s)
hoju

OK, it seems no library supports this, so I have been using the following code:

import urllib2    # Python 2; in Python 3 use urllib.request
import urlparse   # Python 2; in Python 3 use urllib.parse
import re

def get_hops(url):
    # matches the url=... part of a meta refresh tag, case-insensitively
    redirect_re = re.compile('<meta[^>]*?url=(.*?)["\']', re.IGNORECASE)
    hops = []
    while url:
        if url in hops:
            # redirect loop detected, stop
            url = None
        else:
            hops.insert(0, url)
            response = urllib2.urlopen(url)
            if response.geturl() != url:
                # record HTTP-level redirects as hops too
                hops.insert(0, response.geturl())
            # check for a meta refresh tag
            match = redirect_re.search(response.read())
            if match:
                # resolve a possibly relative URL against the current one
                url = urlparse.urljoin(url, match.groups()[0].strip())
            else:
                url = None
    # most recent hop first
    return hops
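For example, with a hypothetical starting URL, the returned list has the most recent hop first:

hops = get_hops('http://example.com/old-page')
print hops   # e.g. ['http://example.com/new-page', 'http://example.com/old-page']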

If you don't want to use bs4, you can use lxml like this:

from lxml.html import soupparser

def meta_redirect(content):
    root = soupparser.fromstring(content)
    result_url = root.xpath('//meta[@http-equiv="refresh"]/@content')
    if not result_url:
        return None
    result_url = str(result_url[0])
    # the url= prefix may be upper- or lower-case
    urls = result_url.split('url=') if 'url=' in result_url else result_url.split('URL=')
    return urls[1] if len(urls) >= 2 else None
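A sketch of how this meta_redirect could be combined with requests to follow the whole chain (example.com is a placeholder; relative URLs would still need urljoin handling):

import requests

r = requests.get("http://example.com/")
url = meta_redirect(r.text)
while url:
    r = requests.get(url)
    url = meta_redirect(r.text)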

Use BeautifulSoup or lxml to parse the HTML.
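For reference, a minimal sketch of the same check with the modern bs4 package (the first answer above uses the older BeautifulSoup 3 import); find_meta_refresh is just an illustrative name:

from bs4 import BeautifulSoup

def find_meta_refresh(content):
    # illustrative sketch, not from the original answers
    soup = BeautifulSoup(content, "html.parser")
    # match http-equiv="refresh" case-insensitively
    tag = soup.find("meta", attrs={"http-equiv": lambda v: v and v.lower() == "refresh"})
    if tag and "url=" in tag.get("content", "").lower():
        return tag["content"].split("=", 1)[1].strip()
    return None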
