Python - How to extract URLs (plain/html, quoted-printable/base64/7bit) from an email file

问题

I have searched in many places, but I haven't come across any logic or script that extracts the URLs from emails properly. So I am presenting what I came up with. This is working perfectly for me.

It can handle text/plain and text/html content types, and it supports the quoted-printable, base64, and 7bit transfer encodings.

NOTE: I wrote this as part of another task, you may have to tweak it to suit your need. Post any questions, and I can help answer.

Modules to import for this to work (the imports below are for Python 2 and BeautifulSoup 3):

import traceback
import BeautifulSoup
import re
from sets import Set
import email
import quopri, base64

Here are the methods I wrote to do this job (they take `self` because they are defined inside a class):

   def decode_quote_printable_part(self, quo_pri_part):
        """
        Decode the payload of a quoted-printable encoded MIME part.

        :param quo_pri_part: MIME message part (any object exposing ``get_payload()``)
        :return: the decoded payload as returned by ``quopri.decodestring``
                 (a byte string); an empty string if decoding fails
        """
        try:
            quo_pri_payload = quo_pri_part.get_payload()
            return quopri.decodestring(quo_pri_payload)
        except Exception as err:
            # Best-effort contract: a part that cannot be decoded is reported
            # and treated as empty instead of aborting the whole extraction.
            # The call form of print is valid on both Python 2 and Python 3.
            print("ERROR - Exception when decoding quoted printable: %s" % err)
            return ""

    def decode_base64_part(self, base64_part):
        """
        Decode base64 encoded MIME content.

        :param base64_part: either the base64 text itself (which is what
                            ``get_urls_list`` passes in) or a MIME message part
                            exposing ``get_payload()``
        :return: the decoded byte string; an empty string if decoding fails
        """
        try:
            payload = base64_part
            # Accept a message part as well as a raw payload, so the function
            # works with the same kind of argument as the quoted-printable decoder.
            if hasattr(base64_part, "get_payload"):
                payload = base64_part.get_payload()
            return base64.b64decode(payload)
        except Exception as err:
            # Best-effort contract: report the failure and treat the part as empty.
            # The call form of print is valid on both Python 2 and Python 3.
            print("ERROR - Exception when decoding base64 part: %s" % err)
            return ""

    def get_urls_from_html_part(self, html_code):
        """
        Parse HTML and collect the http/https targets of its anchor tags.

        The input should already be transfer-decoded.

        :param html_code: decoded HTML text
        :return: list of ``href`` values that start with ``http://`` or
                 ``https://`` (scheme compared case-insensitively, surrounding
                 whitespace removed); an empty list if parsing fails
        """
        try:
            soup = BeautifulSoup.BeautifulSoup(html_code)
            html_urls = []
            for link in soup.findAll("a"):
                url = link.get("href")
                if not url:
                    continue
                url = url.strip()
                # Anchor the check on the scheme: a plain substring test for
                # "http" also accepts values such as "mailto:http-team@..." or
                # relative paths, and rejects upper-case schemes like "HTTP://".
                if url.lower().startswith(("http://", "https://")):
                    html_urls.append(url)
            return html_urls
        except Exception as err:
            # Best-effort contract: report the failure and return no URLs.
            # The call form of print is valid on both Python 2 and Python 3.
            print("ERROR - Exception when parsing the html body: %s" % err)
            return []

    def get_urls_from_plain_part(self, email_data):
        """
        Scan plain text for http:// and https:// URLs.

        A URL starts at every occurrence of ``http://`` or ``https://`` (matched
        case-insensitively) and extends over the following run of permitted
        characters; it also ends where the next URL starts.

        :param email_data: plain text to parse; a Python 3 ``bytes`` value is
                           decoded as UTF-8 first
        :return: list of URLs, deduplicated, in order of first appearance;
                 an empty list if nothing is found or an exception occurs
        """
        try:
            if not email_data:
                return []
            # On Python 3 decoded payloads can arrive as bytes, which cannot be
            # searched with a text pattern. On Python 2 ``bytes`` is ``str`` and
            # this branch is skipped, so the input is used unchanged.
            if isinstance(email_data, bytes) and not isinstance(email_data, str):
                email_data = email_data.decode("utf-8", "replace")

            # Characters permitted inside a URL (compared after lower-casing).
            # The backslash is written as an explicit escape; the set is the
            # same as before.
            # NOTE(review): the set is permissive (it admits `<`, `{`, `|`, `"`
            # and trailing punctuation such as `.` or `,`) - confirm this is
            # intended before tightening it.
            allowed = frozenset(
                "abcdefghijklmnopqrstuvwxyz0123456789./\\~#%&()_-+=;?:[]!$*,@'^`<{|\""
            )

            # One scan for both schemes yields the start offsets in text order.
            # Collecting http:// and https:// offsets separately and concatenating
            # them produced out-of-order pairs, i.e. empty or missing URLs.
            starts = [m.start()
                      for m in re.finditer(r"https?://", email_data, re.IGNORECASE)]
            limits = starts[1:] + [len(email_data)]

            urls = []
            seen = set()
            for begin, limit in zip(starts, limits):
                end = begin
                while end < limit and email_data[end].lower() in allowed:
                    end += 1
                url = email_data[begin:end]
                if url not in seen:
                    seen.add(url)
                    urls.append(url)
            return urls

        except Exception as err:
            # Best-effort contract: report the failure and return no URLs.
            # The call form of print is valid on both Python 2 and Python 3.
            print("ERROR - Exception when parsing plain text for urls: %s" % err)
            return []

    def get_urls_list(self, msg):
        """
        Collect all the http/https URLs from an email.

        Walks every part of the message, transfer-decodes the ``plain`` and
        ``html`` parts and extracts their URLs with the matching helper.

        :param msg: email message object
        :return: A dictionary of URLs => final_urls = {'http': [], 'https': []};
                 strings that start with neither scheme are left out
        """
        urls = []
        for part in msg.walk():
            # Multipart containers only hold other parts; walk() visits the
            # children on their own.
            if part.is_multipart():
                continue
            subtype = part.get_content_subtype()
            if subtype not in ("plain", "html"):
                # Decide before decoding so attachments are never decoded
                # just to be thrown away.
                continue

            # Transfer-encoding names are case-insensitive (RFC 2045) and the
            # header value may carry surrounding whitespace.
            encoding = str(part.get("Content-Transfer-Encoding") or "").strip().lower()
            if encoding == "quoted-printable":
                decoded_part = self.decode_quote_printable_part(part)
            elif encoding == "base64":
                decoded_part = self.decode_base64_part(part.get_payload())
            else:
                decoded_part = part.get_payload()

            if subtype == "plain":
                urls.extend(self.get_urls_from_plain_part(decoded_part))
            else:
                urls.extend(self.get_urls_from_html_part(decoded_part))

        final_urls = {'http': [], 'https': []}
        for url in urls:
            # Classify by the leading scheme. A substring test files an https URL
            # that embeds "http://" in its query string under 'http', and files
            # any non-http string under 'https'.
            scheme = url.strip().lower()
            if scheme.startswith("http://"):
                final_urls['http'].append(url)
            elif scheme.startswith("https://"):
                final_urls['https'].append(url)
        return final_urls

Here is how to call this API:

# Start from empty buckets so final_urls is always defined, even on failure.
final_urls = {'http': [], 'https': []}
try:
    with open(filename, 'r') as f:
        data = f.read()
    msg = email.message_from_string(data)
    final_urls = self.get_urls_list(msg)
except Exception as err:
    # Still best-effort, but the failure is reported instead of silently
    # dropped. Unlike a bare ``except:``, this lets KeyboardInterrupt and
    # SystemExit propagate.
    print("ERROR - Exception when extracting urls from email file: %s" % err)

来源：https://stackoverflow.com/questions/33380726/python-how-to-extract-urls-plain-html-quote-printable-base64-7bit-from-an-e

标签

python

url

base64

quoted-printable