How to read eml file in python?

前端 未结 4 1424
滥情空心
滥情空心 2021-01-31 12:37

I do not know how to load an .eml file in Python 3.4.
I want to list all the .eml files and read each of them in Python.

\"ente

相关标签:
4条回答
  • 2021-01-31 12:55

    This is how you get content of an e-mail i.e. *.eml file. This works perfectly on Python2.5 - 2.7. Try it on 3. It should work as well.

    
    
    from email import message_from_file
    import os
    
    # Directory that receives every extracted attachment:
    path = "./msgfiles"

    # To keep attachments in memory instead of on disk, change the two functions below:

    def file_exists (f):
        """Tells whether a file named f is already present in the storage directory.
        f -- File name, joined onto the module-level path
        Returns bool
        """
        candidate = os.path.join(path, f)
        return os.path.exists(candidate)
    
    def save_file (fn, cont):
        """Saves cont to a file fn inside the storage directory.
        fn   -- File name, joined onto the module-level path
        cont -- Bytes to write (the file is opened in binary mode)
        """
        # The with block closes the handle even when write() raises.
        with open(os.path.join(path, fn), "wb") as out:
            out.write(cont)
    
    def construct_name (id, fn):
        """Constructs a file name out of messages ID and packed file name.
        id -- Message ID string; only its first two dot-separated pieces are used
        fn -- File name of the packed file
        Returns str: the joined ID pieces, a dot, then fn
        """
        pieces = id.split(".")
        # Join whatever exists of the first two pieces, so an ID without any dot
        # is used as-is instead of raising IndexError.
        prefix = "".join(pieces[:2])
        return prefix+"."+fn
    
    def disqo (s):
        """Strips surrounding whitespace, then one pair of matching quotes if present.
        s -- String that may be wrapped in '...' or "..."
        Returns the unwrapped string; a string without matching quotes is only stripped.
        """
        stripped = s.strip()
        for quote in ("'", '"'):
            if stripped.startswith(quote) and stripped.endswith(quote):
                return stripped[1:-1]
        return stripped
    
    def disgra (s):
        """Strips surrounding whitespace, then an enclosing <...> pair if present.
        s -- HTML-like tag, e-mail address or e-mail ID
        Returns the text between the angle brackets, or the stripped string unchanged.
        """
        trimmed = s.strip()
        is_bracketed = trimmed.startswith("<") and trimmed.endswith(">")
        return trimmed[1:-1] if is_bracketed else trimmed
    
    def _payload_text (m):
        """Returns the decoded payload of a non-multipart message part as text.
        m -- email.Message() or mailbox.Message()
        On Python 3 get_payload(decode=True) yields bytes, which cannot be added to
        the str accumulators used by pullout(). They are decoded with the charset
        declared by the part, falling back to UTF-8 when none is declared or the
        declared name is unknown. Undecodable bytes are replaced, never raised.
        """
        payload = m.get_payload(decode=True)
        if payload is None: return ""
        if isinstance(payload, str): return payload # Python 2: already a str
        charset = m.get_content_charset() or "utf-8"
        try: return payload.decode(charset, "replace")
        except LookupError: return payload.decode("utf-8", "replace")

    def pullout (m, key):
        """Extracts content from an e-mail message.
        This works for multipart and nested multipart messages too.
        m   -- email.Message() or mailbox.Message()
        key -- Initial message ID (some string)
        Returns tuple(Text, Html, Files, Parts)
        Text  -- All text from all text/plain parts, concatenated.
        Html  -- All HTML from all text/html parts, concatenated.
        Files -- Dictionary mapping each packed file name to a tuple
                 (name it was saved under, content-id or None).
        Parts -- Number of non-multipart parts found in the message.
        """
        Html = ""
        Text = ""
        Files = {}
        Parts = 0
        if not m.is_multipart():
            fn = m.get_filename()
            if fn: # It's an attachment
                cfn = construct_name(key, fn)
                Files[fn] = (cfn, None)
                # Skip the write when the file was already extracted earlier.
                if not file_exists(cfn):
                    save_file(cfn, m.get_payload(decode=True))
                return Text, Html, Files, 1
            # Not an attachment!
            # See where this belongs. Text, Html or some other data:
            cp = m.get_content_type()
            if cp=="text/plain": Text += _payload_text(m)
            elif cp=="text/html": Html += _payload_text(m)
            else:
                # Something else!
                # Extract a message ID and a file name if there is one:
                # This is some packed file and name is contained in content-type header
                # instead of content-disposition header explicitly
                cp = m.get("content-type") or ""
                # A missing Content-ID header comes back as None, which has no strip().
                try: cid = disgra(m.get("content-id"))
                except AttributeError: cid = None
                # Find file name:
                o = cp.find("name=")
                if o==-1: return Text, Html, Files, 1
                ox = cp.find(";", o)
                if ox==-1: ox = None
                o += 5; fn = cp[o:ox]
                fn = disqo(fn)
                cfn = construct_name(key, fn)
                Files[fn] = (cfn, cid)
                if not file_exists(cfn):
                    save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        # This IS a multipart message.
        # Its payload is a list of Message objects; each goes back to pullout().
        for pl in m.get_payload():
            t, h, f, p = pullout(pl, key)
            Text += t; Html += h; Files.update(f); Parts += p
        return Text, Html, Files, Parts
    
    def extract (msgfile, key):
        """Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
        msgfile -- A file-like readable object, opened in text or binary mode
        key     -- Some ID string for that particular Message. Can be a file name or anything.
        Returns dict()
        Keys: from, to, subject, date, text, html, parts[, files]
        Key files will be present only when message contained binary files.
        For more see __doc__ for pullout() and caption() functions.
        """
        # A zero-length read consumes nothing but reveals whether the file yields
        # str or bytes. On Python 3 message_from_file() only handles str, so a
        # file opened with "rb" must go through message_from_binary_file().
        if isinstance(msgfile.read(0), str):
            m = message_from_file(msgfile)
        else:
            from email import message_from_binary_file
            m = message_from_binary_file(msgfile)
        From, To, Subject, Date = caption(m)
        Text, Html, Files, Parts = pullout(m, key)
        msg = {"subject": Subject, "from": From, "to": To, "date": Date,
            "text": Text.strip(), "html": Html.strip(), "parts": Parts}
        if Files: msg["files"] = Files
        return msg
    
    def caption (origin):
        """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
        origin -- Message() object
        Returns tuple(From, To, Subject, Date)
        If message doesn't contain one/more of them, the empty strings will be returned.
        """
        def field (name):
            # Message.has_key() is gone on Python 3; get() works on both 2 and 3
            # and returns None for a missing header.
            value = origin.get(name)
            if value is None: return ""
            # str() also covers header values that are not plain strings.
            return str(value).strip()
        return field("from"), field("to"), field("subject"), field("date")
    
    
    # Usage:
    # print(...) with a single argument runs on both Python 2 and Python 3, and
    # the with block closes the file even if extraction fails.
    with open("message.eml", "rb") as f:
        print(extract(f, f.name))
    

    I programmed this for my mailgroup using mailbox, that is why it is so convoluted. It never failed me. Never any junk. If message is multipart, output dictionary will contain a key "files" (a sub dict) with all filenames of extracted other files that were not text or html. That was a way of extracting attachments and other binary data. You may change it in pullout(), or just change the behaviour of file_exists() and save_file().

    construct_name() constructs a filename out of message id and multipart message filename, if there is one.

    In pullout() the Text and Html variables are strings. For online mailgroup it was OK to get any text or HTML packed into multipart that wasn't an attachment at once.

    If you need something more sophisticated change Text and Html to lists and append to them and add them as needed. Nothing problematic.

    Maybe there are some errors here, because it is intended to work with mailbox.Message(), not with email.Message(). I tried it on email.Message() and it worked fine.

    You said, you "wish to list them all". From where? If you refer to the POP3 mailbox or a mailbox of some nice open-source mailer, then you do it using mailbox module. If you want to list them from others, then you have a problem. For example, to get mails from MS Outlook, you have to know how to read OLE2 compound files. Other mailers rarely refer to them as *.eml files, so I think this is exactly what you would like to do. Then search on PyPI for olefile or compoundfiles module and Google around for how to extract an e-mail from MS Outlook inbox file. Or save yourself a mess and just export them from there to some directory. When you have them as eml files, then apply this code.

    0 讨论(0)
  • 2021-01-31 12:59

    I found this code much simpler

    import email
    import os
    
    path = './'
    listing = os.listdir(path)

    for fle in listing:
        if fle[-3:].lower() != "eml":
            continue
        # Open relative to path (not the current directory) and close the file
        # as soon as the message has been parsed.
        with open(os.path.join(path, fle)) as eml:
            msg = email.message_from_file(eml)
        # Only a multipart message has a list of parts; for a single-part message
        # get_payload() is the body text itself and holds no attachments.
        attachments = msg.get_payload() if msg.is_multipart() else []
        for attachment in attachments:
            fnam = attachment.get_filename()
            if not fnam:
                continue  # a body part, not an attachment
            data = attachment.get_payload(decode=True)
            if data is None:
                continue  # nested multipart container: nothing to write directly
            # The file name comes from the e-mail, so drop any directory component
            # before writing into the current directory.
            with open(os.path.basename(fnam), 'wb') as out:
                out.write(data)
    
    0 讨论(0)
  • 2021-01-31 13:00

    Posting this here for anyone looking to just extract text from an email and get a list of .eml files - took me forever to find a good answer to this online. NOTE: This will not get attachments to emails, just the text from email.

    import email
    from email import policy
    from email.parser import BytesParser
    import glob
    import os
    
    path = '/path/to/data/' # set this to "./" if in current directory

    # os.path.join supplies the separator when path has no trailing slash
    eml_files = glob.glob(os.path.join(path, '*.eml')) # get all .eml files in a list
    for eml_file in eml_files:
        with open(eml_file, 'rb') as fp:  # the with block closes fp on exit
            name = fp.name # Get file name
            msg = BytesParser(policy=policy.default).parse(fp)
        # preferencelist must be a sequence of subtypes; ('plain') without the
        # trailing comma is just the string 'plain'.
        body = msg.get_body(preferencelist=('plain',))
        # get_body() returns None when the message has no text/plain part.
        text = body.get_content() if body is not None else ""

        text = text.split("\n")
        print (name) # Get name of eml file
        print (text) # Get list of all text in email
    

    Credit to some of the code from this post: Reading .eml files with Python 3.6 using emaildata 0.3.4

    0 讨论(0)
  • 2021-01-31 13:09

    Try this:

    #!python3
    # -*- coding: utf-8 -*-
    
    import email
    import os
    
    SOURCE_DIR = 'email'
    DEST_DIR = 'temp'

    def extractattachements(fle,suffix=None):
        """Save every application/octet-stream attachment of an .eml file into DEST_DIR.

        fle    -- path of the .eml file to read
        suffix -- optional text inserted as '_<suffix>' before the file extension
        Returns the list of paths that were written (empty for a non-multipart message).
        """
        # Close the input as soon as it has been parsed.
        with open(fle) as eml:
            message = email.message_from_file(eml)
        filenames = []
        if message.get_content_maintype() != 'multipart':
            return filenames
        for part in message.walk():
            if part.get_content_maintype() == 'multipart': continue
            # get_content_type() never returns None, unlike the raw header lookup.
            if part.get_content_type() != 'application/octet-stream': continue
            filename = part.get_filename()
            if not filename: continue  # nothing to name the output file after
            # The name comes from the e-mail: keep it inside DEST_DIR.
            filename = os.path.basename(filename)
            if suffix:
                # splitext copes with names that have no dot or several dots.
                stem, ext = os.path.splitext(filename)
                filename = ''.join([stem, '_', suffix, ext])
            filename = os.path.join(DEST_DIR, filename)
            with open(filename,'wb') as fb:
                fb.write(part.get_payload(decode=True))
            filenames.append(filename)
        return filenames
    
    def main():
        """Run extractattachements() on every regular file directly inside SOURCE_DIR.

        Returns True once all files have been processed.
        """
        candidates = (os.path.join(SOURCE_DIR, entry) for entry in os.listdir(SOURCE_DIR))
        # Settle the full list of regular files before any extraction starts.
        eml_paths = [candidate for candidate in candidates if os.path.isfile(candidate)]
        for eml_path in eml_paths:
            extractattachements(eml_path)
        return True

    if __name__ == "__main__":
        main()
    
    0 讨论(0)
提交回复
热议问题