How to extract PDF fields from a filled out form in Python?

后端 未结 6 1082
北恋
北恋 2020-12-02 07:10

I'm trying to use Python to process some PDF forms that were filled out and signed using Adobe Acrobat Reader.

I've tried:

  • The pdfminer demo: it di
相关标签:
6条回答
  • 2020-12-02 07:23

    There is a typo on these lines:

    file.write(pp.pformat(form))
    

    Should be:

    outfile.write(pp.pformat(form))
    
    0 讨论(0)
  • 2020-12-02 07:26

    Python 3.6+:

    pip install PyPDF2

    # -*- coding: utf-8 -*-
    
    from collections import OrderedDict
    from PyPDF2 import PdfFileWriter, PdfFileReader
    
    
    def _getFields(obj, tree=None, retval=None, fileobj=None):
        """
        Collect the interactive form fields of a PDF into a mapping.

        On the initial call (*retval* is ``None``) the ``/AcroForm`` entry of
        the document catalog, ``obj.trailer["/Root"]``, is used as the tree.
        *tree* and *retval* exist so the function can be re-entered with a
        sub-tree and an existing accumulator.

        :param obj: Reader object providing ``trailer``, ``_checkKids`` and
            ``_buildField``.
        :param tree: Form (sub-)tree to inspect; ignored on the initial call.
        :param retval: Mapping that the reader helpers fill in.
        :param fileobj: A file object (usually a text file) to write
            a report to on all interactive form fields found; handed
            unchanged to ``_checkKids`` and ``_buildField``.
        :return: *retval* (a fresh ``OrderedDict`` on the initial call).
        :rtype: dict, or ``None`` if the catalog has no ``/AcroForm`` entry.
        """
        field_attributes = {
            '/FT': 'Field Type',
            '/Parent': 'Parent',
            '/T': 'Field Name',
            '/TU': 'Alternate Field Name',
            '/TM': 'Mapping Name',
            '/Ff': 'Field Flags',
            '/V': 'Value',
            '/DV': 'Default Value',
        }

        if retval is None:
            # Initial call: start a new result and locate the AcroForm tree.
            retval = OrderedDict()
            root = obj.trailer["/Root"]
            if "/AcroForm" not in root:
                return None
            tree = root["/AcroForm"]
        if tree is None:
            return retval

        obj._checkKids(tree, retval, fileobj)

        # A tree that carries any of the field attributes is itself a field.
        if any(key in tree for key in field_attributes):
            obj._buildField(tree, retval, fileobj, field_attributes)

        for ref in (tree["/Fields"] if "/Fields" in tree else ()):
            obj._buildField(ref.getObject(), retval, fileobj, field_attributes)

        return retval
    
    
    def get_form_fields(infile):
        """
        Return the current value of every form field in a PDF file.

        :param infile: Path of the PDF file to read.
        :return: An ``OrderedDict`` mapping each field name to its ``/V``
            entry, or to ``''`` when the field has no ``/V`` entry. The
            mapping is empty when ``_getFields`` reports no form data.
        """
        # Do all the extraction inside the ``with`` block so the file is
        # closed deterministically, and only after the reader is done with it.
        with open(infile, 'rb') as stream:
            fields = _getFields(PdfFileReader(stream))
            if fields is None:
                # _getFields returns None when the catalog has no /AcroForm.
                return OrderedDict()
            return OrderedDict(
                (name, field.get('/V', '')) for name, field in fields.items()
            )
    
    
    
    if __name__ == '__main__':
        # Demo entry point: print the field values of the sample form.
        from pprint import pprint

        pdf_file_name = 'FormExample.pdf'
        form_values = get_form_fields(pdf_file_name)
        pprint(form_values)
    
    0 讨论(0)
  • 2020-12-02 07:28

    You should be able to do it with pdfminer, but it will require some delving into the internals of pdfminer and some knowledge about the pdf format (wrt forms of course, but also about pdf's internal structures like "dictionaries" and "indirect objects").

    This example might help you on your way (I think it will work only on simple cases, with no nested fields etc...)

    import sys
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdftypes import resolve1
    
    # Usage: pass the PDF path as the first command-line argument; one
    # "name: value" line is printed per entry of the AcroForm /Fields array.
    filename = sys.argv[1]

    # Keep the file open for the whole traversal and close it afterwards.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('{0}: {1}'.format(name, value))
    

    EDIT: forgot to mention: if you need to provide a password, pass it to doc.initialize()

    0 讨论(0)
  • 2020-12-02 07:33

    The Python PyPDF2 package (successor to pyPdf) is very convenient:

    import PyPDF2
    # Open the filled-in form by path.
    f = PyPDF2.PdfFileReader('form.pdf')
    # ff holds whatever getFields() returns for this document's form fields.
    ff = f.getFields()
    

    Then ff is a dict that contains all the relevant form information.

    0 讨论(0)
  • 2020-12-02 07:35

    Quick and dirty 2-minute job; just use PDFminer to convert PDF to xml and then grab all of the fields.

    from xml.etree import ElementTree
    from pprint import pprint
    import os
    
    def main():
        """Dump FILE.pdf to XML with dumppdf.py and pretty-print its form values.

        Steps:

        1. Run ``dumppdf.py -a FILE.pdf > out.xml`` through the shell.
        2. Copy ``out.xml`` to ``output.xml`` with every ``&#`` replaced by
           ``Invalid_XML`` so the XML parser accepts it.
        3. Walk the parsed tree: the node following a ``<key>T</key>`` gives
           the field ID, and a ``<string>`` in the node following a
           ``<key>V</key>`` gives its value.
        4. Pretty-print the resulting ``{ID: value}`` dict.
        """
        print("Calling PDFDUMP.py")
        os.system("dumppdf.py -a FILE.pdf > out.xml")

        # Preprocess the file to eliminate bad XML.
        print("Screening the file")
        with open("output.xml", "w") as screened, open("out.xml") as dumped:
            for line in dumped:
                # Some bad data in the XML for formatting info.
                screened.write(line.replace("&#", "Invalid_XML"))

        print("Opening XML output")
        tree = ElementTree.parse('output.xml')
        lastnode = ""
        lastnode2 = ""
        fields = {}
        entry = {}

        for node in tree.iter():  # Run through the tree.
            # A <key>T</key> node announces that the next node holds the ID.
            if node.tag == "key" and node.text == "T":
                lastnode = node.tag + node.text
            elif lastnode == "keyT":
                for child in node.iter():
                    entry["ID"] = child.text
                lastnode = ""

            # A <key>V</key> node announces that the next node holds the value.
            if node.tag == "key" and node.text == "V":
                lastnode2 = node.tag + node.text
            elif lastnode2 == "keyV":
                for child in node.iter():
                    if child.tag == "string" and "ID" in entry:
                        entry["Value"] = child.text
                        fields[entry["ID"]] = entry["Value"]
                        # Start over so a value is never paired with a stale ID.
                        entry = {}
                lastnode2 = ""

        pprint(fields)
    
    if __name__ == '__main__':
        # Run the dump only when executed as a script, not on import.
        main()
    

    It isn't pretty, just a simple proof of concept. I need to implement it for a system I'm working on so I will be cleaning it up, but I thought I would post it in case anyone finds it useful.

    0 讨论(0)
  • 2020-12-02 07:35

    Update for the latest version of pdfminer (changed the imports and the parser/doc setup in the first function)

    from argparse import ArgumentParser
    import pickle
    import pprint
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdftypes import resolve1
    from pdfminer.pdftypes import PDFObjRef
    
    def load_form(filename):
        """Load pdf form contents into a nested list of name/value tuples.

        :param filename: Path of the PDF file to read.
        :return: One ``load_fields`` result per entry of the AcroForm
            ``Fields`` array.
        :raises KeyError: If the catalog has no ``AcroForm`` entry or the
            form has no ``Fields`` entry.
        """
        with open(filename, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            parser.set_document(doc)
            # NOTE(review): older pdfminer releases needed an explicit
            # initialize() call and newer ones appear to have dropped the
            # method -- confirm against the installed version. Calling it
            # only when present keeps both working.
            initialize = getattr(doc, 'initialize', None)
            if initialize is not None:
                initialize()
            acroform = resolve1(doc.catalog['AcroForm'])
            # resolve1 returns non-reference objects unchanged, so this also
            # covers a Fields array stored as an indirect reference.
            return [load_fields(resolve1(f))
                    for f in resolve1(acroform['Fields'])]
    
    def load_fields(field):
        """Recursively load form fields.

        :param field: A resolved field dictionary.
        :return: For a dictionary with a non-empty ``Kids`` entry, a list
            holding the result for each kid. Otherwise a ``(name, value)``
            tuple, where *name* is the ``T`` entry (decoded as UTF-8 when it
            is a byte string, ``None`` when absent) and *value* is the
            resolved ``V`` entry.
        """
        # resolve1 passes plain objects through, so this handles a Kids
        # array stored either inline or as an indirect reference.
        kids = resolve1(field.get('Kids', None))
        if kids:
            return [load_fields(resolve1(kid)) for kid in kids]

        name = field.get('T')
        # Only byte strings need decoding; a missing or already-decoded name
        # must not raise AttributeError.
        if isinstance(name, bytes):
            name = name.decode('utf-8')
        # Some field types, like signatures, need extra resolving
        return (name, resolve1(field.get('V')))
    
    def parse_cli():
        """Parse the command line of the form dumper.

        :return: The ``argparse`` namespace with ``file`` (the PDF to dump),
            ``out`` (output path or ``None``) and ``pickle`` (bool flag).
        """
        # (flags, keyword options) for every argument, in declaration order.
        argument_specs = (
            (('file',), {
                'metavar': 'pdf_form',
                'help': 'PDF Form to dump the contents of',
            }),
            (('-o', '--out'), {
                'help': 'Write output to file',
                'default': None,
                'metavar': 'FILE',
            }),
            (('-p', '--pickle'), {
                'action': 'store_true',
                'default': False,
                'help': 'Format output for python consumption',
            }),
        )

        cli = ArgumentParser(description='Dump the form contents of a PDF.')
        for flags, options in argument_specs:
            cli.add_argument(*flags, **options)
        return cli.parse_args()
    
    def main():
        """Dump the form contents of the PDF named on the command line.

        With ``--pickle`` the form is serialized with ``pickle``; otherwise it
        is pretty-printed with an indent of 2. The result goes to the ``--out``
        file when one is given, else to standard output.
        """
        import sys  # local: only needed to reach the byte-oriented stdout

        args = parse_cli()
        form = load_form(args.file)

        if args.pickle:
            payload = pickle.dumps(form)
            if args.out:
                # Pickle output is bytes, so the file must be opened in binary mode.
                with open(args.out, 'wb') as outfile:
                    outfile.write(payload)
            else:
                # Python 3 exposes the byte stream as sys.stdout.buffer;
                # Python 2's sys.stdout takes the byte string directly.
                getattr(sys.stdout, 'buffer', sys.stdout).write(payload)
        else:
            pp = pprint.PrettyPrinter(indent=2)
            if args.out:
                with open(args.out, 'w') as outfile:
                    # Write to the handle opened above (was the undefined/builtin `file`).
                    outfile.write(pp.pformat(form))
            else:
                pp.pprint(form)
    
    # Run the command-line dumper only when executed as a script.
    if __name__ == '__main__':
        main()
    
    0 讨论(0)
提交回复
热议问题