split a pdf based on outline

问题

I would like to use pyPdf to split a PDF file based on its outline, where each destination in the outline refers to a different page within the PDF.

example outline:

main       --> points to page 1
  sect1    --> points to page 1
  sect2    --> points to page 15
  sect3    --> points to page 22

It is easy within pyPdf to iterate over each page of the document or over each destination in the document's outline; however, I cannot figure out how to get the number of the page that a destination points to.

Does anybody know how to find the page number that each destination in the outline refers to?

回答1:

I figured it out:

class Darrell(pyPdf.PdfFileReader):
    """PdfFileReader that can report the page each outline entry points to."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping outline titles to 0-based page numbers.

        An entry whose target page id is not found in the page tree maps to
        the string '???' instead of a number.  The result is keyed by title
        alone, so entries sharing a title collapse into a single item.
        """
        # (id(destination), title) -> object id of the page it targets
        targets = {}

        def gather_targets(entries):
            # Record every Destination; nested lists are walked recursively.
            for entry in entries:
                if isinstance(entry, pyPdf.pdf.Destination):
                    targets[(id(entry), entry.title)] = entry.page.idnum
                elif isinstance(entry, list):
                    gather_targets(entry)

        # object id of a page-tree node -> number of pages preceding it
        page_index = {}
        # One-element list used as a counter the nested function can update.
        pages_seen = [0]

        def index_pages(node):
            # Depth-first walk of the page tree in /Kids order.
            node_type = node["/Type"]
            if node_type == "/Page":
                pages_seen[0] += 1
            elif node_type == "/Pages":
                for kid in node["/Kids"]:
                    page_index[kid.idnum] = pages_seen[0]
                    index_pages(kid.getObject())

        gather_targets(self.getOutlines())
        index_pages(self.trailer["/Root"].getObject()["/Pages"].getObject())

        return dict(
            (title, page_index.get(page_id, '???'))
            for (_, title), page_id in targets.iteritems()
        )

pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s  %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
    print template % (p+1,t)

回答2:

Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)

My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.

I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection, etc. bookmarks. I am working on a shared system where pdftk cannot be installed but where Python is available.

回答3:

This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.

I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.

import operator
import os
import subprocess
import sys
import time

import PyPDF2 as pyPdf

# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'

class Darrell(pyPdf.PdfFileReader):
    # Body elided in the original post: presumably the Darrell class from the
    # first answer goes here. It has to provide getDestinationPageNumbers(),
    # which the script below calls.
    ...

if __name__ == '__main__':
    t0= time.time()

    # get the name of the file to split as a command line arg
    pdfname = sys.argv[1]

    # open up the pdf
    pdf = Darrell(open(pdfname, 'rb'))

    # build list of (pagenumbers, newFileNames)
    splitlist = [(1,'FrontMatter')] # Customize name of first section

    template = '%-5s  %s'
    print template % ('Page', 'Title')
    print '-'*72
    for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
                      key=operator.itemgetter(1)):

        # Customize this to get it to split where you want
        if t.startswith('Chapter') or \
           t.startswith('Preface') or \
           t.startswith('References'):

            print template % (p+1, t)

            # this customizes how files are renamed
            new = t.replace('Chapter ', 'Chapter')\
                   .replace(':  ', '-')\
                   .replace(': ', '-')\
                   .replace(' ', '_')
            splitlist.append((p+1, new))

    # call sejda tools and split document
    call = sejda
    call += ' splitbypages'
    call += ' -f "%s"'%pdfname
    call += ' -o ./'
    call += ' -n '
    call += ' '.join([str(p) for p,t in splitlist[1:]])
    print '\n', call
    subprocess.call(call)
    print '\nsejda-console has completed.\n\n'

    # rename the split files
    for p,t in splitlist:
        old ='./%i_'%p + pdfname
        new = './' + t + '.pdf'
        print 'renaming "%s"\n      to "%s"...'%(old, new),

        try:
            os.remove(new)
        except OSError:
            pass

        try:
            os.rename(old, new)
            print' succeeded.\n'
        except:
            print' failed.\n'

    print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)

回答4:

A small update to @darrell's class so that it can parse UTF-8 outlines, which I am posting as an answer because it would be hard to read as a comment.

The problem is in pyPdf.pdf.Destination.title, which may be returned in two flavors:

pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject

so the output of the _setup_outline_page_ids() function also contains two different types of title object, which fails with a UnicodeDecodeError if an outline title contains anything other than ASCII.

I added this code to solve the problem:

# Encode TextStringObject titles (presumably unicode text) to UTF-8 so that
# they end up as byte strings like the ByteStringObject flavor of title.
if isinstance(title, pyPdf.generic.TextStringObject):
    title = title.encode('utf-8')

or the whole class:

class PdfOutline(pyPdf.PdfFileReader):
    """PdfFileReader extension that resolves outline entries to page numbers."""

    def getDestinationPageNumbers(self):
        """Map every outline title to the 0-based number of its target page.

        Titles of type TextStringObject are encoded to UTF-8 before being
        used as keys.  An entry whose target page id is missing from the
        page tree maps to the string '???'.  Entries that share a title end
        up as a single key.
        """

        def iter_destinations(entries):
            # Yield the Destination objects in the order they are stored,
            # descending into nested lists as they are met.
            for entry in entries:
                if isinstance(entry, pyPdf.pdf.Destination):
                    yield entry
                elif isinstance(entry, list):
                    for nested in iter_destinations(entry):
                        yield nested

        def iter_page_numbers(node, seen):
            # Yield (object id, page number) for every kid below ``node``.
            # ``seen`` is a one-element list counting the /Page leaves
            # already passed; it is shared by the whole recursion.
            node_type = node["/Type"]
            if node_type == "/Pages":
                for kid in node["/Kids"]:
                    yield kid.idnum, seen[0]
                    for pair in iter_page_numbers(kid.getObject(), seen):
                        yield pair
            elif node_type == "/Page":
                seen[0] += 1

        # (id(destination), title) -> object id of the target page
        outline_page_ids = dict(
            ((id(dest), dest.title), dest.page.idnum)
            for dest in iter_destinations(self.getOutlines())
        )

        page_tree = self.trailer["/Root"].getObject()["/Pages"].getObject()
        page_id_to_page_numbers = dict(iter_page_numbers(page_tree, [0]))

        result = {}
        for key, page_idnum in outline_page_ids.iteritems():
            title = key[1]
            if isinstance(title, pyPdf.generic.TextStringObject):
                title = title.encode('utf-8')
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result

来源：https://stackoverflow.com/questions/1918420/split-a-pdf-based-on-outline

标签

python

pdf

pypdf