--- /dev/null
+#!/usr/bin/env python
+#
+# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Merge PDFs preserving bookmarks.
+"""
+
+import os as _os
+import subprocess as _subprocess
+import sys as _sys
+import tempfile as _tempfile
+
+
+__version__ = '0.1'
+
+
+PDFTK = 'pdftk'
+GS = 'gs'
+
+
def invoke(args, stdout=None):
    """Run an external command and return its captured standard output.

    Parameters:
      args: the command and its arguments as a list (no shell is
        involved).
      stdout: where the child's standard output should go.  When
        ``None`` (the default) the output is captured and returned as
        text; otherwise the given file object is handed to the child
        and ``None`` is returned.

    The child's standard error is sent to ``sys.stderr``.

    Raises:
      OSError: if the command cannot be started.
      subprocess.CalledProcessError: if the command exits with a
        non-zero status.

    >>> invoke(['echo', 'hi', 'there'])
    'hi there\\n'
    >>> try:
    ...     invoke(['this command does not exist'])
    ... except OSError:
    ...     print('could not start the command')
    could not start the command
    """
    capture_stdout = stdout is None
    if capture_stdout:
        stdout = _subprocess.PIPE
    # universal_newlines=True makes the captured output text (not bytes)
    # on both Python 2 and Python 3, which is what the parsers expect.
    p = _subprocess.Popen(
        args, stdin=_subprocess.PIPE, stdout=stdout, stderr=_sys.stderr,
        shell=False, cwd='.', universal_newlines=True)
    stdout_, _ = p.communicate()
    # communicate() already waited for the child; returncode is set.
    status = p.returncode
    if status != 0:
        # An explicit exception instead of `assert`, so the check is
        # not stripped when Python runs with -O.
        raise _subprocess.CalledProcessError(status, args)
    return stdout_
+
+
class BookmarkedPDF (object):
    """A PDF file together with its page count and bookmark outline.

    Attributes:
      filename: path of the PDF, or ``None``.
      pages: number of pages reported by ``pdftk``, or ``None`` when
        unknown.
      bookmarks: list of ``{'title', 'level', 'page'}`` dictionaries in
        document order.
    """
    def __init__(self, filename=None):
        self.filename = filename
        # Always define these so an instance created without a filename
        # can be inspected (or filled in by hand) safely.
        self.pages = None
        self.bookmarks = []
        if self.filename:
            self.get_bookmarks()

    def get_bookmarks(self):
        """Populate `pages` and `bookmarks` from ``pdftk dump_data``."""
        data = invoke([PDFTK, self.filename, 'dump_data'])
        self.pages, self.bookmarks = self._parse_dump_data(data)

    @staticmethod
    def _parse_dump_data(data):
        """Parse ``pdftk dump_data`` output into ``(pages, bookmarks)``.

        Lines without a ``key: value`` shape (such as bare
        ``BookmarkBegin`` markers) are ignored.  A bookmark is emitted
        as soon as its title, level and page number have all been seen.

        >>> from pprint import pprint
        >>> data = '\\n'.join([
        ...     'InfoKey: CreationDate',
        ...     'InfoValue: D:20080502020302Z',
        ...     'NumberOfPages: 123',
        ...     'BookmarkBegin',
        ...     'BookmarkTitle: Chapter 1',
        ...     'BookmarkLevel: 1',
        ...     'BookmarkPageNumber: 1',
        ...     'BookmarkTitle: Section 1.1',
        ...     'BookmarkLevel: 2',
        ...     'BookmarkPageNumber: 2',
        ...     'BookmarkTitle: Section 1.1.1',
        ...     'BookmarkLevel: 3',
        ...     'BookmarkPageNumber: 3',
        ...     'BookmarkTitle: Section 1.1.2',
        ...     'BookmarkLevel: 3',
        ...     'BookmarkPageNumber: 4',
        ...     'BookmarkTitle: Section 1.2',
        ...     'BookmarkLevel: 2',
        ...     'BookmarkPageNumber: 5',
        ...     'PageLabelNewIndex: 1',
        ...     'PageLabelStart: 316',
        ...     'PageLabelNumStyle: DecimalArabicNumerals',
        ...     'PageLabelNewIndex: 2',
        ...     ])
        >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
        >>> pages
        123
        >>> pprint(bookmarks)
        [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
         {'level': 2, 'page': 2, 'title': 'Section 1.1'},
         {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
         {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
         {'level': 2, 'page': 5, 'title': 'Section 1.2'}]
        """
        pages = None
        bookmarks = []
        bookmark_info = {}
        required_fields = ('title', 'level', 'page')
        for line in data.splitlines():
            # partition() never raises, unlike unpacking a split():
            # marker lines without a colon are simply skipped.
            key, sep, value = line.partition(':')
            if not sep:
                continue
            if value.startswith(' '):
                value = value[1:]  # drop the single separator space
            if key == 'NumberOfPages':
                pages = int(value)
            elif key.startswith('Bookmark'):
                field = key[len('Bookmark'):].lower()
                if field in ('level', 'pagenumber'):
                    if field == 'pagenumber':
                        field = 'page'
                    value = int(value)
                bookmark_info[field] = value
                if all(f in bookmark_info for f in required_fields):
                    bookmarks.append(bookmark_info)
                    bookmark_info = {}
        return (pages, bookmarks)
+
+
def _escape_pdfmark_string(text):
    """Escape backslashes and parentheses for a PostScript ( ) string."""
    text = '{}'.format(text)
    return (text.replace('\\', '\\\\')
            .replace('(', '\\(')
            .replace(')', '\\)'))


def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
    """Build the pdfmark text describing the merged document.

    Parameters:
      inputs: objects with a ``pages`` count and a ``bookmarks`` list of
        ``{'title', 'level', 'page'}`` dictionaries, in merge order.
      title, author: optional document information strings.
      keywords: optional list of keyword strings; a single string is
        also accepted and used as-is.

    Returns the pdfmark source as a string ending in a newline (or the
    empty string when there is nothing to emit).  Bookmark page numbers
    are shifted by the page counts of the preceding inputs, and each
    bookmark that has direct children gets a negative ``/Count``.

    >>> inputs = []
    >>> for pages,bookmarks in [
    ...         (1,
    ...          [{'level': 1, 'page': 1, 'title': 'Table of Contents'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
    ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
    ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
    ...           {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
    ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
    ...           {'level': 2, 'page': 2, 'title': 'Section 2.1'},
    ...           {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
    ...           {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
    ...           {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
    ...         ]:
    ...     pdf = BookmarkedPDF()
    ...     pdf.pages = pages
    ...     pdf.bookmarks = bookmarks
    ...     inputs.append(pdf)
    >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
    ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
    [ /Title (My Book)
     /Author (Myself)
     /Keywords (fun, witty, interesting)
     /DOCINFO pdfmark
    [ /Title (Table of Contents) /Page 1 /OUT pdfmark
    [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
    [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
    [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
    [ /Title (Section 1.1.2) /Page 5 /OUT pdfmark
    [ /Title (Section 1.2) /Page 6 /OUT pdfmark
    [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
    [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
    [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
    [ /Title (Section 2.1.2) /Page 105 /OUT pdfmark
    [ /Title (Section 2.2) /Page 106 /OUT pdfmark
    <BLANKLINE>
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(keywords, (str, type(u''))):
        keywords = [keywords]

    pdfmarks = []
    if title or author or keywords:
        docinfo = []
        if title:
            docinfo.append('/Title ({})'.format(
                _escape_pdfmark_string(title)))
        if author:
            docinfo.append('/Author ({})'.format(
                _escape_pdfmark_string(author)))
        if keywords:
            docinfo.append('/Keywords ({})'.format(
                _escape_pdfmark_string(', '.join(keywords))))
        docinfo.append('/DOCINFO pdfmark')
        pdfmarks.append('[ {}' .format('\n '.join(docinfo)))

    # Flatten all bookmarks, renumbering pages for the merged document.
    bookmarks = []
    startpage = 0
    for pdf in inputs:
        for bookmark in pdf.bookmarks:
            mark = dict(bookmark)  # shallow copy; leave the input alone
            mark['page'] += startpage
            bookmarks.append(mark)
        startpage += pdf.pages

    for i, bookmark in enumerate(bookmarks):
        attributes = [
            '/Title ({})'.format(_escape_pdfmark_string(bookmark['title'])),
            '/Page {}'.format(bookmark['page']),
            ]
        # Count direct children: entries exactly one level deeper that
        # appear before the next bookmark at this level or shallower.
        count = 0
        for later in bookmarks[i+1:]:
            if later['level'] <= bookmark['level']:
                break
            if later['level'] == bookmark['level'] + 1:
                count += 1
        if count:
            attributes.append('/Count -{}'.format(count))
        pdfmarks.append('[ {} /OUT pdfmark'.format(' '.join(attributes)))
    pdfmarks.append('')  # terminal newline
    return '\n'.join(pdfmarks)
+
+
def _write_pdfmark_noop_file():
    """Write a PostScript prologue that turns ``/OUT pdfmark`` into a no-op.

    By default, Ghostscript will preserve pdfmarks from the source PDFs.
    The prologue saves the original ``pdfmark`` operator as
    ``originalpdfmark`` and installs a wrapper that discards ``/OUT``
    marks and forwards everything else.

    Returns the name of the temporary file; the caller is responsible
    for removing it.
    """
    fd, filename = _tempfile.mkstemp(prefix='pdfmark-noop-', text=True)
    # Wrap the descriptor in a text-mode file object: this accepts str
    # on both Python 2 and 3 and closes the descriptor even on error.
    with _os.fdopen(fd, 'w') as f:
        # Make `[... /OUT pdfmark` a no-op.
        f.write("""
% store the original pdfmark
/originalpdfmark { //pdfmark } bind def

% replace pdfmark with a wrapper that ignores OUT
/pdfmark
{
 { % begin loop

 { counttomark pop }
 stopped
 { /pdfmark errordict /unmatchedmark get exec stop }
 if

 dup type /nametype ne
 { /pdfmark errordict /typecheck get exec stop }
 if

 dup /OUT eq
 { (Skipping OUT pdfmark\n) print cleartomark exit }
 if

 originalpdfmark exit

 } loop
} def
""")
    return filename
+
def _write_pdfmark_restore_file():
    """Write a PostScript snippet that re-enables ``/OUT pdfmark``.

    The snippet rebinds ``pdfmark`` to ``originalpdfmark``, so it is
    only meaningful after a prologue that defined ``originalpdfmark``.

    Returns the name of the temporary file; the caller is responsible
    for removing it.
    """
    fd, filename = _tempfile.mkstemp(prefix='pdfmark-restore-', text=True)
    # A text-mode file object accepts str on both Python 2 and 3 and
    # closes the descriptor even if the write fails.
    with _os.fdopen(fd, 'w') as f:
        # Restore the default `[... /OUT pdfmark` behaviour
        f.write('/pdfmark { originalpdfmark } bind def\n')
    return filename
+
def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
    """Write `pdfmarks` to a temporary file and return its name.

    Parameters:
      pdfmarks: the pdfmark text to write; a false value leaves the
        file empty.
      pause_for_manual_tweaking: when true, announce the file name and
        wait for a line on standard input so the user can edit the
        file before it is used.

    The caller is responsible for removing the returned file.
    """
    fd, filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
    # A text-mode file object accepts str on both Python 2 and 3 and
    # closes the descriptor even if the write fails.
    with _os.fdopen(fd, 'w') as f:
        if pdfmarks:
            f.write(pdfmarks)
    if pause_for_manual_tweaking:
        # Prompt on stderr: stdout may be carrying the merged PDF.
        _sys.stderr.write(
            'edit {} as you see fit, and press enter when ready\n'.format(
                filename))
        _sys.stderr.flush()
        _sys.stdin.readline()
    return filename
+
def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False):
    """Merge the input PDFs with Ghostscript's ``pdfwrite`` device.

    Parameters:
      inputs: objects whose ``filename`` attribute names a source PDF,
        in merge order.
      output: name of the merged PDF; a false value sends the PDF to
        standard output.
      pdfmarks: optional pdfmark text to apply after the inputs.  When
        given, ``/OUT`` marks are disabled while the inputs are read
        and re-enabled before this text is processed.
      pause_for_manual_tweaking: passed on to `_write_markfile`.

    Temporary PostScript files are removed even if Ghostscript fails.
    """
    args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite']
    if output:
        args.append('-sOutputFile={}'.format(output))
    else:
        args.extend(['-sOutputFile=-', '-q'])
    input_files = [pdf.filename for pdf in inputs]
    temp_files = []
    try:
        if pdfmarks:
            # Argument order matters: disable /OUT, read the inputs,
            # restore /OUT, then apply the new marks.
            mark_noop = _write_pdfmark_noop_file()
            temp_files.append(mark_noop)
            mark_restore = _write_pdfmark_restore_file()
            temp_files.append(mark_restore)
            markfile = _write_markfile(
                pdfmarks=pdfmarks,
                pause_for_manual_tweaking=pause_for_manual_tweaking)
            temp_files.append(markfile)
            args.append(mark_noop)
            args.extend(input_files)
            args.extend([mark_restore, markfile])
        else:
            args.extend(input_files)
        # Report on stderr so the message cannot end up inside a PDF
        # that is being written to stdout.
        _sys.stderr.write('preparing to execute: {}\n'.format(args))
        _sys.stderr.flush()
        invoke(args, stdout=_sys.stdout)
    finally:
        for filename in temp_files:
            _os.unlink(filename)
+
+
if __name__ == '__main__':
    # Command-line entry point: parse options, read the bookmarks of
    # every input, and write the merged PDF.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', metavar='PDF', nargs='+',
                        help='an input PDF to merge')
    parser.add_argument('--ask', dest='pause_for_manual_tweaking',
                        action='store_true',
                        help='pause for manual pdfmark tweaking')
    parser.add_argument('--output', dest='output',
                        help='name of the output PDF')
    parser.add_argument('--title', dest='title',
                        help='title of output PDF')
    parser.add_argument('--author', dest='author',
                        help='author of output PDF')
    parser.add_argument('--keywords', dest='keywords',
                        help='comma-separated keywords for the output PDF')
    parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
                        help='path to the pdftk executable')
    parser.add_argument('--gs', dest='gs', default=GS,
                        help='path to the gs (Ghostscript) executable')

    args = parser.parse_args()

    # Rebind the module-level executable names used by the helpers.
    PDFTK = args.pdftk
    GS = args.gs

    # --keywords arrives as one string; pass a list on so the keywords
    # are not joined character by character.
    keywords = None
    if args.keywords:
        keywords = [k.strip() for k in args.keywords.split(',') if k.strip()]

    inputs = [BookmarkedPDF(filename) for filename in args.input]
    pdfmarks = generate_pdfmarks(
        inputs, title=args.title, author=args.author, keywords=keywords)
    merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
               pause_for_manual_tweaking=args.pause_for_manual_tweaking)