From 84bd03370260dcefe61329934691623288cec01c Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sat, 1 Oct 2011 08:36:27 -0400 Subject: [PATCH] Add pdf-merge.py script to PDF merging post. --- posts/PDF_bookmarks_with_Ghostscript.mdwn | 39 +++ .../pdf-merge.py | 325 ++++++++++++++++++ 2 files changed, 364 insertions(+) create mode 100755 posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py diff --git a/posts/PDF_bookmarks_with_Ghostscript.mdwn b/posts/PDF_bookmarks_with_Ghostscript.mdwn index af22b97..c7fe6da 100644 --- a/posts/PDF_bookmarks_with_Ghostscript.mdwn +++ b/posts/PDF_bookmarks_with_Ghostscript.mdwn @@ -14,9 +14,48 @@ and `pdfmarks` is a text file with contents like: Nice and easy. +For nested levels, use the `/Count` attribute. For example: + + [/Count 3 /Title (Chapter 1) /Page 1 /OUT pdfmark + [/Count -2 /Title (Section 1.1) /Page 2 /OUT pdfmark + [/Title (Section 1.1.1) /Page 3 /OUT pdfmark + [/Title (Section 1.1.2) /Page 4 /OUT pdfmark + [/Count -1 /Title (Section 1.2) /Page 5 /OUT pdfmark + [/Title (Section 1.2.1) /Page 6 /OUT pdfmark + [/Title (Section 1.3) /Page 7 /OUT pdfmark + +The argument to `/Count` gives the number of immediately subordinate +bookmarks. The sign of the argument sets the default display +(negative for closed, positive for open). + +You can also set up the document info dictionary with something like: + + [ /Title (My Test Document) + /Author (John Doe) + /Subject (pdfmark 3.0) + /Keywords (pdfmark, example, test) + /DOCINFO pdfmark + +If you want more detail, take a look at [Adobe's pdfmark +reference][reference]. + +I've bundled the whole pdfmarks-generation bit into a script, +[[pdf-merge.py]], which generates the pdfmark file and runs +Ghostscript automatically. Think of it as a bookmark-preserving +version of pdftk's `cat`. The script uses pdftk internally to extract +bookmark information from the source PDFs. 
+ +The script also adds a bit of PostScript to ignore any bookmarks in +the source PDFs during the Ghostscript run. The only bookmarks in the +output will be the ones you specify explicitly in the pdfmarks file. +If for some reason the automatically generated pdfmarks are not quite +what you want, the script can pause (via `--ask`) to allow you to +tweak the pdfmarks manually before running Ghostscript. + [pdftk]: http://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/ [Ghostscript]: http://ghostscript.com/ [post]: http://ubuntuforums.org/showthread.php?t=1545064 +[reference]: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdfmark_reference.pdf [[!tag tags/tools]] [[!tag tags/linux]] diff --git a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py new file mode 100755 index 0000000..5586e02 --- /dev/null +++ b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python +# +# Copyright (C) 2011 W. Trevor King +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this program. If not, see +# <http://www.gnu.org/licenses/>. + +"""Merge PDFs preserving bookmarks. 
"""Merge PDFs preserving bookmarks.

Bookmarks are read from every input PDF with pdftk, converted to
pdfmarks whose page numbers are shifted to the page's position in the
merged document, and handed to Ghostscript together with the inputs.
"""

import os as _os
import subprocess as _subprocess
import sys as _sys
import tempfile as _tempfile


__version__ = '0.1'


PDFTK = 'pdftk'
GS = 'gs'

# Text types accepted where a single string is expected (Python 2 and 3).
_STRING_TYPES = (str, type(u''))

# PostScript prologue that turns `[ ... /OUT pdfmark` into a no-op so the
# bookmarks embedded in the source PDFs are dropped during the merge.
_PDFMARK_NOOP_PS = r"""
% store the original pdfmark
/originalpdfmark { //pdfmark } bind def

% replace pdfmark with a wrapper that ignores OUT
/pdfmark
{
  {  % begin loop

      { counttomark pop }
    stopped
      { /pdfmark errordict /unmatchedmark get exec stop }
    if

    dup type /nametype ne
      { /pdfmark errordict /typecheck get exec stop }
    if

    dup /OUT eq
      { (Skipping OUT pdfmark\n) print cleartomark exit }
    if

    originalpdfmark exit

  } loop
} def
"""

# PostScript epilogue that restores the default `[ ... /OUT pdfmark`
# behaviour so the explicitly generated bookmarks are honoured.
_PDFMARK_RESTORE_PS = '/pdfmark { originalpdfmark } bind def\n'


def invoke(args, stdout=None):
    """Run a command and return its captured standard output as text.

    `args` is the argument list handed to `subprocess.Popen` (no shell
    is involved).  When `stdout` is None the output is captured and
    returned; otherwise it is written to the given file object and
    None is returned.  The child's stderr goes to `sys.stderr`.

    Raises `OSError` if the executable cannot be started and
    `subprocess.CalledProcessError` if it exits with a non-zero status.

    >>> invoke(['echo', 'hi', 'there'])
    'hi there\\n'
    """
    if stdout is None:
        stdout = _subprocess.PIPE
    # universal_newlines=True makes the captured output text (not bytes)
    # on Python 3, which is what the dump_data parser expects.
    process = _subprocess.Popen(
        args, stdin=_subprocess.PIPE, stdout=stdout, stderr=_sys.stderr,
        shell=False, universal_newlines=True)
    stdout_, _ = process.communicate()
    status = process.returncode
    # An explicit raise (rather than `assert`) survives `python -O`.
    if status != 0:
        raise _subprocess.CalledProcessError(status, args)
    return stdout_


class BookmarkedPDF (object):
    """A PDF file together with its page count and bookmark list.

    `pages` is the number of pages (None until known) and `bookmarks`
    is a list of dicts with `title`, `level` and `page` keys.  Both are
    filled in from `pdftk FILE dump_data` when a filename is given.
    """
    def __init__(self, filename=None):
        self.filename = filename
        # Defined up front so a filename-less instance is still usable.
        self.pages = None
        self.bookmarks = []
        if self.filename:
            self.get_bookmarks()

    def get_bookmarks(self):
        """Populate `pages` and `bookmarks` by running pdftk."""
        data = invoke([PDFTK, self.filename, 'dump_data'])
        self.pages, self.bookmarks = self._parse_dump_data(data)

    @staticmethod
    def _parse_dump_data(data):
        """Parse `pdftk dump_data` output into `(pages, bookmarks)`.

        Lines without a `key: value` separator (blank lines and the
        `InfoBegin`/`BookmarkBegin` markers emitted by newer pdftk
        releases) are tolerated.

        >>> from pprint import pprint
        >>> data = '\\n'.join([
        ...     'InfoKey: CreationDate',
        ...     'InfoValue: D:20080502020302Z',
        ...     'NumberOfPages: 123',
        ...     'BookmarkBegin',
        ...     'BookmarkTitle: Chapter 1',
        ...     'BookmarkLevel: 1',
        ...     'BookmarkPageNumber: 1',
        ...     'BookmarkBegin',
        ...     'BookmarkTitle: Section 1.1',
        ...     'BookmarkLevel: 2',
        ...     'BookmarkPageNumber: 2',
        ...     'PageLabelNewIndex: 1',
        ...     'PageLabelStart: 316',
        ...     'PageLabelNumStyle: DecimalArabicNumerals',
        ...     ])
        >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
        >>> pages
        123
        >>> pprint(bookmarks)
        [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
         {'level': 2, 'page': 2, 'title': 'Section 1.1'}]
        """
        pages = None
        bookmarks = []
        bookmark_info = {}
        required_fields = ('title', 'level', 'page')
        for line in data.splitlines():
            key, separator, value = line.partition(':')
            key = key.strip()
            if not separator:
                if key == 'BookmarkBegin':
                    # Start of a new record: drop any incomplete leftovers.
                    bookmark_info = {}
                continue
            if value.startswith(' '):
                value = value[1:]  # drop the single space after the colon
            if key == 'NumberOfPages':
                pages = int(value)
            elif key.startswith('Bookmark'):
                field = key[len('Bookmark'):].lower()
                if field == 'pagenumber':
                    field = 'page'
                if field in ('level', 'page'):
                    value = int(value)
                bookmark_info[field] = value
                if all(name in bookmark_info for name in required_fields):
                    bookmarks.append(bookmark_info)
                    bookmark_info = {}
        return (pages, bookmarks)


def _escape_ps_string(text):
    """Escape `text` for use inside a PostScript `(...)` string literal."""
    return (text.replace('\\', '\\\\')
                .replace('(', '\\(')
                .replace(')', '\\)'))


def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
    """Return the pdfmark text describing the merged document.

    `inputs` is a sequence of objects with `pages` and `bookmarks`
    attributes (see `BookmarkedPDF`), in merge order.  Each bookmark's
    page number is shifted by the number of pages preceding its PDF.
    `title`, `author` and `keywords` populate the `/DOCINFO` entry;
    `keywords` may be a single string or an iterable of strings.
    Bookmarks with children get a negative `/Count` (closed by
    default) equal to the number of immediate children.

    >>> inputs = []
    >>> for pages,bookmarks in [
    ...         (1,
    ...          [{'level': 1, 'page': 1, 'title': 'Table of Contents'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
    ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
    ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
    ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'}]),
    ...         ]:
    ...     pdf = BookmarkedPDF()
    ...     pdf.pages = pages
    ...     pdf.bookmarks = bookmarks
    ...     inputs.append(pdf)
    >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
    ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
    [ /Title (My Book)
      /Author (Myself)
      /Keywords (fun, witty, interesting)
      /DOCINFO pdfmark
    [ /Title (Table of Contents) /Page 1 /OUT pdfmark
    [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
    [ /Title (Section 1.1) /Page 3 /Count -1 /OUT pdfmark
    [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
    [ /Title (Section 1.2) /Page 6 /OUT pdfmark
    [ /Title (Chapter 2) /Page 102 /OUT pdfmark
    <BLANKLINE>
    """
    pdfmarks = []
    if title or author or keywords:
        docinfo = []
        if title:
            docinfo.append('/Title ({})'.format(_escape_ps_string(title)))
        if author:
            docinfo.append('/Author ({})'.format(_escape_ps_string(author)))
        if keywords:
            # The command line supplies one string; joining it would
            # separate its characters, so only join real sequences.
            if not isinstance(keywords, _STRING_TYPES):
                keywords = ', '.join(keywords)
            docinfo.append(
                '/Keywords ({})'.format(_escape_ps_string(keywords)))
        docinfo.append('/DOCINFO pdfmark')
        pdfmarks.append('[ {}'.format('\n  '.join(docinfo)))
    bookmarks = []
    startpage = 0
    for pdf in inputs:
        for bookmark in pdf.bookmarks:
            mark = dict(bookmark)  # shallow copy, leave the input intact
            mark['page'] += startpage
            bookmarks.append(mark)
        startpage += pdf.pages
    for i, bookmark in enumerate(bookmarks):
        attributes = [
            '/Title ({})'.format(_escape_ps_string(bookmark['title'])),
            '/Page {}'.format(bookmark['page']),
            ]
        count = 0
        for later in bookmarks[i + 1:]:
            # A sibling or any shallower entry ends this bookmark's subtree.
            if later['level'] <= bookmark['level']:
                break
            if later['level'] == bookmark['level'] + 1:
                count += 1
        if count:
            attributes.append('/Count -{}'.format(count))
        pdfmarks.append('[ {} /OUT pdfmark'.format(' '.join(attributes)))
    pdfmarks.append('')  # terminal newline
    return '\n'.join(pdfmarks)


def _write_tempfile(prefix, text):
    """Write `text` to a new temporary file and return its path."""
    fd, filename = _tempfile.mkstemp(prefix=prefix, text=True)
    # A text-mode file object accepts str on both Python 2 and 3,
    # unlike os.write(), and is closed even if the write fails.
    with _os.fdopen(fd, 'w') as stream:
        stream.write(text)
    return filename


def _write_pdfmark_noop_file():
    """Create the prologue file that disables `/OUT pdfmark`.

    By default, Ghostscript will preserve pdfmarks from the source
    PDFs; this prologue suppresses them.
    """
    return _write_tempfile('pdfmark-noop-', _PDFMARK_NOOP_PS)


def _write_pdfmark_restore_file():
    """Create the file that restores the default `/OUT pdfmark`."""
    return _write_tempfile('pdfmark-restore-', _PDFMARK_RESTORE_PS)


def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
    """Write `pdfmarks` to a temporary file and return its path.

    With `pause_for_manual_tweaking`, prompt on stderr and wait for a
    line on stdin so the user can edit the file first.
    """
    filename = _write_tempfile('pdfmarks-', pdfmarks or '')
    if pause_for_manual_tweaking:
        # stderr, because stdout may be carrying the merged PDF.
        _sys.stderr.write(
            'edit {} as you see fit, and press enter when ready\n'.format(
                filename))
        _sys.stderr.flush()
        _sys.stdin.readline()
    return filename


def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False):
    """Merge `inputs` into `output` with Ghostscript.

    `inputs` is a sequence of objects with a `filename` attribute.
    `output` is the destination path; a false value writes the PDF to
    stdout.  When `pdfmarks` is given, bookmarks from the source PDFs
    are suppressed and `pdfmarks` is applied instead.  Temporary files
    are removed even if Ghostscript fails.

    Raises whatever `invoke` raises for the Ghostscript run.
    """
    args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite']
    if output:
        args.append('-sOutputFile={}'.format(output))
    else:
        # Keep PostScript `print` output out of the PDF stream.
        args.extend(['-sOutputFile=-', '-q', '-sstdout=%stderr'])
    temporary_files = []
    try:
        if pdfmarks:
            temporary_files.append(_write_pdfmark_noop_file())
            args.append(temporary_files[-1])
        args.extend(pdf.filename for pdf in inputs)
        if pdfmarks:
            temporary_files.append(_write_pdfmark_restore_file())
            args.append(temporary_files[-1])
            temporary_files.append(_write_markfile(
                pdfmarks=pdfmarks,
                pause_for_manual_tweaking=pause_for_manual_tweaking))
            args.append(temporary_files[-1])
        _sys.stderr.write('preparing to execute: {}\n'.format(args))
        _sys.stderr.flush()
        invoke(args, stdout=_sys.stdout)
    finally:
        for filename in temporary_files:
            _os.unlink(filename)


def main(argv=None):
    """Command-line entry point; `argv` defaults to `sys.argv[1:]`."""
    import argparse

    global PDFTK, GS

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', metavar='PDF', nargs='+',
                        help='an input PDF to merge')
    parser.add_argument('--ask', dest='pause_for_manual_tweaking',
                        action='store_const', const=True,
                        help='pause for manual pdfmark tweaking')
    parser.add_argument('--output', dest='output',
                        help='name of the output PDF')
    parser.add_argument('--title', dest='title',
                        help='title of output PDF')
    parser.add_argument('--author', dest='author',
                        help='author of output PDF')
    parser.add_argument('--keywords', dest='keywords',
                        help='keywords for the output PDF')
    parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
                        help='path to the pdftk executable')
    parser.add_argument('--gs', dest='gs', default=GS,
                        help='path to the gs (Ghostscript) executable')

    args = parser.parse_args(argv)

    # The helpers read these module-level names when they run.
    PDFTK = args.pdftk
    GS = args.gs

    inputs = [BookmarkedPDF(filename) for filename in args.input]
    pdfmarks = generate_pdfmarks(
        inputs, title=args.title, author=args.author, keywords=args.keywords)
    merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
               pause_for_manual_tweaking=args.pause_for_manual_tweaking)


if __name__ == '__main__':
    main()