posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright (C) 2011-2013 W. Trevor King <wking@drexel.edu>
   4 #
   5 # This program is free software: you can redistribute it and/or modify
   6 # it under the terms of the GNU Lesser General Public License as
   7 # published by the Free Software Foundation, either version 3 of the
   8 # License, or (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful, but
  11 # WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 # Lesser General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU Lesser General Public
  16 # License along with this program.  If not, see
  17 # <http://www.gnu.org/licenses/>.
  18
  19 """Merge PDFs preserving bookmarks.
  20
  21 Thanks to Larry Cai for suggesting that Unicode be supported and for
  22 discussion about the `--pdfmarks` option.
  23 """
  24
  25 import codecs as _codecs
  26 import locale as _locale
  27 import os as _os
  28 import os.path as _os_path
  29 import re as _re
  30 import subprocess as _subprocess
  31 import sys as _sys
  32 import tempfile as _tempfile
  33
  34
  35 __version__ = '0.2'
  36
  37
  38 PDFTK = 'pdftk'
  39 GS = 'gs'
  40
  41
  42 def invoke(args, stdout=None):
  43     """
  44     >>> invoke(['echo', 'hi', 'there'])
  45     'hi there\\n'
  46     >>> invoke(['this command does not exist'])
  47     Traceback (most recent call last):
  48       ...
  49     OSError: [Errno 2] No such file or directory
  50     """
  51     P = _subprocess.PIPE
  52     capture_stdout = stdout is None
  53     if capture_stdout:
  54         stdout = P
  55     p = _subprocess.Popen(
  56         args, stdin=P, stdout=stdout, stderr=_sys.stderr, shell=False, cwd='.')
  57     stdout_,stderr_ = p.communicate()
  58     status = p.wait()
  59     assert status == 0, status
  60     return stdout_
  61
  62
  63 class BookmarkedPDF (object):
  64     _UNICODE_REGEXP = _re.compile('&#([0-9]+);')
  65
  66     def __init__(self, filename=None):
  67         self.filename = filename
  68         if self.filename:
  69             self.get_bookmarks()
  70
  71     def get_bookmarks(self):
  72         data = invoke([PDFTK, self.filename, 'dump_data'])
  73         self.pages,self.bookmarks = self._parse_dump_data(data)
  74
  75     @staticmethod
  76     def _unicode_replace_match(match):
  77         return unichr(int(match.group(1)))
  78
  79     @classmethod
  80     def _unicode_replace(self, string):
  81         r"""
  82         >>> BookmarkedPDF._unicode_replace('&#945;&#946;&#947;')
  83         u'\u03b1\u03b2\u03b3'
  84         """
  85         return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string)
  86
  87     @classmethod
  88     def _parse_dump_data(self, data):
  89         r"""
  90         >>> from pprint import pprint
  91         >>> data = '\n'.join([
  92         ...     'InfoBegin',
  93         ...     'InfoKey: CreationDate',
  94         ...     'InfoValue: D:20080502020302Z',
  95         ...     'NumberOfPages: 123',
  96         ...     'BookmarkBegin',
  97         ...     'BookmarkTitle: Chapter 1',
  98         ...     'BookmarkLevel: 1',
  99         ...     'BookmarkPageNumber: 1',
 100         ...     'BookmarkTitle: Section 1.1',
 101         ...     'BookmarkLevel: 2',
 102         ...     'BookmarkPageNumber: 2',
 103         ...     'BookmarkTitle: Section 1.1.1',
 104         ...     'BookmarkLevel: 3',
 105         ...     'BookmarkPageNumber: 3',
 106         ...     'BookmarkTitle: Section 1.1.2',
 107         ...     'BookmarkLevel: 3',
 108         ...     'BookmarkPageNumber: 4',
 109         ...     'BookmarkTitle: &#945;&#946;&#947;',
 110         ...     'BookmarkLevel: 4',
 111         ...     'BookmarkPageNumber: 4',
 112         ...     'BookmarkTitle: Section 1.2',
 113         ...     'BookmarkLevel: 2',
 114         ...     'BookmarkPageNumber: 5',
 115         ...     'PageLabelBegin',
 116         ...     'PageLabelNewIndex: 1',
 117         ...     'PageLabelStart: 316',
 118         ...     'PageLabelPrefix:',
 119         ...     'PageLabelNumStyle: DecimalArabicNumerals',
 120         ...     'PageLabelNewIndex: 2',
 121         ...     'PageLabelStart: 317',
 122         ...     'PageLabelPrefix:',
 123         ...     'PageLabelNumStyle: DecimalArabicNumerals',
 124         ...     'PageLabelNewIndex: 3',
 125         ...     'PageLabelStart: 318',
 126         ...     'PageLabelPrefix:',
 127         ...     'PageLabelNumStyle: DecimalArabicNumerals',
 128         ...     'PageLabelNewIndex: 4',
 129         ...     ])
 130         >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
 131         >>> pages
 132         123
 133         >>> pprint(bookmarks)  # doctest: +REPORT_UDIFF
 134         [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
 135          {'level': 2, 'page': 2, 'title': u'Section 1.1'},
 136          {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
 137          {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
 138          {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
 139          {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
 140         """
 141         pages = None
 142         bookmarks = []
 143         bookmark_info = {}
 144         bookmark_info_fields = ['title', 'level', 'page']
 145         for line in data.splitlines():
 146             try:
 147                 key,value = line.split(': ', 1)
 148             except ValueError:  # e.g. line == 'InfoBegin'
 149                 continue
 150             if key == 'NumberOfPages':
 151                 pages = int(value)
 152             elif key.startswith('Bookmark'):
 153                 k = key[len('Bookmark'):].lower()
 154                 if k in ['level', 'pagenumber']:
 155                     if k == 'pagenumber':
 156                         k = 'page'
 157                     value = int(value)
 158                 elif k == 'title':
 159                     if self._UNICODE_REGEXP.search(value):
 160                         value = self._unicode_replace(value)
 161                     else:
 162                         value = unicode(value)
 163                 bookmark_info[k] = value
 164                 ready_for_bookmark = True
 165                 for field in bookmark_info_fields:
 166                     if field not in bookmark_info:
 167                         ready_for_bookmark = False
 168                         break
 169                 if ready_for_bookmark:
 170                     bookmarks.append(bookmark_info)
 171                     bookmark_info = {}
 172         return (pages, bookmarks)
 173
 174 def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
 175     r"""
 176     >>> inputs = []
 177     >>> for pages,bookmarks in [
 178     ...         (1,
 179     ...          [{'level': 1, 'page': 1, 'title': 'Table of Contents'}]),
 180     ...         (100,
 181     ...          [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
 182     ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
 183     ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
 184     ...           {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
 185     ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
 186     ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
 187     ...         (100,
 188     ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
 189     ...           {'level': 2, 'page': 2, 'title': 'Section 2.1'},
 190     ...           {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
 191     ...           {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
 192     ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
 193     ...           {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
 194     ...         ]:
 195     ...     pdf = BookmarkedPDF()
 196     ...     pdf.pages = pages
 197     ...     pdf.bookmarks = bookmarks
 198     ...     inputs.append(pdf)
 199     >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
 200     ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
 201     ... # doctest: +REPORT_UDIFF
 202     [ /Title (My Book)
 203       /Author (Myself)
 204       /Keywords (fun, witty, interesting)
 205       /DOCINFO pdfmark
 206     [ /Title (Table of Contents) /Page 1 /OUT pdfmark
 207     [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
 208     [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
 209     [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
 210     [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark
 211     [ /Title <FEFF03B103B203B3> /Page 5 /OUT pdfmark
 212     [ /Title (Section 1.2) /Page 6 /OUT pdfmark
 213     [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
 214     [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
 215     [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
 216     [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark
 217     [ /Title <FEFF03B103B203B3> /Page 105 /OUT pdfmark
 218     [ /Title (Section 2.2) /Page 106 /OUT pdfmark
 219     <BLANKLINE>
 220     """
 221     pdfmarks = []
 222     if title or author or keywords:
 223         docinfo = []
 224         if title:
 225             docinfo.append('/Title {}'.format(_pdfmark_unicode(title)))
 226         if author:
 227             docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
 228         if keywords:
 229             docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
 230                         u', '.join(keywords))))
 231         docinfo.append('/DOCINFO pdfmark')
 232         pdfmarks.append('[ {}' .format('\n  '.join(docinfo)))
 233     bookmarks = []
 234     startpage = 0
 235     for pdf in inputs:
 236         for bookmark in pdf.bookmarks:
 237             mark = dict(bookmark)  # shallow copy
 238             mark['page'] += startpage
 239             bookmarks.append(mark)
 240         startpage += pdf.pages
 241     for i,bookmark in enumerate(bookmarks):
 242         attributes = [
 243             '/Title {}'.format(_pdfmark_unicode(bookmark['title'])),
 244             '/Page {}'.format(bookmark['page']),
 245             #'[/XYZ null null null]',  # preserve page zoom and viewport
 246             ]
 247         count = 0
 248         for bmk in bookmarks[i+1:]:
 249             if bmk['level'] == bookmark['level']:
 250                 break
 251             if bmk['level'] == bookmark['level'] + 1:
 252                 count += 1
 253         if count:
 254             attributes.append('/Count -{}'.format(count))
 255         pdfmarks.append('[ {} /OUT pdfmark'.format(' '.join(attributes)))
 256     pdfmarks.append('')  # terminal newline
 257     return '\n'.join(pdfmarks)
 258
 259
 260 def _write_pdfmark_noop_file():
 261     # By default, Ghostscript will preserve pdfmarks from the sources PDFs
 262     fd,filename = _tempfile.mkstemp(prefix='pdfmark-noop-', text=True)
 263     # Make `[... /OUT pdfmark` a no-op.
 264     _os.write(fd, """
 265 % store the original pdfmark
 266 /originalpdfmark { //pdfmark } bind def
 267
 268 % replace pdfmark with a wrapper that ignores OUT
 269 /pdfmark
 270 {
 271   {  % begin loop
 272
 273       { counttomark pop }
 274     stopped
 275       { /pdfmark errordict /unmatchedmark get exec stop }
 276     if
 277
 278     dup type /nametype ne
 279       { /pdfmark errordict /typecheck get exec stop }
 280     if
 281
 282     dup /OUT eq
 283       { (Skipping OUT pdfmark\n) print cleartomark exit }
 284     if
 285
 286     originalpdfmark exit
 287
 288   } loop
 289 } def
 290 """)
 291     _os.close(fd)
 292     return filename
 293
 294 def _write_pdfmark_restore_file():
 295     fd,filename = _tempfile.mkstemp(prefix='pdfmark-restore-', text=True)
 296     # Restore the default `[... /Out pdfmark` behaviour
 297     _os.write(fd, '/pdfmark { originalpdfmark } bind def\n')
 298     _os.close(fd)
 299     return filename
 300
 301 def _pdfmark_unicode(string):
 302     r"""
 303     >>> _pdfmark_unicode(u'ascii text with ) paren')
 304     '(ascii text with \\) paren)'
 305     >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3')
 306     '<FEFF03B103B203B3>'
 307     """
 308     try:
 309         ascii = string.encode('ascii')
 310     except UnicodeEncodeError:
 311         b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')
 312         return '<{}>'.format(''.join('{:02X}'.format(ord(byte)) for byte in b))
 313     else:
 314         # escape special characters
 315         for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'),
 316                     (u'\n', u'\\n'), (u'\t', u'\\t')]:
 317             string = string.replace(a, b)
 318         return '({})'.format(string)
 319
 320 def _pdfmark_unicode_decode(string):
 321     r"""
 322     >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3'))
 323     u'\u03b1\u03b2\u03b3'
 324     """
 325     assert string.startswith('<FEFF'), string
 326     assert string.endswith('>'), string
 327     b = ''.join(chr(int(float.fromhex(x1+x2)))
 328                 for x1,x2 in zip(string[5:-2:2], string[6:-1:2]))
 329     return unicode(b, 'utf-16-be')
 330
 331 def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
 332     fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
 333     if pdfmarks:
 334         _os.write(fd, pdfmarks)
 335     _os.close(fd)
 336     if pause_for_manual_tweaking:
 337         print('edit {} as you see fit, and press enter when ready'.format(
 338                 filename))
 339         _sys.stdin.readline()
 340     return filename
 341
 342 def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False):
 343     args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite']
 344     if output:
 345         args.append('-sOutputFile={}'.format(output))
 346     else:
 347         args.extend(['-sOutputFile=-', '-q'])
 348     if pdfmarks:
 349         mark_noop = _write_pdfmark_noop_file()
 350         args.append(mark_noop)
 351     args.extend([pdf.filename for pdf in inputs])
 352     if pdfmarks:
 353         mark_restore = _write_pdfmark_restore_file()
 354         args.append(mark_restore)
 355     markfile = _write_markfile(
 356         pdfmarks=pdfmarks, pause_for_manual_tweaking=pause_for_manual_tweaking)
 357     args.append(markfile)
 358     print('preparing to execute: {}'.format(args))
 359     invoke(args, stdout=_sys.stdout)
 360     if pdfmarks:
 361         _os.unlink(mark_noop)
 362         _os.unlink(mark_restore)
 363     _os.unlink(markfile)
 364
 365
 366 if __name__ == '__main__':
 367     import argparse
 368
 369     parser = argparse.ArgumentParser(description=__doc__)
 370     parser.add_argument('input', metavar='PDF', nargs='+',
 371                         help='an input PDF to merge')
 372     parser.add_argument(
 373         '-v', '--version', action='version',
 374         version='%(prog)s {}'.format(__version__))
 375     parser.add_argument('--ask', dest='pause_for_manual_tweaking',
 376                         action='store_const', const=True,
 377                         help='pause for manual pdfmark tweaking')
 378     parser.add_argument('--output', dest='output', default='output.pdf',
 379                         help='name of the output PDF')
 380     parser.add_argument('--title', dest='title',
 381                         help='title of output PDF')
 382     parser.add_argument('--author', dest='author',
 383                         help='author of output PDF')
 384     parser.add_argument('--keyword', metavar='KEYWORD', dest='keywords',
 385                         action='append',
 386                         help='keywords for the output PDF')
 387     parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
 388                         help='path to the pdftk executable')
 389     parser.add_argument('--gs', dest='gs', default=GS,
 390                         help='path to the gs (Ghostscript) executable')
 391     parser.add_argument('--pdfmarks', dest='pdfmarks',
 392                         help=('path to pdfmarks file.  If not given, a '
 393                               'temporary file is used.  If given and the file '
 394                               'is missing, execution will stop after the file '
 395                               'is created (before the Ghostscript run).  If '
 396                               'given and the file exists, no attempt will be '
 397                               'make to use pdftk to generate the mark file (I '
 398                               'assume your input file is what you want).'))
 399     parser.add_argument('--argv-encoding', dest='argv_encoding',
 400                         help=('Optionally override the locale encoding for '
 401                               'your command line arguments.'))
 402     parser.add_argument('--unicode', dest='convert_unicode_strings',
 403                         action='store_const', const=True,
 404                         help=(u'instead of merging PDFs, convert '
 405                               u'PDF-formatted unicode strings.  For example '
 406                               u"`--unicode '<FEFF03B103B203B3>' "
 407                               u'\u03b1\u03b2\u03b3`'))
 408
 409     args = parser.parse_args()
 410
 411     PDFTK = args.pdftk
 412     GS = args.gs
 413
 414     if args.argv_encoding:
 415         argv_encoding = args.argv_encoding
 416     else:
 417         argv_encoding = _locale.getpreferredencoding(do_setlocale=True)
 418
 419     if args.convert_unicode_strings:
 420         for string in args.input:
 421             if string.startswith('<FEFF'):
 422                 alt = _pdfmark_unicode_decode(string)
 423             else:
 424                 string = unicode(string, argv_encoding)
 425                 alt = _pdfmark_unicode(string)
 426             print(u'{} -> {}'.format(string, alt))
 427         _sys.exit(0)
 428
 429     inputs = []
 430     for filename in args.input:
 431         inputs.append(BookmarkedPDF(filename))
 432     if args.title:
 433         title = unicode(args.title, argv_encoding)
 434     else:
 435         title = None
 436     if args.author:
 437         author = unicode(args.author, argv_encoding)
 438     else:
 439         author = None
 440     if args.keywords:
 441         keywords = [unicode(k, argv_encoding) for k in args.keywords]
 442     else:
 443         keywords = None
 444     if args.pdfmarks and _os_path.isfile(args.pdfmarks):
 445         pdfmarks = open(args.pdfmarks, 'r').read()
 446     else:
 447         pdfmarks = generate_pdfmarks(
 448             inputs, title=title, author=author, keywords=keywords)
 449         if args.pdfmarks:
 450             open(args.pdfmarks, 'w').write(pdfmarks)
 451             _sys.exit(0)
 452     merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
 453                pause_for_manual_tweaking=args.pause_for_manual_tweaking)