pdf-merge.py: Add -v/--version option

[blog.git] / posts / PDF_bookmarks_with_Ghostscript / pdf-merge.py
diff --git a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py

index b43ea2a12fc011187eec846d96024b752c6bc8ac..4751b2f1f49522cbf6741f3f86704aae54d2a4e7 100755 (executable)
--- a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py
+++ b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py
@@ -1,6 +1,6 @@
  #!/usr/bin/env python
  #
-# Copyright (C) 2011 W. Trevor King <wking@tremily.us>
+# Copyright (C) 2011-2013 W. Trevor King <wking@drexel.edu>
  #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU Lesser General Public License as
@@ -16,16 +16,23 @@
  # License along with this program.  If not, see
  # <http://www.gnu.org/licenses/>.
  
-"""Merge PDFs perserving bookmarks.
+"""Merge PDFs preserving bookmarks.
+
+Thanks to Larry Cai for suggesting that Unicode be supported and for
+discussion about the `--pdfmarks` option.
  """
  
+import codecs as _codecs
+import locale as _locale
  import os as _os
+import os.path as _os_path
+import re as _re
  import subprocess as _subprocess
  import sys as _sys
  import tempfile as _tempfile
  
  
-__version__ = '0.1'
+__version__ = '0.2'
  
  
  PDFTK = 'pdftk'
@@ -54,6 +61,8 @@ def invoke(args, stdout=None):
  
  
  class BookmarkedPDF (object):
+    _UNICODE_REGEXP = _re.compile('&#([0-9]+);')
+
      def __init__(self, filename=None):
          self.filename = filename
          if self.filename:
@@ -64,13 +73,27 @@ class BookmarkedPDF (object):
          self.pages,self.bookmarks = self._parse_dump_data(data)
  
      @staticmethod
-    def _parse_dump_data(data):
+    def _unicode_replace_match(match):  # callback passed to _UNICODE_REGEXP.sub()
+        return unichr(int(match.group(1)))  # group 1 = the decimal digits of '&#NNN;'
+
+    @classmethod
+    def _unicode_replace(self, string):  # classmethod: `self` is bound to the class
+        r"""Return `string` with each decimal '&#NNN;' reference replaced by unichr(NNN).
+        >>> BookmarkedPDF._unicode_replace('&#945;&#946;&#947;')
+        u'\u03b1\u03b2\u03b3'
          """
+        return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string)  # one callback call per match
+
+    @classmethod
+    def _parse_dump_data(self, data):
+        r"""
          >>> from pprint import pprint
-        >>> data = '\\n'.join([
+        >>> data = '\n'.join([
+        ...     'InfoBegin',
          ...     'InfoKey: CreationDate',
          ...     'InfoValue: D:20080502020302Z',
          ...     'NumberOfPages: 123',
+        ...     'BookmarkBegin',
          ...     'BookmarkTitle: Chapter 1',
          ...     'BookmarkLevel: 1',
          ...     'BookmarkPageNumber: 1',
@@ -83,36 +106,47 @@ class BookmarkedPDF (object):
          ...     'BookmarkTitle: Section 1.1.2',
          ...     'BookmarkLevel: 3',
          ...     'BookmarkPageNumber: 4',
+        ...     'BookmarkTitle: &#945;&#946;&#947;',
+        ...     'BookmarkLevel: 4',
+        ...     'BookmarkPageNumber: 4',
          ...     'BookmarkTitle: Section 1.2',
          ...     'BookmarkLevel: 2',
          ...     'BookmarkPageNumber: 5',
+        ...     'PageLabelBegin',
          ...     'PageLabelNewIndex: 1',
          ...     'PageLabelStart: 316',
+        ...     'PageLabelPrefix:',
          ...     'PageLabelNumStyle: DecimalArabicNumerals',
          ...     'PageLabelNewIndex: 2',
          ...     'PageLabelStart: 317',
+        ...     'PageLabelPrefix:',
          ...     'PageLabelNumStyle: DecimalArabicNumerals',
          ...     'PageLabelNewIndex: 3',
          ...     'PageLabelStart: 318',
+        ...     'PageLabelPrefix:',
          ...     'PageLabelNumStyle: DecimalArabicNumerals',
          ...     'PageLabelNewIndex: 4',
          ...     ])
          >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
          >>> pages
          123
-        >>> pprint(bookmarks)
-        [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
-         {'level': 2, 'page': 2, 'title': 'Section 1.1'},
-         {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
-         {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
-         {'level': 2, 'page': 5, 'title': 'Section 1.2'}]
+        >>> pprint(bookmarks)  # doctest: +REPORT_UDIFF
+        [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
+         {'level': 2, 'page': 2, 'title': u'Section 1.1'},
+         {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
+         {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
+         {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
+         {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
          """
          pages = None
          bookmarks = []
          bookmark_info = {}
          bookmark_info_fields = ['title', 'level', 'page']
          for line in data.splitlines():
-            key,value = line.split(': ', 1)
+            try:
+                key,value = line.split(': ', 1)
+            except ValueError:  # e.g. line == 'InfoBegin'
+                continue
              if key == 'NumberOfPages':
                  pages = int(value)
              elif key.startswith('Bookmark'):
@@ -121,6 +155,11 @@ class BookmarkedPDF (object):
                      if k == 'pagenumber':
                          k = 'page'
                      value = int(value)
+                elif k == 'title':
+                    if self._UNICODE_REGEXP.search(value):
+                        value = self._unicode_replace(value)
+                    else:
+                        value = unicode(value)
                  bookmark_info[k] = value
                  ready_for_bookmark = True
                  for field in bookmark_info_fields:
@@ -132,9 +171,8 @@ class BookmarkedPDF (object):
                      bookmark_info = {}
          return (pages, bookmarks)
  
-
  def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
-    """
+    r"""
      >>> inputs = []
      >>> for pages,bookmarks in [
      ...         (1,
@@ -144,12 +182,14 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
      ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
      ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
      ...           {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
+    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
      ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
      ...         (100,
      ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
      ...           {'level': 2, 'page': 2, 'title': 'Section 2.1'},
      ...           {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
      ...           {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
+    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
      ...           {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
      ...         ]:
      ...     pdf = BookmarkedPDF()
@@ -158,32 +198,36 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
      ...     inputs.append(pdf)
      >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
      ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
+    ... # doctest: +REPORT_UDIFF
      [ /Title (My Book)
        /Author (Myself)
        /Keywords (fun, witty, interesting)
        /DOCINFO pdfmark
-    [ /Title (Table of Contents) /Page 1 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Chapter 1) /Page 2 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 1.1) /Page 3 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 1.1.1) /Page 4 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 1.1.2) /Page 5 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 1.2) /Page 6 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Chapter 2) /Page 102 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 2.1) /Page 103 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 2.1.1) /Page 104 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 2.1.2) /Page 105 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 2.2) /Page 106 [/XYZ null null null] /OUT pdfmark
+    [ /Title (Table of Contents) /Page 1 /OUT pdfmark
+    [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
+    [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
+    [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
+    [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark
+    [ /Title <FEFF03B103B203B3> /Page 5 /OUT pdfmark
+    [ /Title (Section 1.2) /Page 6 /OUT pdfmark
+    [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
+    [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
+    [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
+    [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark
+    [ /Title <FEFF03B103B203B3> /Page 105 /OUT pdfmark
+    [ /Title (Section 2.2) /Page 106 /OUT pdfmark
      <BLANKLINE>
      """
      pdfmarks = []
      if title or author or keywords:
          docinfo = []
          if title:
-            docinfo.append('/Title ({})'.format(title))
+            docinfo.append('/Title {}'.format(_pdfmark_unicode(title)))
          if author:
-            docinfo.append('/Author ({})'.format(author))
+            docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
          if keywords:
-            docinfo.append('/Keywords ({})'.format(', '.join(keywords)))
+            docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
+                        u', '.join(keywords))))
          docinfo.append('/DOCINFO pdfmark')
          pdfmarks.append('[ {}' .format('\n  '.join(docinfo)))
      bookmarks = []
@@ -196,7 +240,7 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
          startpage += pdf.pages
      for i,bookmark in enumerate(bookmarks):
          attributes = [
-            '/Title ({})'.format(bookmark['title']),
+            '/Title {}'.format(_pdfmark_unicode(bookmark['title'])),
              '/Page {}'.format(bookmark['page']),
              #'[/XYZ null null null]',  # preserve page zoom and viewport
              ]
@@ -254,6 +298,36 @@ def _write_pdfmark_restore_file():
      _os.close(fd)
      return filename
  
+def _pdfmark_unicode(string):
+    r"""Format `string` for a pdfmark: '(escaped text)' if ASCII, else '<FEFF...>' UTF-16-BE hex.
+    >>> _pdfmark_unicode(u'ascii text with ) paren')
+    '(ascii text with \\) paren)'
+    >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3')
+    '<FEFF03B103B203B3>'
+    """
+    try:
+        ascii = string.encode('ascii')  # probe only: raises for non-ASCII; the result is not used below
+    except UnicodeEncodeError:
+        b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')  # byte-order mark + big-endian UTF-16 payload
+        return '<{}>'.format(''.join('{:02X}'.format(ord(byte)) for byte in b))  # NOTE(review): ord(byte) presumes iterating `b` yields 1-char strings (Python 2 str) -- confirm
+    else:
+        # escape special characters; backslash goes first so the backslashes added later are not doubled
+        for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'),
+                    (u'\n', u'\\n'), (u'\t', u'\\t')]:
+            string = string.replace(a, b)
+        return '({})'.format(string)  # parentheses delimit the literal string form
+
+def _pdfmark_unicode_decode(string):
+    r"""Decode a '<FEFF...>' hex string, as produced by _pdfmark_unicode for non-ASCII text, to unicode.
+    >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3'))
+    u'\u03b1\u03b2\u03b3'
+    """
+    assert string.startswith('<FEFF'), string  # only the BOM-prefixed hex form is handled
+    assert string.endswith('>'), string  # the offending string is the assertion message
+    b = ''.join(chr(int(float.fromhex(x1+x2)))  # each two-digit hex pair -> one byte character
+                for x1,x2 in zip(string[5:-2:2], string[6:-1:2]))  # [5:-2:2] = first digits, [6:-1:2] = second digits; both skip '<FEFF' and '>'
+    return unicode(b, 'utf-16-be')  # payload after the BOM is decoded as big-endian UTF-16
+
  def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
      fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
      if pdfmarks:
@@ -295,31 +369,84 @@ if __name__ == '__main__':
      parser = argparse.ArgumentParser(description=__doc__)
      parser.add_argument('input', metavar='PDF', nargs='+',
                          help='an input PDF to merge')
+    parser.add_argument(
+        '-v', '--version', action='version',
+        version='%(prog)s {}'.format(__version__))
      parser.add_argument('--ask', dest='pause_for_manual_tweaking',
                          action='store_const', const=True,
                          help='pause for manual pdfmark tweaking')
-    parser.add_argument('--output', dest='output',
+    parser.add_argument('--output', dest='output', default='output.pdf',
                          help='name of the output PDF')
      parser.add_argument('--title', dest='title',
                          help='title of output PDF')
      parser.add_argument('--author', dest='author',
                          help='author of output PDF')
-    parser.add_argument('--keywords', dest='keywords',
+    parser.add_argument('--keyword', dest='keywords', action='append',
                          help='keywords for the output PDF')
      parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
                          help='path to the pdftk executable')
      parser.add_argument('--gs', dest='gs', default=GS,
                          help='path to the gs (Ghostscript) executable')
+    parser.add_argument('--pdfmarks', dest='pdfmarks',
+                        help=('path to pdfmarks file.  If not given, a '
+                              'temporary file is used.  If given and the file '
+                              'is missing, execution will stop after the file '
+                              'is created (before the Ghostscript run).  If '
+                              'given and the file exists, no attempt will be '
+                              'make to use pdftk to generate the mark file (I '
+                              'assume your input file is what you want).'))
+    parser.add_argument('--argv-encoding', dest='argv_encoding',
+                        help=('Optionally override the locale encoding for '
+                              'your command line arguments.'))
+    parser.add_argument('--unicode', dest='convert_unicode_strings',
+                        action='store_const', const=True,
+                        help=(u'instead of merging PDFs, convert '
+                              u'PDF-formatted unicode strings.  For example '
+                              u"`--unicode '<FEFF03B103B203B3>' "
+                              u'\u03b1\u03b2\u03b3`'))
  
      args = parser.parse_args()
  
      PDFTK = args.pdftk
      GS = args.gs
  
+    if args.argv_encoding:
+        argv_encoding = args.argv_encoding
+    else:
+        argv_encoding = _locale.getpreferredencoding(do_setlocale=True)
+
+    if args.convert_unicode_strings:
+        for string in args.input:
+            if string.startswith('<FEFF'):
+                alt = _pdfmark_unicode_decode(string)
+            else:
+                string = unicode(string, argv_encoding)
+                alt = _pdfmark_unicode(string)
+            print(u'{} -> {}'.format(string, alt))
+        _sys.exit(0)
+
      inputs = []
      for filename in args.input:
          inputs.append(BookmarkedPDF(filename))
-    pdfmarks = generate_pdfmarks(
-        inputs, title=args.title, author=args.author, keywords=args.keywords)
+    if args.title:
+        title = unicode(args.title, argv_encoding)
+    else:
+        title = None
+    if args.author:
+        author = unicode(args.author, argv_encoding)
+    else:
+        author = None
+    if args.keywords:
+        keywords = [unicode(k, argv_encoding) for k in args.keywords]
+    else:
+        keywords = None
+    if args.pdfmarks and _os_path.isfile(args.pdfmarks):
+        pdfmarks = open(args.pdfmarks, 'r').read()
+    else:
+        pdfmarks = generate_pdfmarks(
+            inputs, title=title, author=author, keywords=keywords)
+        if args.pdfmarks:
+            open(args.pdfmarks, 'w').write(pdfmarks)
+            _sys.exit(0)
      merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
                 pause_for_manual_tweaking=args.pause_for_manual_tweaking)