Handle Unicode strings in pdf-merge.py.

author W. Trevor King <wking@drexel.edu>

Wed, 8 Feb 2012 01:51:09 +0000 (20:51 -0500)

committer W. Trevor King <wking@drexel.edu>

Wed, 8 Feb 2012 14:55:13 +0000 (09:55 -0500)
author W. Trevor King <wking@drexel.edu>
Wed, 8 Feb 2012 01:51:09 +0000 (20:51 -0500)
committer W. Trevor King <wking@drexel.edu>
Wed, 8 Feb 2012 14:55:13 +0000 (09:55 -0500)
diff --git a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py

index 5586e021db0ec638d5f8a8a6d160ea7c415435fa..503fc8fd3ad799de017b7795d3e2f52a04f58b9a 100755 (executable)
--- a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py
+++ b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py
@@ -1,6 +1,6 @@
  #!/usr/bin/env python
  #
-# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
+# Copyright (C) 2011-2012 W. Trevor King <wking@drexel.edu>
  #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU Lesser General Public License as
@@ -19,13 +19,16 @@
  """Merge PDFs perserving bookmarks.
  """
  
+import codecs as _codecs
+import locale as _locale
  import os as _os
+import re as _re
  import subprocess as _subprocess
  import sys as _sys
  import tempfile as _tempfile
  
  
-__version__ = '0.1'
+__version__ = '0.2'
  
  
  PDFTK = 'pdftk'
@@ -54,6 +57,8 @@ def invoke(args, stdout=None):
  
  
  class BookmarkedPDF (object):
+    _UNICODE_REGEXP = _re.compile('&#([0-9]+);')
+
      def __init__(self, filename=None):
          self.filename = filename
          if self.filename:
@@ -64,10 +69,22 @@ class BookmarkedPDF (object):
          self.pages,self.bookmarks = self._parse_dump_data(data)
  
      @staticmethod
-    def _parse_dump_data(data):
+    def _unicode_replace_match(match):
+        return unichr(int(match.group(1)))
+
+    @classmethod
+    def _unicode_replace(self, string):
+        r"""
+        >>> BookmarkedPDF._unicode_replace('&#945;&#946;&#947;')
+        u'\u03b1\u03b2\u03b3'
          """
+        return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string)
+
+    @classmethod
+    def _parse_dump_data(self, data):
+        r"""
          >>> from pprint import pprint
-        >>> data = '\\n'.join([
+        >>> data = '\n'.join([
          ...     'InfoKey: CreationDate',
          ...     'InfoValue: D:20080502020302Z',
          ...     'NumberOfPages: 123',
@@ -83,6 +100,9 @@ class BookmarkedPDF (object):
          ...     'BookmarkTitle: Section 1.1.2',
          ...     'BookmarkLevel: 3',
          ...     'BookmarkPageNumber: 4',
+        ...     'BookmarkTitle: &#945;&#946;&#947;&#0;',
+        ...     'BookmarkLevel: 4',
+        ...     'BookmarkPageNumber: 4',
          ...     'BookmarkTitle: Section 1.2',
          ...     'BookmarkLevel: 2',
          ...     'BookmarkPageNumber: 5',
@@ -100,12 +120,13 @@ class BookmarkedPDF (object):
          >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
          >>> pages
          123
-        >>> pprint(bookmarks)
-        [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
-         {'level': 2, 'page': 2, 'title': 'Section 1.1'},
-         {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
-         {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
-         {'level': 2, 'page': 5, 'title': 'Section 1.2'}]
+        >>> pprint(bookmarks)  # doctest: +REPORT_UDIFF
+        [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
+         {'level': 2, 'page': 2, 'title': u'Section 1.1'},
+         {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
+         {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
+         {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
+         {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
          """
          pages = None
          bookmarks = []
@@ -121,6 +142,11 @@ class BookmarkedPDF (object):
                      if k == 'pagenumber':
                          k = 'page'
                      value = int(value)
+                elif k == 'title':
+                    # Strip pdftk's '&#0;' terminator only if really present.
+                    if value.endswith('&#0;'):
+                        value = value[:-len('&#0;')]
+                    value = unicode(self._unicode_replace(value))
                  bookmark_info[k] = value
                  ready_for_bookmark = True
                  for field in bookmark_info_fields:
@@ -132,9 +158,8 @@ class BookmarkedPDF (object):
                      bookmark_info = {}
          return (pages, bookmarks)
  
-
  def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
-    """
+    r"""
      >>> inputs = []
      >>> for pages,bookmarks in [
      ...         (1,
@@ -144,12 +169,14 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
      ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
      ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
      ...           {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
+    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
      ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
      ...         (100,
      ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
      ...           {'level': 2, 'page': 2, 'title': 'Section 2.1'},
      ...           {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
      ...           {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
+    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
      ...           {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
      ...         ]:
      ...     pdf = BookmarkedPDF()
@@ -158,32 +185,36 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
      ...     inputs.append(pdf)
      >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
      ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
+    ... # doctest: +REPORT_UDIFF
      [ /Title (My Book)
        /Author (Myself)
        /Keywords (fun, witty, interesting)
        /DOCINFO pdfmark
-    [ /Title (Table of Contents) /Page 1 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Chapter 1) /Page 2 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 1.1) /Page 3 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 1.1.1) /Page 4 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 1.1.2) /Page 5 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 1.2) /Page 6 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Chapter 2) /Page 102 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 2.1) /Page 103 [/XYZ null null null] /Count -2 /OUT pdfmark
-    [ /Title (Section 2.1.1) /Page 104 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 2.1.2) /Page 105 [/XYZ null null null] /OUT pdfmark
-    [ /Title (Section 2.2) /Page 106 [/XYZ null null null] /OUT pdfmark
+    [ /Title (Table of Contents) /Page 1 /OUT pdfmark
+    [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
+    [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
+    [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
+    [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark
+    [ /Title <FEFF03B103B203B3> /Page 5 /OUT pdfmark
+    [ /Title (Section 1.2) /Page 6 /OUT pdfmark
+    [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
+    [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
+    [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
+    [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark
+    [ /Title <FEFF03B103B203B3> /Page 105 /OUT pdfmark
+    [ /Title (Section 2.2) /Page 106 /OUT pdfmark
      <BLANKLINE>
      """
      pdfmarks = []
      if title or author or keywords:
          docinfo = []
          if title:
-            docinfo.append('/Title ({})'.format(title))
+            docinfo.append('/Title {}'.format(_pdfmark_unicode(title)))
          if author:
-            docinfo.append('/Author ({})'.format(author))
+            docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
          if keywords:
-            docinfo.append('/Keywords ({})'.format(', '.join(keywords)))
+            docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
+                        u', '.join(keywords))))
          docinfo.append('/DOCINFO pdfmark')
          pdfmarks.append('[ {}' .format('\n  '.join(docinfo)))
      bookmarks = []
@@ -196,7 +227,7 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
          startpage += pdf.pages
      for i,bookmark in enumerate(bookmarks):
          attributes = [
-            '/Title ({})'.format(bookmark['title']),
+            '/Title {}'.format(_pdfmark_unicode(bookmark['title'])),
              '/Page {}'.format(bookmark['page']),
              #'[/XYZ null null null]',  # preserve page zoom and viewport
              ]
@@ -254,6 +285,36 @@ def _write_pdfmark_restore_file():
      _os.close(fd)
      return filename
  
+def _pdfmark_unicode(string):
+    r"""Format `string` as a pdfmark string: '(...)' if ASCII, else hex.
+    >>> _pdfmark_unicode(u'ascii text with ) paren')
+    '(ascii text with \\) paren)'
+    >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3')
+    '<FEFF03B103B203B3>'
+    """
+    try:
+        ascii = string.encode('ascii')  # result unused: probes encodability
+    except UnicodeEncodeError:  # non-ASCII: hex of BOM + UTF-16-BE bytes
+        b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')
+        return '<{}>'.format(''.join('{:02X}'.format(ord(byte)) for byte in b))
+    else:
+        # escape special characters (backslash first, to avoid re-escaping)
+        for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'),
+                    (u'\n', u'\\n'), (u'\t', u'\\t')]:
+            string = string.replace(a, b)
+        return '({})'.format(string)
+
+def _pdfmark_unicode_decode(string):
+    r"""Decode a '<FEFF...>' hex string (see `_pdfmark_unicode`) to unicode.
+    >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3'))
+    u'\u03b1\u03b2\u03b3'
+    """
+    if not (string.startswith('<FEFF') and string.endswith('>')):
+        raise ValueError(string)  # not an assert: must survive `python -O`
+    b = ''.join(chr(int(x1+x2, 16))  # parse each hex byte pair directly
+                for x1,x2 in zip(string[5:-2:2], string[6:-1:2]))
+    return unicode(b, 'utf-16-be')
+
  def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
      fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
      if pdfmarks:
@@ -298,28 +359,64 @@ if __name__ == '__main__':
      parser.add_argument('--ask', dest='pause_for_manual_tweaking',
                          action='store_const', const=True,
                          help='pause for manual pdfmark tweaking')
-    parser.add_argument('--output', dest='output',
+    parser.add_argument('--output', dest='output', default='output.pdf',
                          help='name of the output PDF')
      parser.add_argument('--title', dest='title',
                          help='title of output PDF')
      parser.add_argument('--author', dest='author',
                          help='author of output PDF')
-    parser.add_argument('--keywords', dest='keywords',
+    parser.add_argument('--keyword', dest='keywords', action='append',
                          help='keywords for the output PDF')
      parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
                          help='path to the pdftk executable')
      parser.add_argument('--gs', dest='gs', default=GS,
                          help='path to the gs (Ghostscript) executable')
+    parser.add_argument('--argv-encoding', dest='argv_encoding',
+                        help=('Optionally override the locale encoding for '
+                              'your command line arguments.'))
+    parser.add_argument('--unicode', dest='convert_unicode_strings',
+                        action='store_const', const=True,
+                        help=(u'instead of merging PDFs, convert '
+                              u'PDF-formatted unicode strings.  For example '
+                              u"`--unicode '<FEFF03B103B203B3>' "
+                              u'\u03b1\u03b2\u03b3`'))
  
      args = parser.parse_args()
  
      PDFTK = args.pdftk
      GS = args.gs
  
+    if args.argv_encoding:
+        argv_encoding = args.argv_encoding
+    else:
+        argv_encoding = _locale.getpreferredencoding(do_setlocale=True)
+
+    if args.convert_unicode_strings:
+        for string in args.input:
+            if string.startswith('<FEFF'):
+                alt = _pdfmark_unicode_decode(string)
+            else:
+                string = unicode(string, argv_encoding)
+                alt = _pdfmark_unicode(string)
+            print(u'{} -> {}'.format(string, alt))
+        _sys.exit(0)
+
      inputs = []
      for filename in args.input:
          inputs.append(BookmarkedPDF(filename))
+    if args.title:
+        title = unicode(args.title, argv_encoding)
+    else:
+        title = None
+    if args.author:
+        author = unicode(args.author, argv_encoding)
+    else:
+        author = None
+    if args.keywords:
+        keywords = [unicode(k, argv_encoding) for k in args.keywords]
+    else:
+        keywords = None
      pdfmarks = generate_pdfmarks(
-        inputs, title=args.title, author=args.author, keywords=args.keywords)
+        inputs, title=title, author=author, keywords=keywords)
      merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
                 pause_for_manual_tweaking=args.pause_for_manual_tweaking)
author	W. Trevor King <wking@drexel.edu>
	Wed, 8 Feb 2012 01:51:09 +0000 (20:51 -0500)
committer	W. Trevor King <wking@drexel.edu>
	Wed, 8 Feb 2012 14:55:13 +0000 (09:55 -0500)