#!/usr/bin/env python
-# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
+# Copyright (C) 2011-2012 W. Trevor King <wking@drexel.edu>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
"""Merge PDFs preserving bookmarks.
+import codecs as _codecs
+import locale as _locale
import os as _os
+import re as _re
import subprocess as _subprocess
import sys as _sys
import tempfile as _tempfile
-__version__ = '0.1'
+__version__ = '0.2'
PDFTK = 'pdftk'
class BookmarkedPDF (object):
+ _UNICODE_REGEXP = _re.compile('&#([0-9]+);')
def __init__(self, filename=None):
self.filename = filename
if self.filename:
self.pages,self.bookmarks = self._parse_dump_data(data)
- def _parse_dump_data(data):
+ def _unicode_replace_match(match):
+ return unichr(int(match.group(1)))
+ @classmethod
+ def _unicode_replace(self, string):
+ r"""
+ >>> BookmarkedPDF._unicode_replace('&#945;&#946;&#947;')
+ u'\u03b1\u03b2\u03b3'
+ return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string)
+ @classmethod
+ def _parse_dump_data(self, data):
+ r"""
>>> from pprint import pprint
- >>> data = '\\n'.join([
+ >>> data = '\n'.join([
... 'InfoKey: CreationDate',
... 'InfoValue: D:20080502020302Z',
... 'NumberOfPages: 123',
... 'BookmarkTitle: Section 1.1.2',
... 'BookmarkLevel: 3',
... 'BookmarkPageNumber: 4',
+ ... 'BookmarkTitle: &#945;&#946;&#947;&#0;',
+ ... 'BookmarkLevel: 4',
+ ... 'BookmarkPageNumber: 4',
... 'BookmarkTitle: Section 1.2',
... 'BookmarkLevel: 2',
... 'BookmarkPageNumber: 5',
>>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
>>> pages
- >>> pprint(bookmarks)
- [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
- {'level': 2, 'page': 2, 'title': 'Section 1.1'},
- {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
- {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
- {'level': 2, 'page': 5, 'title': 'Section 1.2'}]
+ >>> pprint(bookmarks) # doctest: +REPORT_UDIFF
+ [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
+ {'level': 2, 'page': 2, 'title': u'Section 1.1'},
+ {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
+ {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
+ {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
+ {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
pages = None
bookmarks = []
if k == 'pagenumber':
k = 'page'
value = int(value)
+ elif k == 'title':
+ if self._UNICODE_REGEXP.search(value):
+ value = self._unicode_replace(value[:-len('&#0;')])
+ else:
+ value = unicode(value)
bookmark_info[k] = value
ready_for_bookmark = True
for field in bookmark_info_fields:
bookmark_info = {}
return (pages, bookmarks)
def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
- """
+ r"""
>>> inputs = []
>>> for pages,bookmarks in [
... (1,
... {'level': 2, 'page': 2, 'title': 'Section 1.1'},
... {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
... {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
+ ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
... {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
... (100,
... [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
... {'level': 2, 'page': 2, 'title': 'Section 2.1'},
... {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
... {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
+ ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
... {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
... ]:
... pdf = BookmarkedPDF()
... inputs.append(pdf)
>>> print(generate_pdfmarks(inputs=inputs, title='My Book',
... author='Myself', keywords=['fun', 'witty', 'interesting']))
+ ... # doctest: +REPORT_UDIFF
[ /Title (My Book)
/Author (Myself)
/Keywords (fun, witty, interesting)
/DOCINFO pdfmark
- [ /Title (Table of Contents) /Page 1 [/XYZ null null null] /OUT pdfmark
- [ /Title (Chapter 1) /Page 2 [/XYZ null null null] /Count -2 /OUT pdfmark
- [ /Title (Section 1.1) /Page 3 [/XYZ null null null] /Count -2 /OUT pdfmark
- [ /Title (Section 1.1.1) /Page 4 [/XYZ null null null] /OUT pdfmark
- [ /Title (Section 1.1.2) /Page 5 [/XYZ null null null] /OUT pdfmark
- [ /Title (Section 1.2) /Page 6 [/XYZ null null null] /OUT pdfmark
- [ /Title (Chapter 2) /Page 102 [/XYZ null null null] /Count -2 /OUT pdfmark
- [ /Title (Section 2.1) /Page 103 [/XYZ null null null] /Count -2 /OUT pdfmark
- [ /Title (Section 2.1.1) /Page 104 [/XYZ null null null] /OUT pdfmark
- [ /Title (Section 2.1.2) /Page 105 [/XYZ null null null] /OUT pdfmark
- [ /Title (Section 2.2) /Page 106 [/XYZ null null null] /OUT pdfmark
+ [ /Title (Table of Contents) /Page 1 /OUT pdfmark
+ [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
+ [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
+ [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
+ [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark
+ [ /Title <FEFF03B103B203B3> /Page 5 /OUT pdfmark
+ [ /Title (Section 1.2) /Page 6 /OUT pdfmark
+ [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
+ [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
+ [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
+ [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark
+ [ /Title <FEFF03B103B203B3> /Page 105 /OUT pdfmark
+ [ /Title (Section 2.2) /Page 106 /OUT pdfmark
pdfmarks = []
if title or author or keywords:
docinfo = []
if title:
- docinfo.append('/Title ({})'.format(title))
+ docinfo.append('/Title {}'.format(_pdfmark_unicode(title)))
if author:
- docinfo.append('/Author ({})'.format(author))
+ docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
if keywords:
- docinfo.append('/Keywords ({})'.format(', '.join(keywords)))
+ docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
+ u', '.join(keywords))))
docinfo.append('/DOCINFO pdfmark')
pdfmarks.append('[ {}' .format('\n '.join(docinfo)))
bookmarks = []
startpage += pdf.pages
for i,bookmark in enumerate(bookmarks):
attributes = [
- '/Title ({})'.format(bookmark['title']),
+ '/Title {}'.format(_pdfmark_unicode(bookmark['title'])),
'/Page {}'.format(bookmark['page']),
#'[/XYZ null null null]', # preserve page zoom and viewport
return filename
+def _pdfmark_unicode(string):
+    r"""Format a text string as a pdfmark string operand.
+
+    Text that encodes cleanly as ASCII is returned as a parenthesized
+    literal, with backslash, parentheses, newline, carriage return and
+    tab backslash-escaped.  Any other text is returned as a `<...>`
+    hexadecimal string holding the UTF-16 big-endian byte-order mark
+    followed by the UTF-16-BE encoding of the text.
+
+    >>> _pdfmark_unicode(u'ascii text with ) paren')
+    '(ascii text with \\) paren)'
+    >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3')
+    '<FEFF03B103B203B3>'
+    """
+    try:
+        string.encode('ascii')  # probe only; the encoded result is unused
+    except UnicodeEncodeError:
+        encoded = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')
+        # bytearray iterates as integers on both Python 2 and Python 3
+        return '<{}>'.format(
+            ''.join('{:02X}'.format(octet) for octet in bytearray(encoded)))
+    # escape special characters; the backslash itself must be handled
+    # first so the backslashes added by later replacements stay intact
+    for raw, escaped in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'),
+                         (u'\n', u'\\n'), (u'\r', u'\\r'), (u'\t', u'\\t')]:
+        string = string.replace(raw, escaped)
+    return '({})'.format(string)
+def _pdfmark_unicode_decode(string):
+    r"""Decode a `<FEFF...>` hexadecimal pdfmark string back to text.
+
+    The hexadecimal digits between the leading `<FEFF` and the
+    trailing `>` are converted to bytes and decoded as UTF-16-BE.
+
+    Raises ValueError if `string` is not wrapped as `<FEFF...>`, or
+    if the payload is not a whole number of hexadecimal byte pairs
+    forming valid UTF-16-BE.
+
+    >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3'))
+    u'\u03b1\u03b2\u03b3'
+    """
+    # explicit check rather than `assert`, which is stripped under -O
+    if not (string.startswith('<FEFF') and string.endswith('>')):
+        raise ValueError(string)
+    # drop the '<FEFF' prefix and the '>' suffix, leaving hex digit pairs
+    payload = bytearray.fromhex(string[5:-1])
+    return payload.decode('utf-16-be')
def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
if pdfmarks:
parser.add_argument('--ask', dest='pause_for_manual_tweaking',
action='store_const', const=True,
help='pause for manual pdfmark tweaking')
- parser.add_argument('--output', dest='output',
+ parser.add_argument('--output', dest='output', default='output.pdf',
help='name of the output PDF')
parser.add_argument('--title', dest='title',
help='title of output PDF')
parser.add_argument('--author', dest='author',
help='author of output PDF')
- parser.add_argument('--keywords', dest='keywords',
+ parser.add_argument('--keyword', dest='keywords', action='append',
help='keywords for the output PDF')
parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
help='path to the pdftk executable')
parser.add_argument('--gs', dest='gs', default=GS,
help='path to the gs (Ghostscript) executable')
+ parser.add_argument('--argv-encoding', dest='argv_encoding',
+ help=('Optionally override the locale encoding for '
+ 'your command line arguments.'))
+ parser.add_argument('--unicode', dest='convert_unicode_strings',
+ action='store_const', const=True,
+ help=(u'instead of merging PDFs, convert '
+ u'PDF-formatted unicode strings. For example '
+ u"`--unicode '<FEFF03B103B203B3>' "
+ u'\u03b1\u03b2\u03b3`'))
args = parser.parse_args()
PDFTK = args.pdftk
GS = args.gs
+ if args.argv_encoding:
+ argv_encoding = args.argv_encoding
+ else:
+ argv_encoding = _locale.getpreferredencoding(do_setlocale=True)
+ if args.convert_unicode_strings:
+ for string in args.input:
+ if string.startswith('<FEFF'):
+ alt = _pdfmark_unicode_decode(string)
+ else:
+ string = unicode(string, argv_encoding)
+ alt = _pdfmark_unicode(string)
+ print(u'{} -> {}'.format(string, alt))
+ _sys.exit(0)
inputs = []
for filename in args.input:
+ if args.title:
+ title = unicode(args.title, argv_encoding)
+ else:
+ title = None
+ if args.author:
+ author = unicode(args.author, argv_encoding)
+ else:
+ author = None
+ if args.keywords:
+ keywords = [unicode(k, argv_encoding) for k in args.keywords]
+ else:
+ keywords = None
pdfmarks = generate_pdfmarks(
- inputs, title=args.title, author=args.author, keywords=args.keywords)
+ inputs, title=title, author=author, keywords=keywords)
merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,