3 # Copyright (C) 2011-2013 W. Trevor King <wking@drexel.edu>
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Lesser General Public License as
7 # published by the Free Software Foundation, either version 3 of the
8 # License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # Lesser General Public License for more details.
15 # You should have received a copy of the GNU Lesser General Public
16 # License along with this program. If not, see
17 # <http://www.gnu.org/licenses/>.
19 """Merge PDFs preserving bookmarks.
21 Thanks to Larry Cai for suggesting that Unicode be supported and for
22 discussion about the `--pdfmarks` option.
25 import codecs as _codecs
26 import locale as _locale
28 import os.path as _os_path
30 import subprocess as _subprocess
32 import tempfile as _tempfile
def invoke(args, stdout=None):
    """Run an external command and return its captured standard output.

    ``args`` is the argument vector passed to ``subprocess.Popen``; no
    shell is involved.  When ``stdout`` is ``None`` the command's
    standard output is captured and returned.  Otherwise the output is
    sent to the given file object and ``None`` is returned.  Standard
    error is always forwarded to ``sys.stderr``.

    Raises ``OSError`` when the command cannot be started and
    ``subprocess.CalledProcessError`` when it exits with a non-zero
    status.

    >>> invoke(['echo', 'hi', 'there'])
    'hi there\\n'
    >>> invoke(['this command does not exist'])
    Traceback (most recent call last):
      ...
    OSError: [Errno 2] No such file or directory
    """
    if stdout is None:
        # Nothing was supplied to receive the output, so capture it.
        stdout = _subprocess.PIPE
    process = _subprocess.Popen(
        args, stdin=_subprocess.PIPE, stdout=stdout, stderr=_sys.stderr,
        shell=False, cwd='.')
    # communicate() closes stdin, drains the pipes and waits for exit.
    stdout_, _ = process.communicate()
    status = process.returncode
    if status != 0:
        # An explicit raise survives ``python -O``, unlike ``assert``.
        raise _subprocess.CalledProcessError(status, args)
    return stdout_
class BookmarkedPDF (object):
    """A PDF file together with its page count and bookmark outline.

    ``pages`` is the number of pages and ``bookmarks`` a list of
    ``{'title': ..., 'level': ..., 'page': ...}`` dictionaries.  Both are
    filled in by `get_bookmarks`, which runs automatically when a
    filename is passed to the constructor.
    """
    # Matches XML numeric character references such as ``&#945;``, the
    # form in which non-ASCII bookmark titles appear in the dump data.
    _UNICODE_REGEXP = _re.compile('&#([0-9]+);')

    def __init__(self, filename=None):
        self.filename = filename
        # Defined up front so an instance created without a filename
        # still has both attributes.
        self.pages = None
        self.bookmarks = []
        if self.filename:
            self.get_bookmarks()

    def get_bookmarks(self):
        """Run ``pdftk ... dump_data`` on the file and store the result."""
        data = invoke([PDFTK, self.filename, 'dump_data'])
        self.pages,self.bookmarks = self._parse_dump_data(data)

    @staticmethod
    def _unicode_replace_match(match):
        """Return the character named by one numeric reference match."""
        code_point = int(match.group(1))
        try:
            return unichr(code_point)
        except NameError:  # Python 3: chr() already returns text
            return chr(code_point)

    @classmethod
    def _unicode_replace(cls, string):
        r"""Replace every ``&#NNN;`` reference in ``string``.

        >>> BookmarkedPDF._unicode_replace('&#945;&#946;&#947;')
        u'\u03b1\u03b2\u03b3'
        """
        return cls._UNICODE_REGEXP.sub(cls._unicode_replace_match, string)

    @classmethod
    def _parse_dump_data(cls, data):
        r"""Extract ``(pages, bookmarks)`` from pdftk ``dump_data`` output.

        ``pages`` is the integer after ``NumberOfPages`` (``None`` if the
        key is absent).  ``bookmarks`` lists one dictionary per bookmark
        with integer ``level`` and ``page`` and a text ``title``.  Lines
        that are not ``Key: value`` pairs are ignored, except that a
        ``BookmarkBegin`` line discards any incomplete record so its
        fields cannot leak into the next bookmark.

        (The doctest output below uses Python 2 ``repr`` notation.)

        >>> from pprint import pprint
        >>> data = '\n'.join([
        ...     'InfoBegin',
        ...     'InfoKey: CreationDate',
        ...     'InfoValue: D:20080502020302Z',
        ...     'NumberOfPages: 123',
        ...     'BookmarkTitle: Chapter 1',
        ...     'BookmarkLevel: 1',
        ...     'BookmarkPageNumber: 1',
        ...     'BookmarkTitle: Section 1.1',
        ...     'BookmarkLevel: 2',
        ...     'BookmarkPageNumber: 2',
        ...     'BookmarkTitle: Section 1.1.1',
        ...     'BookmarkLevel: 3',
        ...     'BookmarkPageNumber: 3',
        ...     'BookmarkTitle: Section 1.1.2',
        ...     'BookmarkLevel: 3',
        ...     'BookmarkPageNumber: 4',
        ...     'BookmarkTitle: &#945;&#946;&#947;',
        ...     'BookmarkLevel: 4',
        ...     'BookmarkPageNumber: 4',
        ...     'BookmarkTitle: Section 1.2',
        ...     'BookmarkLevel: 2',
        ...     'BookmarkPageNumber: 5',
        ...     'PageLabelBegin',
        ...     'PageLabelNewIndex: 1',
        ...     'PageLabelStart: 316',
        ...     'PageLabelPrefix:',
        ...     'PageLabelNumStyle: DecimalArabicNumerals',
        ...     'PageLabelNewIndex: 2',
        ...     'PageLabelStart: 317',
        ...     'PageLabelPrefix:',
        ...     'PageLabelNumStyle: DecimalArabicNumerals',
        ...     'PageLabelNewIndex: 3',
        ...     'PageLabelStart: 318',
        ...     'PageLabelPrefix:',
        ...     'PageLabelNumStyle: DecimalArabicNumerals',
        ...     'PageLabelNewIndex: 4',
        ...     ])
        >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data)
        >>> pages
        123
        >>> pprint(bookmarks)  # doctest: +REPORT_UDIFF
        [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
         {'level': 2, 'page': 2, 'title': u'Section 1.1'},
         {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
         {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
         {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
         {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
        """
        if isinstance(data, bytes) and not isinstance(data, str):
            # Python 3 hands over subprocess output as bytes.
            data = data.decode('ascii')
        pages = None
        bookmarks = []
        bookmark_info = {}
        bookmark_info_fields = ('title', 'level', 'page')
        for line in data.splitlines():
            if line.strip() == 'BookmarkBegin':
                # Explicit record delimiter: drop any half-read bookmark.
                bookmark_info = {}
                continue
            key, separator, value = line.partition(': ')
            if not separator:  # e.g. line == 'InfoBegin'
                continue
            if key == 'NumberOfPages':
                pages = int(value)
            elif key.startswith('Bookmark'):
                k = key[len('Bookmark'):].lower()
                if k in ('level', 'pagenumber'):
                    if k == 'pagenumber':
                        k = 'page'
                    value = int(value)
                elif k == 'title':
                    if isinstance(value, bytes):
                        # Python 2 byte string -> text, as unicode() did.
                        value = value.decode('ascii')
                    value = cls._unicode_replace(value)
                bookmark_info[k] = value
                # A bookmark is emitted once all three fields have been
                # seen, whatever order they arrived in.
                if all(field in bookmark_info
                       for field in bookmark_info_fields):
                    bookmarks.append(bookmark_info)
                    bookmark_info = {}
        return (pages, bookmarks)
def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None):
    r"""Build the pdfmark text describing the merged document.

    ``inputs`` is an iterable of objects with ``pages`` (page count) and
    ``bookmarks`` (dictionaries with ``title``, ``level`` and ``page``)
    attributes, in merge order.  Bookmark page numbers are shifted by the
    total page count of the preceding inputs.  ``title``, ``author`` and
    ``keywords`` (a list of strings), when given, are emitted as a
    ``/DOCINFO`` pdfmark.  The returned string ends with a newline
    whenever it is non-empty.

    (The doctest below uses Python 2 unicode literals.)

    >>> inputs = []
    >>> for pages,bookmarks in [
    ...         (1,
    ...          [{'level': 1, 'page': 1, 'title': 'Table of Contents'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
    ...           {'level': 2, 'page': 2, 'title': 'Section 1.1'},
    ...           {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
    ...           {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
    ...           {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
    ...         (100,
    ...          [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
    ...           {'level': 2, 'page': 2, 'title': 'Section 2.1'},
    ...           {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
    ...           {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
    ...           {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
    ...           {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
    ...         ]:
    ...     pdf = BookmarkedPDF()
    ...     pdf.pages = pages
    ...     pdf.bookmarks = bookmarks
    ...     inputs.append(pdf)
    >>> print(generate_pdfmarks(inputs=inputs, title='My Book',
    ...     author='Myself', keywords=['fun', 'witty', 'interesting']))
    ... # doctest: +REPORT_UDIFF
    [ /Title (My Book)
     /Author (Myself)
     /Keywords (fun, witty, interesting)
     /DOCINFO pdfmark
    [ /Title (Table of Contents) /Page 1 /OUT pdfmark
    [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark
    [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark
    [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark
    [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark
    [ /Title <FEFF03B103B203B3> /Page 5 /OUT pdfmark
    [ /Title (Section 1.2) /Page 6 /OUT pdfmark
    [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark
    [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark
    [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark
    [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark
    [ /Title <FEFF03B103B203B3> /Page 105 /OUT pdfmark
    [ /Title (Section 2.2) /Page 106 /OUT pdfmark
    <BLANKLINE>
    """
    pdfmarks = []
    if title or author or keywords:
        docinfo = []
        if title:
            docinfo.append('/Title {}'.format(_pdfmark_unicode(title)))
        if author:
            docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
        if keywords:
            docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
                        u', '.join(keywords))))
        docinfo.append('/DOCINFO pdfmark')
        pdfmarks.append('[ {}' .format('\n '.join(docinfo)))

    # Flatten every input's bookmarks into one list, renumbering pages
    # so they refer to the merged document.
    bookmarks = []
    startpage = 0
    for pdf in inputs:
        for bookmark in pdf.bookmarks:
            mark = dict(bookmark)  # shallow copy; leave the input intact
            mark['page'] += startpage
            bookmarks.append(mark)
        startpage += pdf.pages

    for i,bookmark in enumerate(bookmarks):
        attributes = [
            '/Title {}'.format(_pdfmark_unicode(bookmark['title'])),
            '/Page {}'.format(bookmark['page']),
            # NOTE: '[/XYZ null null null]' could be added here to
            # preserve page zoom and viewport.
            ]
        # Count the direct children: entries one level deeper that occur
        # before the subtree ends.  The subtree ends at the first later
        # entry that is at the same level *or shallower*.
        count = 0
        for later in bookmarks[i+1:]:
            if later['level'] <= bookmark['level']:
                break
            if later['level'] == bookmark['level'] + 1:
                count += 1
        if count:
            attributes.append('/Count -{}'.format(count))
        pdfmarks.append('[ {} /OUT pdfmark'.format(' '.join(attributes)))
    pdfmarks.append('')  # terminal newline
    return '\n'.join(pdfmarks)
def _write_pdfmark_noop_file():
    """Write a PostScript prologue that disables ``/OUT`` pdfmarks.

    The prologue saves the current ``pdfmark`` operator as
    ``originalpdfmark`` and installs a wrapper that discards ``/OUT``
    marks and forwards everything else.  Returns the name of the
    temporary file; the caller is responsible for deleting it.
    """
    # By default, Ghostscript will preserve pdfmarks from the source PDFs
    fd,filename = _tempfile.mkstemp(prefix='pdfmark-noop-', text=True)
    # Make `[... /OUT pdfmark` a no-op.
    # Wrapping the descriptor in a file object writes text on both
    # Python 2 and 3 and guarantees the descriptor is closed.
    with _os.fdopen(fd, 'w') as stream:
        stream.write("""
% store the original pdfmark
/originalpdfmark { //pdfmark } bind def

% replace pdfmark with a wrapper that ignores OUT
/pdfmark
{
  {  % begin loop

      { counttomark pop }
    stopped
      { /pdfmark errordict /unmatchedmark get exec stop }
    if

    dup type /nametype ne
      { /pdfmark errordict /typecheck get exec stop }
    if

    dup /OUT eq
      { (Skipping OUT pdfmark\n) print cleartomark exit }
    if

    originalpdfmark exit

  } loop
} def
""")
    return filename
def _write_pdfmark_restore_file():
    """Write a PostScript snippet that re-enables ``/OUT`` pdfmarks.

    The snippet rebinds ``pdfmark`` to ``originalpdfmark``, so it only
    makes sense after a file that defined ``originalpdfmark``.  Returns
    the name of the temporary file; the caller is responsible for
    deleting it.
    """
    fd,filename = _tempfile.mkstemp(prefix='pdfmark-restore-', text=True)
    # Restore the default `[... /OUT pdfmark` behaviour.
    # A file object writes text on both Python 2 and 3 and closes the
    # descriptor even if the write fails.
    with _os.fdopen(fd, 'w') as stream:
        stream.write('/pdfmark { originalpdfmark } bind def\n')
    return filename
def _pdfmark_unicode(string):
    r"""Format text as a PostScript/PDF string for use in a pdfmark.

    Pure-ASCII text becomes a parenthesised literal with backslashes,
    parentheses, newlines, carriage returns and tabs escaped.  Anything
    else becomes a hexadecimal string holding the UTF-16BE encoding
    preceded by its byte-order mark.

    >>> _pdfmark_unicode(u'ascii text with ) paren')
    '(ascii text with \\) paren)'
    >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3')
    '<FEFF03B103B203B3>'
    """
    try:
        string.encode('ascii')
    except UnicodeEncodeError:
        b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')
        # bytearray yields integers on both Python 2 and Python 3.
        return '<{}>'.format(
            ''.join('{:02X}'.format(byte) for byte in bytearray(b)))
    # Escape special characters.  The backslash must be handled first so
    # the backslashes introduced by later replacements are not doubled.
    for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'),
                (u'\n', u'\\n'), (u'\r', u'\\r'), (u'\t', u'\\t')]:
        string = string.replace(a, b)
    return '({})'.format(string)
def _pdfmark_unicode_decode(string):
    r"""Decode a ``<FEFF...>`` hexadecimal pdfmark string back to text.

    ``string`` must start with ``<FEFF`` (the UTF-16BE byte-order mark)
    and end with ``>``; otherwise ``ValueError`` is raised.  The hex
    digits in between are decoded as UTF-16BE.

    (The doctest output below uses Python 2 ``repr`` notation.)

    >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3'))
    u'\u03b1\u03b2\u03b3'
    >>> _pdfmark_unicode_decode('<FEFF03B103B203B3>')
    u'\u03b1\u03b2\u03b3'
    """
    import binascii as _binascii

    prefix = '<FEFF'
    if not (string.startswith(prefix) and string.endswith('>')):
        # Raised explicitly so the check survives ``python -O``.
        raise ValueError(
            'not a UTF-16BE pdfmark hex string: {!r}'.format(string))
    payload = _binascii.unhexlify(string[len(prefix):-1])
    return payload.decode('utf-16-be')
def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
    """Write ``pdfmarks`` to a temporary file and return its name.

    A falsy ``pdfmarks`` produces an empty file.  When
    ``pause_for_manual_tweaking`` is true, the file name is printed and
    the function blocks until a line is read from standard input, giving
    the user a chance to edit the file.  The caller is responsible for
    deleting the file.
    """
    fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
    # A file object writes text on both Python 2 and 3 and closes the
    # descriptor even if the write fails.
    with _os.fdopen(fd, 'w') as stream:
        if pdfmarks:
            stream.write(pdfmarks)
    if pause_for_manual_tweaking:
        print('edit {} as you see fit, and press enter when ready'.format(
                filename))
        _sys.stdin.readline()
    return filename
def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False):
    """Concatenate ``inputs`` into one PDF with Ghostscript.

    ``inputs`` is a sequence of objects with a ``filename`` attribute.
    ``output`` is the destination path; a falsy value sends the PDF to
    standard output and runs Ghostscript quietly.  When ``pdfmarks`` is
    given, the ``/OUT`` pdfmarks of the source files are suppressed and
    ``pdfmarks`` is appended after them instead.
    ``pause_for_manual_tweaking`` is passed on to `_write_markfile`.

    Temporary files are removed even when Ghostscript fails.
    """
    args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite']
    if output:
        args.append('-sOutputFile={}'.format(output))
    else:
        args.extend(['-sOutputFile=-', '-q'])
    temporary_files = []
    try:
        if pdfmarks:
            # Must precede the inputs so their /OUT marks are ignored.
            mark_noop = _write_pdfmark_noop_file()
            temporary_files.append(mark_noop)
            args.append(mark_noop)
        args.extend([pdf.filename for pdf in inputs])
        if pdfmarks:
            # Re-enable /OUT before feeding in our own marks.
            mark_restore = _write_pdfmark_restore_file()
            temporary_files.append(mark_restore)
            args.append(mark_restore)
            markfile = _write_markfile(
                pdfmarks=pdfmarks,
                pause_for_manual_tweaking=pause_for_manual_tweaking)
            temporary_files.append(markfile)
            args.append(markfile)
        message = 'preparing to execute: {}'.format(args)
        if output:
            print(message)
        else:
            # The PDF itself goes to stdout; keep diagnostics out of it.
            _sys.stderr.write(message + '\n')
        invoke(args, stdout=_sys.stdout)
    finally:
        for filename in temporary_files:
            _os.unlink(filename)
if __name__ == '__main__':
    # Command-line entry point: parse the options, then either convert
    # pdfmark strings (--unicode) or merge the given PDFs.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input', metavar='PDF', nargs='+',
                        help='an input PDF to merge')
    parser.add_argument(
        '-v', '--version', action='version',
        version='%(prog)s {}'.format(__version__))
    parser.add_argument('--ask', dest='pause_for_manual_tweaking',
                        action='store_const', const=True,
                        help='pause for manual pdfmark tweaking')
    parser.add_argument('--output', dest='output', default='output.pdf',
                        help='name of the output PDF')
    parser.add_argument('--title', dest='title',
                        help='title of output PDF')
    parser.add_argument('--author', dest='author',
                        help='author of output PDF')
    parser.add_argument('--keyword', metavar='KEYWORD', dest='keywords',
                        action='append',
                        help='keywords for the output PDF')
    parser.add_argument('--pdftk', dest='pdftk', default=PDFTK,
                        help='path to the pdftk executable')
    parser.add_argument('--gs', dest='gs', default=GS,
                        help='path to the gs (Ghostscript) executable')
    parser.add_argument('--pdfmarks', dest='pdfmarks',
                        help=('path to pdfmarks file. If not given, a '
                              'temporary file is used. If given and the file '
                              'is missing, execution will stop after the file '
                              'is created (before the Ghostscript run). If '
                              'given and the file exists, no attempt will be '
                              'make to use pdftk to generate the mark file (I '
                              'assume your input file is what you want).'))
    parser.add_argument('--argv-encoding', dest='argv_encoding',
                        help=('Optionally override the locale encoding for '
                              'your command line arguments.'))
    parser.add_argument('--unicode', dest='convert_unicode_strings',
                        action='store_const', const=True,
                        help=(u'instead of merging PDFs, convert '
                              u'PDF-formatted unicode strings. For example '
                              u"`--unicode '<FEFF03B103B203B3>' "
                              u'\u03b1\u03b2\u03b3`'))

    args = parser.parse_args()

    # Let the command line override the executables used by the helpers.
    PDFTK = args.pdftk
    GS = args.gs

    argv_encoding = (args.argv_encoding or
                     _locale.getpreferredencoding(do_setlocale=True))

    def _decode_argument(value):
        """Return a command-line argument as text."""
        if isinstance(value, bytes):
            # Python 2 passes arguments as byte strings.
            return value.decode(argv_encoding)
        return value  # Python 3 has already decoded it

    if args.convert_unicode_strings:
        for string in args.input:
            if string.startswith('<FEFF'):
                alt = _pdfmark_unicode_decode(string)
            else:
                string = _decode_argument(string)
                alt = _pdfmark_unicode(string)
            print(u'{} -> {}'.format(string, alt))
        _sys.exit(0)

    use_existing_marks = bool(
        args.pdfmarks and _os_path.isfile(args.pdfmarks))

    inputs = []
    for filename in args.input:
        if use_existing_marks:
            # The user supplied the marks, so skip the pdftk run that
            # BookmarkedPDF(filename) would trigger; only the file name
            # is needed for merging.
            pdf = BookmarkedPDF()
            pdf.filename = filename
        else:
            pdf = BookmarkedPDF(filename)
        inputs.append(pdf)

    title = _decode_argument(args.title) if args.title else None
    author = _decode_argument(args.author) if args.author else None
    if args.keywords:
        keywords = [_decode_argument(k) for k in args.keywords]
    else:
        keywords = None

    if use_existing_marks:
        with open(args.pdfmarks, 'r') as stream:
            pdfmarks = stream.read()
    else:
        pdfmarks = generate_pdfmarks(
            inputs, title=title, author=author, keywords=keywords)
        if args.pdfmarks:
            # Save the generated marks for editing and stop before the
            # Ghostscript run, as the --pdfmarks help text promises.
            with open(args.pdfmarks, 'w') as stream:
                stream.write(pdfmarks)
            _sys.exit(0)
    merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
               pause_for_manual_tweaking=args.pause_for_manual_tweaking)