From: W. Trevor King <wking@tremily.us> Date: Tue, 29 Jan 2013 16:23:46 +0000 (-0500) Subject: pdf-merge.py: Update dump parser for pdftk v1.45 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=71c7be5a402c93f8e2923be52aecee583fe499b9;p=mw2txt.git pdf-merge.py: Update dump parser for pdftk v1.45 On Tue, Jan 29, 2013 at 08:15:57AM -0800, Tsung-Han Yang wrote: > I've attached the dumpdata message. I've just installed pdftk on > windows yesterday. I've attached the diff file with -u, too. > ... > It looks like my pdf has some metadata like "InfoBegin" without ":" > in the line. The new lines in the dump are due to the addition of m_begin_mark in pdftk between v1.44.1 and v1.45: Sid Steward wrote: > Added record delimiters to dump_data output to help make parsing > more reliable. --- diff --git a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py index 6266fe4..be6e08a 100755 --- a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py +++ b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py @@ -89,9 +89,11 @@ class BookmarkedPDF (object): r""" >>> from pprint import pprint >>> data = '\n'.join([ + ... 'InfoBegin', ... 'InfoKey: CreationDate', ... 'InfoValue: D:20080502020302Z', ... 'NumberOfPages: 123', + ... 'BookmarkBegin', ... 'BookmarkTitle: Chapter 1', ... 'BookmarkLevel: 1', ... 'BookmarkPageNumber: 1', @@ -110,14 +112,18 @@ class BookmarkedPDF (object): ... 'BookmarkTitle: Section 1.2', ... 'BookmarkLevel: 2', ... 'BookmarkPageNumber: 5', + ... 'PageLabelBegin', ... 'PageLabelNewIndex: 1', ... 'PageLabelStart: 316', + ... 'PageLabelPrefix:', ... 'PageLabelNumStyle: DecimalArabicNumerals', ... 'PageLabelNewIndex: 2', ... 'PageLabelStart: 317', + ... 'PageLabelPrefix:', ... 'PageLabelNumStyle: DecimalArabicNumerals', ... 'PageLabelNewIndex: 3', ... 'PageLabelStart: 318', + ... 'PageLabelPrefix:', ... 'PageLabelNumStyle: DecimalArabicNumerals', ... 'PageLabelNewIndex: 4', ... 
]) @@ -137,7 +143,10 @@ class BookmarkedPDF (object): bookmark_info = {} bookmark_info_fields = ['title', 'level', 'page'] for line in data.splitlines(): - key,value = line.split(': ', 1) + try: + key,value = line.split(': ', 1) + except ValueError: # e.g. line == 'InfoBegin' + continue if key == 'NumberOfPages': pages = int(value) elif key.startswith('Bookmark'):