From 8ff215a148d6a3b327a8fc821d8a7236f6855d80 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 7 Feb 2012 20:51:09 -0500 Subject: [PATCH] Handle Unicode strings in pdf-merge.py. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit For information on Unicode strings in PDFs, see `§7.3.4 String Objects` and `§7.9.2.2 Text String Type` in the PDF reference [1] and `Table 2.3 (p21)`, `Table 2.5 (p25)`, etc. in the pdfmark reference [2]. Note that there are Ghostscript bugs [3] that can lead to errors like: Entity: line 5: parser error : xmlParseCharRef: invalid xmlChar value 1 >> BookmarkedPDF._unicode_replace('αβγ') + u'\u03b1\u03b2\u03b3' """ + return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string) + + @classmethod + def _parse_dump_data(self, data): + r""" >>> from pprint import pprint - >>> data = '\\n'.join([ + >>> data = '\n'.join([ ... 'InfoKey: CreationDate', ... 'InfoValue: D:20080502020302Z', ... 'NumberOfPages: 123', @@ -83,6 +100,9 @@ class BookmarkedPDF (object): ... 'BookmarkTitle: Section 1.1.2', ... 'BookmarkLevel: 3', ... 'BookmarkPageNumber: 4', + ... 'BookmarkTitle: αβγ�', + ... 'BookmarkLevel: 4', + ... 'BookmarkPageNumber: 4', ... 'BookmarkTitle: Section 1.2', ... 'BookmarkLevel: 2', ... 'BookmarkPageNumber: 5', @@ -100,12 +120,13 @@ class BookmarkedPDF (object): >>> pages,bookmarks = BookmarkedPDF._parse_dump_data(data) >>> pages 123 - >>> pprint(bookmarks) - [{'level': 1, 'page': 1, 'title': 'Chapter 1'}, - {'level': 2, 'page': 2, 'title': 'Section 1.1'}, - {'level': 3, 'page': 3, 'title': 'Section 1.1.1'}, - {'level': 3, 'page': 4, 'title': 'Section 1.1.2'}, - {'level': 2, 'page': 5, 'title': 'Section 1.2'}] + >>> pprint(bookmarks) # doctest: +REPORT_UDIFF + [{'level': 1, 'page': 1, 'title': u'Chapter 1'}, + {'level': 2, 'page': 2, 'title': u'Section 1.1'}, + {'level': 3, 'page': 3, 'title': u'Section 1.1.1'}, + {'level': 3, 'page': 4, 'title': u'Section 1.1.2'}, + {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, + {'level': 2, 'page': 5, 'title': u'Section 1.2'}] """ pages = None bookmarks = [] @@ -121,6 +142,11 @@ class BookmarkedPDF (object): if k == 'pagenumber': k = 'page' value = int(value) + elif k == 'title': + if self._UNICODE_REGEXP.search(value): + value = self._unicode_replace(value[:-len('�')]) + else: + value = unicode(value) bookmark_info[k] = value ready_for_bookmark = True for field in bookmark_info_fields: @@ -132,9 +158,8 @@ class BookmarkedPDF (object): bookmark_info = {} return (pages, bookmarks) - def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): - """ + r""" >>> inputs = [] >>> for pages,bookmarks in [ ... (1, @@ -144,12 +169,14 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): ... {'level': 2, 'page': 2, 'title': 'Section 1.1'}, ... {'level': 3, 'page': 3, 'title': 'Section 1.1.1'}, ... {'level': 3, 'page': 4, 'title': 'Section 1.1.2'}, + ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, ... {'level': 2, 'page': 5, 'title': 'Section 1.2'}]), ... (100, ... [{'level': 1, 'page': 1, 'title': 'Chapter 2'}, ... {'level': 2, 'page': 2, 'title': 'Section 2.1'}, ... {'level': 3, 'page': 3, 'title': 'Section 2.1.1'}, ... {'level': 3, 'page': 4, 'title': 'Section 2.1.2'}, + ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, ... {'level': 2, 'page': 5, 'title': 'Section 2.2'}]), ... ]: ... pdf = BookmarkedPDF() @@ -158,32 +185,36 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): ... inputs.append(pdf) >>> print(generate_pdfmarks(inputs=inputs, title='My Book', ... author='Myself', keywords=['fun', 'witty', 'interesting'])) + ... # doctest: +REPORT_UDIFF [ /Title (My Book) /Author (Myself) /Keywords (fun, witty, interesting) /DOCINFO pdfmark - [ /Title (Table of Contents) /Page 1 [/XYZ null null null] /OUT pdfmark - [ /Title (Chapter 1) /Page 2 [/XYZ null null null] /Count -2 /OUT pdfmark - [ /Title (Section 1.1) /Page 3 [/XYZ null null null] /Count -2 /OUT pdfmark - [ /Title (Section 1.1.1) /Page 4 [/XYZ null null null] /OUT pdfmark - [ /Title (Section 1.1.2) /Page 5 [/XYZ null null null] /OUT pdfmark - [ /Title (Section 1.2) /Page 6 [/XYZ null null null] /OUT pdfmark - [ /Title (Chapter 2) /Page 102 [/XYZ null null null] /Count -2 /OUT pdfmark - [ /Title (Section 2.1) /Page 103 [/XYZ null null null] /Count -2 /OUT pdfmark - [ /Title (Section 2.1.1) /Page 104 [/XYZ null null null] /OUT pdfmark - [ /Title (Section 2.1.2) /Page 105 [/XYZ null null null] /OUT pdfmark - [ /Title (Section 2.2) /Page 106 [/XYZ null null null] /OUT pdfmark + [ /Title (Table of Contents) /Page 1 /OUT pdfmark + [ /Title (Chapter 1) /Page 2 /Count -2 /OUT pdfmark + [ /Title (Section 1.1) /Page 3 /Count -2 /OUT pdfmark + [ /Title (Section 1.1.1) /Page 4 /OUT pdfmark + [ /Title (Section 1.1.2) /Page 5 /Count -1 /OUT pdfmark + [ /Title /Page 5 /OUT pdfmark + [ /Title (Section 1.2) /Page 6 /OUT pdfmark + [ /Title (Chapter 2) /Page 102 /Count -2 /OUT pdfmark + [ /Title (Section 2.1) /Page 103 /Count -2 /OUT pdfmark + [ /Title (Section 2.1.1) /Page 104 /OUT pdfmark + [ /Title (Section 2.1.2) /Page 105 /Count -1 /OUT pdfmark + [ /Title /Page 105 /OUT pdfmark + [ /Title (Section 2.2) /Page 106 /OUT pdfmark """ pdfmarks = [] if title or author or keywords: docinfo = [] if title: - docinfo.append('/Title ({})'.format(title)) + docinfo.append('/Title {}'.format(_pdfmark_unicode(title))) if author: - docinfo.append('/Author ({})'.format(author)) + docinfo.append('/Author {}'.format(_pdfmark_unicode(author))) if keywords: - docinfo.append('/Keywords ({})'.format(', '.join(keywords))) + docinfo.append('/Keywords {}'.format(_pdfmark_unicode( + u', '.join(keywords)))) docinfo.append('/DOCINFO pdfmark') pdfmarks.append('[ {}' .format('\n '.join(docinfo))) bookmarks = [] @@ -196,7 +227,7 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): startpage += pdf.pages for i,bookmark in enumerate(bookmarks): attributes = [ - '/Title ({})'.format(bookmark['title']), + '/Title {}'.format(_pdfmark_unicode(bookmark['title'])), '/Page {}'.format(bookmark['page']), #'[/XYZ null null null]', # preserve page zoom and viewport ] @@ -254,6 +285,36 @@ def _write_pdfmark_restore_file(): _os.close(fd) return filename +def _pdfmark_unicode(string): + r""" + >>> _pdfmark_unicode(u'ascii text with ) paren') + '(ascii text with \\) paren)' + >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3') + '' + """ + try: + ascii = string.encode('ascii') + except UnicodeEncodeError: + b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be') + return '<{}>'.format(''.join('{:02X}'.format(ord(byte)) for byte in b)) + else: + # escape special characters + for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'), + (u'\n', u'\\n'), (u'\t', u'\\t')]: + string = string.replace(a, b) + return '({})'.format(string) + +def _pdfmark_unicode_decode(string): + r""" + >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3')) + u'\u03b1\u03b2\u03b3' + """ + assert string.startswith(''), string + b = ''.join(chr(int(float.fromhex(x1+x2))) + for x1,x2 in zip(string[5:-2:2], string[6:-1:2])) + return unicode(b, 'utf-16-be') + def _write_markfile(pdfmarks, pause_for_manual_tweaking=False): fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True) if pdfmarks: @@ -298,28 +359,64 @@ if __name__ == '__main__': parser.add_argument('--ask', dest='pause_for_manual_tweaking', action='store_const', const=True, help='pause for manual pdfmark tweaking') - parser.add_argument('--output', dest='output', + parser.add_argument('--output', dest='output', default='output.pdf', help='name of the output PDF') parser.add_argument('--title', dest='title', help='title of output PDF') parser.add_argument('--author', dest='author', help='author of output PDF') - parser.add_argument('--keywords', dest='keywords', + parser.add_argument('--keyword', dest='keywords', action='append', help='keywords for the output PDF') parser.add_argument('--pdftk', dest='pdftk', default=PDFTK, help='path to the pdftk executable') parser.add_argument('--gs', dest='gs', default=GS, help='path to the gs (Ghostscript) executable') + parser.add_argument('--argv-encoding', dest='argv_encoding', + help=('Optionally override the locale encoding for ' + 'your command line arguments.')) + parser.add_argument('--unicode', dest='convert_unicode_strings', + action='store_const', const=True, + help=(u'instead of merging PDFs, convert ' + u'PDF-formatted unicode strings. For example ' + u"`--unicode '' " + u'\u03b1\u03b2\u03b3`')) args = parser.parse_args() PDFTK = args.pdftk GS = args.gs + if args.argv_encoding: + argv_encoding = args.argv_encoding + else: + argv_encoding = _locale.getpreferredencoding(do_setlocale=True) + + if args.convert_unicode_strings: + for string in args.input: + if string.startswith(' {}'.format(string, alt)) + _sys.exit(0) + inputs = [] for filename in args.input: inputs.append(BookmarkedPDF(filename)) + if args.title: + title = unicode(args.title, argv_encoding) + else: + title = None + if args.author: + author = unicode(args.author, argv_encoding) + else: + author = None + if args.keywords: + keywords = [unicode(k, argv_encoding) for k in args.keywords] + else: + keywords = None pdfmarks = generate_pdfmarks( - inputs, title=args.title, author=args.author, keywords=args.keywords) + inputs, title=title, author=author, keywords=keywords) merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output, pause_for_manual_tweaking=args.pause_for_manual_tweaking) -- 2.26.2