From: W. Trevor King Date: Tue, 29 Jan 2013 18:39:40 +0000 (-0500) Subject: pdf-merge.py: Upgrade to Python 3.x X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=77a9ba6;p=mw2txt.git pdf-merge.py: Upgrade to Python 3.x This probably breaks Python 2.x support. If you can't upgrade your Python implementation, you'll have to stick to older versions of this script. --- diff --git a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py index 1d05acc..147cd75 100755 --- a/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py +++ b/posts/PDF_bookmarks_with_Ghostscript/pdf-merge.py @@ -39,14 +39,14 @@ PDFTK = 'pdftk' GS = 'gs' -def invoke(args, stdout=None): +def invoke(args, stdout=None, encoding=None): """ >>> invoke(['echo', 'hi', 'there']) - 'hi there\\n' + b'hi there\\n' >>> invoke(['this command does not exist']) Traceback (most recent call last): ... - OSError: [Errno 2] No such file or directory + OSError: [Errno 2] No such file or directory: 'this command does not exist' """ P = _subprocess.PIPE capture_stdout = stdout is None @@ -57,30 +57,34 @@ def invoke(args, stdout=None): stdout_,stderr_ = p.communicate() status = p.wait() assert status == 0, status + if encoding: + stdout_ = str(stdout_, encoding) return stdout_ class BookmarkedPDF (object): _UNICODE_REGEXP = _re.compile('&#([0-9]+);') - def __init__(self, filename=None): + def __init__(self, filename=None, encoding='ascii'): self.filename = filename + self.encoding = encoding if self.filename: self.get_bookmarks() def get_bookmarks(self): - data = invoke([PDFTK, self.filename, 'dump_data']) + data = invoke( + [PDFTK, self.filename, 'dump_data'], encoding=self.encoding) self.pages,self.bookmarks = self._parse_dump_data(data) @staticmethod def _unicode_replace_match(match): - return unichr(int(match.group(1))) + return chr(int(match.group(1))) @classmethod def _unicode_replace(self, string): r""" >>> BookmarkedPDF._unicode_replace('αβγ') - u'\u03b1\u03b2\u03b3' + '\u03b1\u03b2\u03b3' """ return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string) @@ -131,12 +135,12 @@ class BookmarkedPDF (object): >>> pages 123 >>> pprint(bookmarks) # doctest: +REPORT_UDIFF - [{'level': 1, 'page': 1, 'title': u'Chapter 1'}, - {'level': 2, 'page': 2, 'title': u'Section 1.1'}, - {'level': 3, 'page': 3, 'title': u'Section 1.1.1'}, - {'level': 3, 'page': 4, 'title': u'Section 1.1.2'}, - {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, - {'level': 2, 'page': 5, 'title': u'Section 1.2'}] + [{'level': 1, 'page': 1, 'title': 'Chapter 1'}, + {'level': 2, 'page': 2, 'title': 'Section 1.1'}, + {'level': 3, 'page': 3, 'title': 'Section 1.1.1'}, + {'level': 3, 'page': 4, 'title': 'Section 1.1.2'}, + {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'}, + {'level': 2, 'page': 5, 'title': 'Section 1.2'}] """ pages = None bookmarks = [] @@ -158,8 +162,6 @@ class BookmarkedPDF (object): elif k == 'title': if self._UNICODE_REGEXP.search(value): value = self._unicode_replace(value) - else: - value = unicode(value) bookmark_info[k] = value ready_for_bookmark = True for field in bookmark_info_fields: @@ -182,14 +184,14 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): ... {'level': 2, 'page': 2, 'title': 'Section 1.1'}, ... {'level': 3, 'page': 3, 'title': 'Section 1.1.1'}, ... {'level': 3, 'page': 4, 'title': 'Section 1.1.2'}, - ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, + ... {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'}, ... {'level': 2, 'page': 5, 'title': 'Section 1.2'}]), ... (100, ... [{'level': 1, 'page': 1, 'title': 'Chapter 2'}, ... {'level': 2, 'page': 2, 'title': 'Section 2.1'}, ... {'level': 3, 'page': 3, 'title': 'Section 2.1.1'}, ... {'level': 3, 'page': 4, 'title': 'Section 2.1.2'}, - ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'}, + ... {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'}, ... {'level': 2, 'page': 5, 'title': 'Section 2.2'}]), ... ]: ... pdf = BookmarkedPDF() @@ -227,7 +229,7 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): docinfo.append('/Author {}'.format(_pdfmark_unicode(author))) if keywords: docinfo.append('/Keywords {}'.format(_pdfmark_unicode( - u', '.join(keywords)))) + ', '.join(keywords)))) docinfo.append('/DOCINFO pdfmark') pdfmarks.append('[ {}' .format('\n '.join(docinfo))) bookmarks = [] @@ -257,7 +259,7 @@ def generate_pdfmarks(inputs=(), title=None, author=None, keywords=None): return '\n'.join(pdfmarks) -def _write_pdfmark_noop_file(): +def _write_pdfmark_noop_file(encoding='ascii'): # By default, Ghostscript will preserve pdfmarks from the sources PDFs fd,filename = _tempfile.mkstemp(prefix='pdfmark-noop-', text=True) # Make `[... /OUT pdfmark` a no-op. @@ -287,51 +289,52 @@ def _write_pdfmark_noop_file(): } loop } def -""") +""".encode(encoding)) _os.close(fd) return filename -def _write_pdfmark_restore_file(): +def _write_pdfmark_restore_file(encoding='ascii'): fd,filename = _tempfile.mkstemp(prefix='pdfmark-restore-', text=True) # Restore the default `[... /Out pdfmark` behaviour - _os.write(fd, '/pdfmark { originalpdfmark } bind def\n') + _os.write(fd, '/pdfmark { originalpdfmark } bind def\n'.encode(encoding)) _os.close(fd) return filename def _pdfmark_unicode(string): r""" - >>> _pdfmark_unicode(u'ascii text with ) paren') + >>> _pdfmark_unicode('ascii text with ) paren') '(ascii text with \\) paren)' - >>> _pdfmark_unicode(u'\u03b1\u03b2\u03b3') + >>> _pdfmark_unicode('\u03b1\u03b2\u03b3') '' """ try: ascii = string.encode('ascii') except UnicodeEncodeError: b = _codecs.BOM_UTF16_BE + string.encode('utf-16-be') - return '<{}>'.format(''.join('{:02X}'.format(ord(byte)) for byte in b)) + return '<{}>'.format(''.join('{:02X}'.format(byte) for byte in b)) else: # escape special characters - for a,b in [(u'\\', u'\\\\'), (u'(', u'\\('), (u')', u'\\)'), - (u'\n', u'\\n'), (u'\t', u'\\t')]: + for a,b in [('\\', '\\\\'), ('(', '\\('), (')', '\\)'), + ('\n', '\\n'), ('\t', '\\t')]: string = string.replace(a, b) return '({})'.format(string) def _pdfmark_unicode_decode(string): r""" - >>> _pdfmark_unicode_decode(_pdfmark_unicode(u'\u03b1\u03b2\u03b3')) - u'\u03b1\u03b2\u03b3' + >>> _pdfmark_unicode_decode(_pdfmark_unicode('\u03b1\u03b2\u03b3')) + '\u03b1\u03b2\u03b3' """ assert string.startswith(''), string - b = ''.join(chr(int(float.fromhex(x1+x2))) - for x1,x2 in zip(string[5:-2:2], string[6:-1:2])) - return unicode(b, 'utf-16-be') + b = bytes(int(float.fromhex(x1+x2)) + for x1,x2 in zip(string[5:-2:2], string[6:-1:2])) + return str(b, 'utf-16-be') -def _write_markfile(pdfmarks, pause_for_manual_tweaking=False): +def _write_markfile(pdfmarks, pause_for_manual_tweaking=False, + encoding='ascii'): fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True) if pdfmarks: - _os.write(fd, pdfmarks) + _os.write(fd, pdfmarks.encode(encoding)) _os.close(fd) if pause_for_manual_tweaking: print('edit {} as you see fit, and press enter when ready'.format( @@ -339,21 +342,24 @@ def _write_markfile(pdfmarks, pause_for_manual_tweaking=False): _sys.stdin.readline() return filename -def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False): +def merge_pdfs(inputs, output, pdfmarks=None, + pause_for_manual_tweaking=False, encoding='ascii'): args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite'] if output: args.append('-sOutputFile={}'.format(output)) else: args.extend(['-sOutputFile=-', '-q']) if pdfmarks: - mark_noop = _write_pdfmark_noop_file() + mark_noop = _write_pdfmark_noop_file(encoding=encoding) args.append(mark_noop) args.extend([pdf.filename for pdf in inputs]) if pdfmarks: - mark_restore = _write_pdfmark_restore_file() + mark_restore = _write_pdfmark_restore_file( + encoding=encoding) args.append(mark_restore) markfile = _write_markfile( - pdfmarks=pdfmarks, pause_for_manual_tweaking=pause_for_manual_tweaking) + pdfmarks=pdfmarks, pause_for_manual_tweaking=pause_for_manual_tweaking, + encoding=encoding) args.append(markfile) print('preparing to execute: {}'.format(args)) invoke(args, stdout=_sys.stdout) @@ -366,6 +372,8 @@ def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False): if __name__ == '__main__': import argparse + encoding = _locale.getpreferredencoding(do_setlocale=True) + parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('input', metavar='PDF', nargs='+', help='an input PDF to merge') @@ -396,58 +404,39 @@ if __name__ == '__main__': 'given and the file exists, no attempt will be ' 'make to use pdftk to generate the mark file (I ' 'assume your input file is what you want).')) - parser.add_argument('--argv-encoding', dest='argv_encoding', - help=('Optionally override the locale encoding for ' - 'your command line arguments.')) parser.add_argument('--unicode', dest='convert_unicode_strings', action='store_const', const=True, - help=(u'instead of merging PDFs, convert ' - u'PDF-formatted unicode strings. For example ' - u"`--unicode '' " - u'\u03b1\u03b2\u03b3`')) + help=('instead of merging PDFs, convert ' + 'PDF-formatted unicode strings. For example ' + "`--unicode '' " + '\u03b1\u03b2\u03b3`')) args = parser.parse_args() PDFTK = args.pdftk GS = args.gs - if args.argv_encoding: - argv_encoding = args.argv_encoding - else: - argv_encoding = _locale.getpreferredencoding(do_setlocale=True) - if args.convert_unicode_strings: for string in args.input: if string.startswith(' {}'.format(string, alt)) + print('{} -> {}'.format(string, alt)) _sys.exit(0) inputs = [] for filename in args.input: inputs.append(BookmarkedPDF(filename)) - if args.title: - title = unicode(args.title, argv_encoding) - else: - title = None - if args.author: - author = unicode(args.author, argv_encoding) - else: - author = None - if args.keywords: - keywords = [unicode(k, argv_encoding) for k in args.keywords] - else: - keywords = None if args.pdfmarks and _os_path.isfile(args.pdfmarks): - pdfmarks = open(args.pdfmarks, 'r').read() + pdfmarks = _codecs.open(args.pdfmarks, 'r', encoding).read() else: pdfmarks = generate_pdfmarks( - inputs, title=title, author=author, keywords=keywords) + inputs, title=args.title, author=args.author, + keywords=args.keywords) if args.pdfmarks: - open(args.pdfmarks, 'w').write(pdfmarks) + _codecs.open(args.pdfmarks, 'w', encoding).write(pdfmarks) _sys.exit(0) merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output, - pause_for_manual_tweaking=args.pause_for_manual_tweaking) + pause_for_manual_tweaking=args.pause_for_manual_tweaking, + encoding=encoding)