GS = 'gs'
-def invoke(args, stdout=None):
+def invoke(args, stdout=None, encoding=None):
"""
>>> invoke(['echo', 'hi', 'there'])
- 'hi there\\n'
+ b'hi there\\n'
>>> invoke(['this command does not exist'])
Traceback (most recent call last):
...
- OSError: [Errno 2] No such file or directory
+ OSError: [Errno 2] No such file or directory: 'this command does not exist'
"""
P = _subprocess.PIPE
capture_stdout = stdout is None
stdout_,stderr_ = p.communicate()
status = p.wait()
assert status == 0, status
+ if encoding:
+ stdout_ = str(stdout_, encoding)
return stdout_
class BookmarkedPDF (object):
_UNICODE_REGEXP = _re.compile('&#([0-9]+);')
- def __init__(self, filename=None):
+ def __init__(self, filename=None, encoding='ascii'):
self.filename = filename
+ self.encoding = encoding
if self.filename:
self.get_bookmarks()
def get_bookmarks(self):
- data = invoke([PDFTK, self.filename, 'dump_data'])
+ data = invoke(
+ [PDFTK, self.filename, 'dump_data'], encoding=self.encoding)
self.pages,self.bookmarks = self._parse_dump_data(data)
@staticmethod
def _unicode_replace_match(match):
- return unichr(int(match.group(1)))
+ return chr(int(match.group(1)))
@classmethod
def _unicode_replace(self, string):
r"""
>>> BookmarkedPDF._unicode_replace('αβγ')
- u'\u03b1\u03b2\u03b3'
+ '\u03b1\u03b2\u03b3'
"""
return self._UNICODE_REGEXP.sub(self._unicode_replace_match, string)
>>> pages
123
>>> pprint(bookmarks) # doctest: +REPORT_UDIFF
- [{'level': 1, 'page': 1, 'title': u'Chapter 1'},
- {'level': 2, 'page': 2, 'title': u'Section 1.1'},
- {'level': 3, 'page': 3, 'title': u'Section 1.1.1'},
- {'level': 3, 'page': 4, 'title': u'Section 1.1.2'},
- {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
- {'level': 2, 'page': 5, 'title': u'Section 1.2'}]
+ [{'level': 1, 'page': 1, 'title': 'Chapter 1'},
+ {'level': 2, 'page': 2, 'title': 'Section 1.1'},
+ {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
+ {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
+ {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'},
+ {'level': 2, 'page': 5, 'title': 'Section 1.2'}]
"""
pages = None
bookmarks = []
elif k == 'title':
if self._UNICODE_REGEXP.search(value):
value = self._unicode_replace(value)
- else:
- value = unicode(value)
bookmark_info[k] = value
ready_for_bookmark = True
for field in bookmark_info_fields:
... {'level': 2, 'page': 2, 'title': 'Section 1.1'},
... {'level': 3, 'page': 3, 'title': 'Section 1.1.1'},
... {'level': 3, 'page': 4, 'title': 'Section 1.1.2'},
- ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
+ ... {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'},
... {'level': 2, 'page': 5, 'title': 'Section 1.2'}]),
... (100,
... [{'level': 1, 'page': 1, 'title': 'Chapter 2'},
... {'level': 2, 'page': 2, 'title': 'Section 2.1'},
... {'level': 3, 'page': 3, 'title': 'Section 2.1.1'},
... {'level': 3, 'page': 4, 'title': 'Section 2.1.2'},
- ... {'level': 4, 'page': 4, 'title': u'\u03b1\u03b2\u03b3'},
+ ... {'level': 4, 'page': 4, 'title': '\u03b1\u03b2\u03b3'},
... {'level': 2, 'page': 5, 'title': 'Section 2.2'}]),
... ]:
... pdf = BookmarkedPDF()
docinfo.append('/Author {}'.format(_pdfmark_unicode(author)))
if keywords:
docinfo.append('/Keywords {}'.format(_pdfmark_unicode(
- u', '.join(keywords))))
+ ', '.join(keywords))))
docinfo.append('/DOCINFO pdfmark')
pdfmarks.append('[ {}' .format('\n '.join(docinfo)))
bookmarks = []
return '\n'.join(pdfmarks)
-def _write_pdfmark_noop_file():
+def _write_pdfmark_noop_file(encoding='ascii'):
# By default, Ghostscript will preserve pdfmarks from the sources PDFs
fd,filename = _tempfile.mkstemp(prefix='pdfmark-noop-', text=True)
# Make `[... /OUT pdfmark` a no-op.
} loop
} def
-""")
+""".encode(encoding))
_os.close(fd)
return filename
def _write_pdfmark_restore_file(encoding='ascii'):
    """Write a temporary PostScript file that rebinds ``pdfmark``.

    The file holds a single line that redefines ``/pdfmark`` so that it
    calls ``originalpdfmark`` (presumably saved earlier by the matching
    no-op prologue file -- confirm against ``_write_pdfmark_noop_file``).

    :param encoding: codec used to turn the PostScript text into bytes.
    :returns: path of the newly created temporary file.  The file is not
        removed here; cleanup is left to the caller.
    :raises LookupError: if *encoding* names an unknown codec.  This is
        raised before any temporary file is created.
    """
    # Encode first so that a bad codec cannot leave a stray temp file behind.
    payload = '/pdfmark { originalpdfmark } bind def\n'.encode(encoding)
    fd, filename = _tempfile.mkstemp(prefix='pdfmark-restore-', text=True)
    try:
        _os.write(fd, payload)
    finally:
        # Always release the descriptor, even when the write fails.
        _os.close(fd)
    return filename
def _pdfmark_unicode(string):
    r"""Encode *string* as a PDF string object suitable for a pdfmark.

    Text that is pure ASCII is returned as a parenthesised literal
    string with backslash, parentheses, newline, carriage return and tab
    backslash-escaped.  Any other text is returned as a hex string
    (``<...>``) holding the UTF-16 big-endian byte-order mark followed
    by the UTF-16-BE encoding of the text, in upper-case hex digits.

    >>> _pdfmark_unicode('ascii text with ) paren')
    '(ascii text with \\) paren)'
    >>> _pdfmark_unicode('\u03b1\u03b2\u03b3')
    '<FEFF03B103B203B3>'
    """
    try:
        # Probe only: the encoded bytes are not needed, just whether the
        # text fits in ASCII.
        string.encode('ascii')
    except UnicodeEncodeError:
        payload = _codecs.BOM_UTF16_BE + string.encode('utf-16-be')
        return '<{}>'.format(
            ''.join('{:02X}'.format(byte) for byte in payload))
    # Escape special characters in a single pass.  A bare carriage return
    # inside a literal string is read back as a line feed, so it has to
    # be escaped as well to survive a round trip.
    escapes = str.maketrans({
        '\\': '\\\\',
        '(': '\\(',
        ')': '\\)',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
    })
    return '({})'.format(string.translate(escapes))
def _pdfmark_unicode_decode(string):
    r"""Decode a ``<FEFF...>`` PDF hex string back into text.

    *string* must start with ``<FEFF`` and end with ``>``; the hex digits
    in between are converted to bytes and decoded as UTF-16 big-endian.

    :raises AssertionError: if the ``<FEFF`` prefix or ``>`` suffix is
        missing.
    :raises ValueError: if the payload is not an even number of hex
        digits, or the resulting bytes are not valid UTF-16-BE
        (``UnicodeDecodeError`` is a ``ValueError`` subclass).

    >>> _pdfmark_unicode_decode(_pdfmark_unicode('\u03b1\u03b2\u03b3'))
    '\u03b1\u03b2\u03b3'
    """
    assert string.startswith('<FEFF'), string
    assert string.endswith('>'), string
    # Skip the five-character '<FEFF' prefix and the closing '>'.
    # bytes.fromhex rejects a dangling odd digit and non-hex input such
    # as '1.', neither of which should be accepted as a byte.
    payload = bytes.fromhex(string[5:-1])
    return payload.decode('utf-16-be')
-def _write_markfile(pdfmarks, pause_for_manual_tweaking=False):
+def _write_markfile(pdfmarks, pause_for_manual_tweaking=False,
+ encoding='ascii'):
fd,filename = _tempfile.mkstemp(prefix='pdfmarks-', text=True)
if pdfmarks:
- _os.write(fd, pdfmarks)
+ _os.write(fd, pdfmarks.encode(encoding))
_os.close(fd)
if pause_for_manual_tweaking:
print('edit {} as you see fit, and press enter when ready'.format(
_sys.stdin.readline()
return filename
-def merge_pdfs(inputs, output, pdfmarks=None, pause_for_manual_tweaking=False):
+def merge_pdfs(inputs, output, pdfmarks=None,
+ pause_for_manual_tweaking=False, encoding='ascii'):
args = [GS, '-dBATCH', '-dNOPAUSE', '-sDEVICE=pdfwrite']
if output:
args.append('-sOutputFile={}'.format(output))
else:
args.extend(['-sOutputFile=-', '-q'])
if pdfmarks:
- mark_noop = _write_pdfmark_noop_file()
+ mark_noop = _write_pdfmark_noop_file(encoding=encoding)
args.append(mark_noop)
args.extend([pdf.filename for pdf in inputs])
if pdfmarks:
- mark_restore = _write_pdfmark_restore_file()
+ mark_restore = _write_pdfmark_restore_file(
+ encoding=encoding)
args.append(mark_restore)
markfile = _write_markfile(
- pdfmarks=pdfmarks, pause_for_manual_tweaking=pause_for_manual_tweaking)
+ pdfmarks=pdfmarks, pause_for_manual_tweaking=pause_for_manual_tweaking,
+ encoding=encoding)
args.append(markfile)
print('preparing to execute: {}'.format(args))
invoke(args, stdout=_sys.stdout)
if __name__ == '__main__':
import argparse
+ encoding = _locale.getpreferredencoding(do_setlocale=True)
+
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('input', metavar='PDF', nargs='+',
help='an input PDF to merge')
'given and the file exists, no attempt will be '
'make to use pdftk to generate the mark file (I '
'assume your input file is what you want).'))
- parser.add_argument('--argv-encoding', dest='argv_encoding',
- help=('Optionally override the locale encoding for '
- 'your command line arguments.'))
parser.add_argument('--unicode', dest='convert_unicode_strings',
action='store_const', const=True,
- help=(u'instead of merging PDFs, convert '
- u'PDF-formatted unicode strings. For example '
- u"`--unicode '<FEFF03B103B203B3>' "
- u'\u03b1\u03b2\u03b3`'))
+ help=('instead of merging PDFs, convert '
+ 'PDF-formatted unicode strings. For example '
+ "`--unicode '<FEFF03B103B203B3>' "
+ '\u03b1\u03b2\u03b3`'))
args = parser.parse_args()
PDFTK = args.pdftk
GS = args.gs
- if args.argv_encoding:
- argv_encoding = args.argv_encoding
- else:
- argv_encoding = _locale.getpreferredencoding(do_setlocale=True)
-
if args.convert_unicode_strings:
for string in args.input:
if string.startswith('<FEFF'):
alt = _pdfmark_unicode_decode(string)
else:
- string = unicode(string, argv_encoding)
alt = _pdfmark_unicode(string)
- print(u'{} -> {}'.format(string, alt))
+ print('{} -> {}'.format(string, alt))
_sys.exit(0)
inputs = []
for filename in args.input:
inputs.append(BookmarkedPDF(filename))
- if args.title:
- title = unicode(args.title, argv_encoding)
- else:
- title = None
- if args.author:
- author = unicode(args.author, argv_encoding)
- else:
- author = None
- if args.keywords:
- keywords = [unicode(k, argv_encoding) for k in args.keywords]
- else:
- keywords = None
if args.pdfmarks and _os_path.isfile(args.pdfmarks):
- pdfmarks = open(args.pdfmarks, 'r').read()
+ pdfmarks = _codecs.open(args.pdfmarks, 'r', encoding).read()
else:
pdfmarks = generate_pdfmarks(
- inputs, title=title, author=author, keywords=keywords)
+ inputs, title=args.title, author=args.author,
+ keywords=args.keywords)
if args.pdfmarks:
- open(args.pdfmarks, 'w').write(pdfmarks)
+ _codecs.open(args.pdfmarks, 'w', encoding).write(pdfmarks)
_sys.exit(0)
merge_pdfs(inputs=inputs, pdfmarks=pdfmarks, output=args.output,
- pause_for_manual_tweaking=args.pause_for_manual_tweaking)
+ pause_for_manual_tweaking=args.pause_for_manual_tweaking,
+ encoding=encoding)