From: Stefan Behnel
Date: Mon, 11 Aug 2008 06:38:27 +0000 (+0200)
Subject: escape C digraphs, trigraphs and other special characters in strings
X-Git-Tag: 0.9.8.1~66
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=39389bdc6b5c18a807e0fe0989b161b6063e1f01;p=cython.git

escape C digraphs, trigraphs and other special characters in strings
---

diff --git a/Cython/Utils.py b/Cython/Utils.py
index 4ab111b2..0a85c7fc 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -99,8 +99,26 @@ class EncodedString(unicode):
 # return unicode.__eq__(self, other) and \
 # getattr(other, 'encoding', '') == self.encoding
 
+def _to_oct_sequence(s):
+    return ''.join(['\\%03o' % ord(c) for c in s])
+
+_c_special = ('\0', '??', '<:', ':>', '<%', '%>', '%:', '%:')
+_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special))
+
+def _build_special_test():
+    subexps = []
+    for special in _c_special + ('\n','\r','\t'):
+        regexp = ''.join(['[%s]' % c for c in special ])
+        subexps.append(regexp)
+    return re.compile('(' + '|'.join(subexps) + ')').search
+
+_has_specials = _build_special_test()
+
 def escape_byte_string(s):
-    s = s.replace('\0', r'\000').replace('\x0A', r'\012').replace('\x0C', r'\014')
+    if _has_specials(s):
+        s = s.replace('\n', r'\n').replace('\r', r'\r').replace('\t', r'\t')
+        for special, replacement in _c_special_replacements:
+            s = s.replace(special, replacement)
     try:
         s.decode("ASCII")
         return s
diff --git a/tests/run/strescapes.pyx b/tests/run/strescapes.pyx
index 51eebfb7..b5d99eed 100644
--- a/tests/run/strescapes.pyx
+++ b/tests/run/strescapes.pyx
@@ -6,13 +6,23 @@ __doc__ = u"""
 ... b'\\x0A57',
 ... b'abc\\x12def',
 ... u'\\u1234',
-... u'\\U00041234',
+... u'\\U00001234',
 ... b'\\u1234',
-... b'\\U00041234',
+... b'\\U00001234',
+... b'\\n\\r\\t',
+... b':>',
+... b'??>',
+... b'\\0\\0\\0',
 ... ]
 
->>> for i, (py_string, c_string) in enumerate(zip(py_strings, c_strings)):
+>>> for i, (py_string, (c_string, length)) in enumerate(zip(py_strings, c_strings)):
 ...     assert py_string == c_string, "%d: %r != %r" % (i, py_string, c_string)
+...     assert len(py_string) == length, (
+...         "%d: wrong length of %r, got %d, expected %d" % (
+...             i, py_string, len(py_string), length))
+...     assert len(c_string) == length, (
+...         "%d: wrong length of %r, got %d, expected %d" % (
+...             i, c_string, len(c_string), length))
 
 """
 
@@ -23,12 +33,16 @@ else:
     __doc__ = __doc__.replace(u" u'", u" '")
 
 c_strings = [
-b'\x1234',
-b'\x0A12\x0C34',
-b'\x0A57',
-b'abc\x12def',
-u'\u1234',
-u'\U00041234',
-b'\u1234',
-b'\U00041234',
+(b'\x1234', 3),
+(b'\x0A12\x0C34', 6),
+(b'\x0A57', 3),
+(b'abc\x12def', 7),
+(u'\u1234', 1),
+(u'\U00001234', 1),
+(b'\u1234', 6),
+(b'\U00001234', 10),
+(b'\n\r\t', 3),
+(b':>', 2),
+(b'??>', 3),
+(b'\0\0\0', 3),
 ]