support for long unicode escapes ('\U...')
author Stefan Behnel <scoder@users.berlios.de>
Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
fixed unicode escape handling in byte strings
unescape \xXY in string literals at parse time, because C lets a \x escape absorb any hex digits that follow it - output string escaping will do the right thing

Cython/Compiler/Lexicon.py
Cython/Compiler/Parsing.py
tests/run/strescapes.pyx [new file with mode: 0644]

index dade469e72fb278feca9103a6ba8de861c8401d8..e3f1be9ad1d8165b139fbad0069b4511bc72d98c 100644 (file)
@@ -64,7 +64,8 @@ def make_lexicon():
     two_hex = hexdigit + hexdigit
     four_hex = two_hex + two_hex
     escapeseq = Str("\\") + (two_oct | three_oct | two_hex |
-                             Str('u') + four_hex | Str('x') + two_hex | AnyChar)
+                             Str('u') + four_hex | Str('x') + two_hex |
+                             Str('U') + four_hex + four_hex | AnyChar)
     
     deco = Str("@")
     bra = Any("([{")
index 1f73389315fabed33c1f45711543d3e87e8bc5ac..298fc092f33e950154682d09920ef31d75b27994 100644 (file)
@@ -2,7 +2,9 @@
 #   Pyrex Parser
 #
 
-import os, re
+import os
+import re
+import sys
 from types import ListType, TupleType
 from Scanning import PyrexScanner, FileSourceDescriptor
 import Nodes
@@ -604,18 +606,17 @@ def p_string_literal(s):
                     chars.append(systr)
                 elif c == '\n':
                     pass
-                elif c in 'ux':
-                    if kind == 'u':
-                        try:
-                            chars.append(
-                                systr.encode("ASCII").decode('unicode_escape'))
-                        except UnicodeDecodeError:
+                elif c in 'Uux':
+                    if kind == 'u' or c == 'x':
+                        chrval = int(systr[2:], 16)
+                        if chrval > sys.maxunicode:
                             s.error("Invalid unicode escape '%s'" % systr,
                                     pos = pos)
-                    elif c == 'x':
-                        chars.append('\\x0' + systr[2:])
+                        strval = unichr(chrval)
                     else:
-                        chars.append(systr)
+                        # unicode escapes in plain byte strings are not unescaped
+                        strval = systr
+                    chars.append(strval.replace('\\', '\\\\'))
                 else:
                     chars.append(r'\\' + systr[1:])
         elif sy == 'NEWLINE':
diff --git a/tests/run/strescapes.pyx b/tests/run/strescapes.pyx
new file mode 100644 (file)
index 0000000..a88df30
--- /dev/null
@@ -0,0 +1,34 @@
+__doc__ = u"""
+
+>>> py_strings = [
+... '\\x1234',
+... '\\x0A12\\x0C34',
+... '\\x0A57',
+... 'abc\\x12def',
+... u'\\u1234',
+... u'\\U00041234',
+... b'\\u1234',
+... b'\\U00041234',
+... ]
+
+>>> for i, (py_string, c_string) in enumerate(zip(py_strings, c_strings)):
+...     assert py_string == c_string, "%d: %r != %r" % (i, py_string, c_string)
+
+"""
+
+import sys
+if sys.version_info[0] < 3:
+    __doc__ = __doc__.replace(" b'", " '")
+else:
+    __doc__ = __doc__.replace(" u'", " '")
+
+c_strings = [
+'\x1234',
+'\x0A12\x0C34',
+'\x0A57',
+'abc\x12def',
+u'\u1234',
+u'\U00041234',
+b'\u1234',
+b'\U00041234',
+]