support for long unicode escapes ('\U...')

author Stefan Behnel <scoder@users.berlios.de>

Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)

committer Stefan Behnel <scoder@users.berlios.de>

Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
author Stefan Behnel <scoder@users.berlios.de>
Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py

index dade469e72fb278feca9103a6ba8de861c8401d8..e3f1be9ad1d8165b139fbad0069b4511bc72d98c 100644 (file)
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -64,7 +64,8 @@ def make_lexicon():
      two_hex = hexdigit + hexdigit
      four_hex = two_hex + two_hex
      escapeseq = Str("\\") + (two_oct | three_oct | two_hex |
-                             Str('u') + four_hex | Str('x') + two_hex | AnyChar)
+                             Str('u') + four_hex | Str('x') + two_hex |
+                             Str('U') + four_hex + four_hex | AnyChar)
      
      deco = Str("@")
      bra = Any("([{")
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py

index 1f73389315fabed33c1f45711543d3e87e8bc5ac..298fc092f33e950154682d09920ef31d75b27994 100644 (file)
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -2,7 +2,9 @@
  #   Pyrex Parser
  #
  
-import os, re
+import os
+import re
+import sys
  from types import ListType, TupleType
  from Scanning import PyrexScanner, FileSourceDescriptor
  import Nodes
@@ -604,18 +606,17 @@ def p_string_literal(s):
                      chars.append(systr)
                  elif c == '\n':
                      pass
-                elif c in 'ux':
-                    if kind == 'u':
-                        try:
-                            chars.append(
-                                systr.encode("ASCII").decode('unicode_escape'))
-                        except UnicodeDecodeError:
+                elif c in 'Uux':
+                    if kind == 'u' or c == 'x':
+                        chrval = int(systr[2:], 16)
+                        if chrval > sys.maxunicode:
                              s.error("Invalid unicode escape '%s'" % systr,
                                      pos = pos)
-                    elif c == 'x':
-                        chars.append('\\x0' + systr[2:])
+                        strval = unichr(chrval)
                      else:
-                        chars.append(systr)
+                        # unicode escapes in plain byte strings are not unescaped
+                        strval = systr
+                    chars.append(strval.replace('\\', '\\\\'))
                  else:
                      chars.append(r'\\' + systr[1:])
          elif sy == 'NEWLINE':
diff --git a/tests/run/strescapes.pyx b/tests/run/strescapes.pyx

new file mode 100644 (file)

index 0000000..a88df30
--- /dev/null
+++ b/tests/run/strescapes.pyx
@@ -0,0 +1,34 @@
+__doc__ = u"""
+
+>>> py_strings = [
+... '\\x1234',
+... '\\x0A12\\x0C34',
+... '\\x0A57',
+... 'abc\\x12def',
+... u'\\u1234',
+... u'\\U00041234',
+... b'\\u1234',
+... b'\\U00041234',
+... ]
+
+>>> for i, (py_string, c_string) in enumerate(zip(py_strings, c_strings)):
+...     assert py_string == c_string, "%d: %r != %r" % (i, py_string, c_string)
+
+"""
+
+import sys
+if sys.version_info[0] < 3:
+    __doc__ = __doc__.replace(" b'", " '")
+else:
+    __doc__ = __doc__.replace(" u'", " '")
+
+c_strings = [
+'\x1234',
+'\x0A12\x0C34',
+'\x0A57',
+'abc\x12def',
+u'\u1234',
+u'\U00041234',
+b'\u1234',
+b'\U00041234',
+]
author	Stefan Behnel <scoder@users.berlios.de>
	Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
committer	Stefan Behnel <scoder@users.berlios.de>
	Sun, 10 Aug 2008 18:37:12 +0000 (20:37 +0200)
Cython/Compiler/Lexicon.py		patch | blob | history
Cython/Compiler/Parsing.py		patch | blob | history
tests/run/strescapes.pyx	[new file with mode: 0644]	patch | blob