initial support for unicode literals in UTF-8
author: Stefan Behnel <scoder@users.berlios.de>
Wed, 1 Aug 2007 23:43:48 +0000 (01:43 +0200)
committer: Stefan Behnel <scoder@users.berlios.de>
Wed, 1 Aug 2007 23:43:48 +0000 (01:43 +0200)
Cython/Compiler/Lexicon.py
Cython/Compiler/ModuleNode.py
Cython/Compiler/Nodes.py
Cython/Compiler/Parsing.py
Cython/Compiler/PyrexTypes.py

index 676f2036e724a451760af3ad3a9be7ee70edf48e..89d7b43fddc5d2e84524731bd72246a2106b8040 100644 (file)
@@ -5,7 +5,7 @@
 #   to be rebuilt next time pyrexc is run.
 #
 
-string_prefixes = "cCrR"
+string_prefixes = "cCrRuU"
 
 def make_lexicon():
     from Cython.Plex import \
index 4fd310fcc112ea13ae5fae01db6e88876a1c1089..05945bb361afdbca24eb8fb20ff017337a38095d 100644 (file)
@@ -1071,13 +1071,16 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
                 "static __Pyx_StringTabEntry %s[] = {" %
                     Naming.stringtab_cname)
             for entry in entries:
+                print repr(entry.init), type(entry.init)
                 code.putln(
-                    "{&%s, %s, sizeof(%s)}," % (
+                    "{&%s, %s, sizeof(%s), %d}," % (
                         entry.pystring_cname,
                         entry.cname,
-                        entry.cname))
+                        entry.cname,
+                        isinstance(entry.init, unicode)
+                        ))
             code.putln(
-                "{0, 0, 0}")
+                "{0, 0, 0, 0}")
             code.putln(
                 "};")
     
index 9273012063b022805f81deeab6a679d26bb17aa3..19e62970d2ccdc2508cdbcf18841073223293681 100644 (file)
@@ -2600,7 +2600,7 @@ utility_function_predeclarations = \
 
 typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/
 typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/
-typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/
+typedef struct {PyObject **p; char *s; long n; int is_unicode;} __Pyx_StringTabEntry; /*proto*/
 
 #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
 static INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
@@ -3104,7 +3104,11 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/
 ""","""
 static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
     while (t->p) {
-        *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
         if (!*t->p)
             return -1;
         ++t;
index 9ff8c65b1f987fa4b78a1fa9494c0ae053fae6b3..cbe3126c52e0ab45697c9d6ea569bcda7b94026c 100644 (file)
@@ -493,7 +493,7 @@ def p_opt_string_literal(s):
 
 def p_string_literal(s):
     # A single string or char literal.
-    # Returns (kind, value) where kind in ('', 'c', 'r')
+    # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
     if s.sy == 'STRING':
         value = unquote(s.systring)
         s.next()
@@ -502,7 +502,7 @@ def p_string_literal(s):
     pos = s.position()
     #is_raw = s.systring[:1].lower() == "r"
     kind = s.systring[:1].lower()
-    if kind not in "cr":
+    if kind not in "cru":
         kind = ''
     chars = []
     while 1:
@@ -513,6 +513,8 @@ def p_string_literal(s):
             systr = s.systring
             if len(systr) == 1 and systr in "'\"\n":
                 chars.append('\\')
+            if kind == 'u' and not isinstance(systr, unicode):
+                systr = systr.decode("UTF-8")
             chars.append(systr)
         elif sy == 'ESCAPE':
             systr = s.systring
@@ -533,6 +535,8 @@ def p_string_literal(s):
                     chars.append('\\x0' + systr[2:])
                 elif c == '\n':
                     pass
+                elif c == 'u':
+                    chars.append(systr)
                 else:
                     chars.append(r'\\' + systr[1:])
         elif sy == 'NEWLINE':
@@ -546,7 +550,10 @@ def p_string_literal(s):
                 "Unexpected token %r:%r in string literal" %
                     (sy, s.systring))
     s.next()
-    value = join(chars, '')
+    if kind == 'u':
+        value = u''.join(chars)
+    else:
+        value = ''.join(chars)
     #print "p_string_literal: value =", repr(value) ###
     return kind, value
 
index b25fbc30fb884c3ec0930e60f7cbd9e8c5fc2f65..a7a6ca0ca9bc0c63ccbe29fb2f765f6b52e2bdf3 100644 (file)
@@ -705,6 +705,8 @@ class CStringType:
     from_py_function = "PyString_AsString"
 
     def literal_code(self, value):
+        if isinstance(value, unicode):
+            value = value.encode("UTF-8")
         return '"%s"' % value