use a dedicated UnicodeType and UnicodeNode to represent unicode literals
author Stefan Behnel <scoder@users.berlios.de>
Tue, 12 Aug 2008 20:10:54 +0000 (22:10 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Tue, 12 Aug 2008 20:10:54 +0000 (22:10 +0200)
fixes the unicode literal indexing problem (only for unicode strings, not for byte strings!)

Cython/Compiler/ExprNodes.py
Cython/Compiler/Parsing.py
Cython/Compiler/PyrexTypes.py
Cython/Compiler/Symtab.py

index b833824c932fef24507bd60c7a0b04265e0bc0b1..ce345460623fa2da6b84565ed15a5e7a3322d8f0 100644 (file)
@@ -738,6 +738,29 @@ class StringNode(ConstNode):
             return self.entry.cname
 
 
+class UnicodeNode(PyConstNode):
+    #  Unicode string literal (built by the parser for kind == 'u').
+    #  entry   Symtab.Entry   string constant entry, set by analyse_types
+    type = PyrexTypes.c_unicode_type
+
+    def analyse_types(self, env):
+        self.entry = env.add_string_const(self.value)
+        env.add_py_string(self.entry)
+
+    def calculate_result_code(self):
+        return self.entry.pystring_cname
+
+    def _coerce_to(self, dst_type, env):
+        # NOTE(review): Python object targets get self back unchanged, so no
+        # type test node is added for extension types - confirm this is intended.
+        if dst_type.is_pyobject:
+            return self
+        # Coerce through a StringNode sharing this node's constant entry; it
+        # must be read from self.entry, a bare 'entry' is undefined here.
+        node = StringNode(self.pos, entry = self.entry, type = py_object_type)
+        return ConstNode.coerce_to(node, dst_type, env)
+
+
 class IdentifierStringNode(ConstNode):
     # A Python string that behaves like an identifier, e.g. for
     # keyword arguments in a call, or for imported names
index fafcc8a33fd2bf942a7c61a570de025a03f48138..2e93a73716e79035635927442fa7dfebc1dcdfd5 100644 (file)
@@ -492,6 +492,8 @@ def p_atom(s):
         kind, value = p_cat_string_literal(s)
         if kind == 'c':
             return ExprNodes.CharNode(pos, value = value)
+        elif kind == 'u':
+            return ExprNodes.UnicodeNode(pos, value = value)
         else:
             return ExprNodes.StringNode(pos, value = value)
     elif sy == 'IDENT':
index e8a7fd10d8f85a3b06ec8537b6d2dbfbe4932358..8417b8c1cbe5eff306e680fbc34a9b7efd205141 100644 (file)
@@ -998,20 +998,6 @@ class CStringType:
         return '"%s"' % Utils.escape_byte_string(value)
 
 
-class CUTF8StringType:
-    #  Mixin class for C unicode types.
-
-    is_string = 1
-    is_unicode = 1
-    
-    to_py_function = "PyUnicode_DecodeUTF8"
-    exception_value = "NULL"
-
-    def literal_code(self, value):
-        assert isinstance(value, str)
-        return '"%s"' % Utils.escape_byte_string(value)
-
-
 class CCharArrayType(CStringType, CArrayType):
     #  C 'char []' type.
     
@@ -1020,16 +1006,6 @@ class CCharArrayType(CStringType, CArrayType):
     
     def __init__(self, size):
         CArrayType.__init__(self, c_char_type, size)
-
-
-class CUTF8CharArrayType(CUTF8StringType, CArrayType):
-    #  C 'char []' type.
-    
-    parsetuple_format = "s"
-    pymemberdef_typecode = "T_STRING_INPLACE"
-    
-    def __init__(self, size):
-        CArrayType.__init__(self, c_char_type, size)
     
 
 class CCharPtrType(CStringType, CPtrType):
@@ -1042,6 +1018,29 @@ class CCharPtrType(CStringType, CPtrType):
         CPtrType.__init__(self, c_char_type)
 
 
+class UnicodeType(BuiltinObjectType):
+    #  The Python 'unicode' builtin type.
+    #  Declared in C as a char array unless Pyrex/display form is requested.
+    is_string = 1
+    is_unicode = 1
+    parsetuple_format = "O"
+
+    def __init__(self):
+        BuiltinObjectType.__init__(self, "unicode", "PyUnicodeObject")
+
+    def literal_code(self, value):
+        # Render the str value as a double-quoted, escaped literal.
+        assert isinstance(value, str)
+        escaped = Utils.escape_byte_string(value)
+        return '"%s"' % escaped
+
+    def declaration_code(self, entity_code, 
+            for_display = 0, dll_linkage = None, pyrex = 0):
+        if not (pyrex or for_display):
+            return "%s %s[]" % (public_decl("char", dll_linkage), entity_code)
+        return self.base_declaration_code(self.name, entity_code)
+
+
 class ErrorType(PyrexType):
     # Used to prevent propagation of error messages.
     
@@ -1106,7 +1105,7 @@ c_longdouble_type =  CFloatType(8, typestring="g")
 
 c_null_ptr_type =     CNullPtrType(c_void_type)
 c_char_array_type =   CCharArrayType(None)
-c_utf8_char_array_type =   CUTF8CharArrayType(None)
+c_unicode_type =      UnicodeType()
 c_char_ptr_type =     CCharPtrType()
 c_char_ptr_ptr_type = CPtrType(c_char_ptr_type)
 c_py_ssize_t_ptr_type =  CPtrType(c_py_ssize_t_type)
index 3942faa263984d6eb765ccc1da05dba04b5852cf..d47cd02f0f9187787281364f5494505a7a334a52 100644 (file)
@@ -504,7 +504,7 @@ class Scope:
         else:
             cname = self.new_const_cname()
         if value.is_unicode:
-            c_type = PyrexTypes.c_utf8_char_array_type
+            c_type = PyrexTypes.c_unicode_type
             value = value.utf8encode()
         else:
             c_type = PyrexTypes.c_char_array_type