From dc320e05583fb88ed719560983de56f230ac07ba Mon Sep 17 00:00:00 2001
From: Stefan Behnel <scoder@users.berlios.de>
Date: Mon, 19 Apr 2010 09:50:19 +0200
Subject: [PATCH] native support for Py_UNICODE, coercion between 1-character
 unicode literals and Py_UNICODE, fix C iteration over unicode strings by
 using Py_UNICODE*

---
 Cython/Compiler/ExprNodes.py            | 20 ++++++--
 Cython/Compiler/Optimize.py             |  2 +-
 Cython/Compiler/Parsing.py              |  1 +
 Cython/Compiler/PyrexTypes.py           | 67 ++++++++++++++++---------
 Cython/Shadow.py                        |  4 +-
 tests/errors/e_strcoerce.pyx            | 14 +++---
 tests/errors/py_unicode_type_errors.pyx | 24 +++++++++
 tests/errors/string_assignments.pyx     |  2 +-
 tests/run/for_in_string.pyx             | 14 +++---
 tests/run/py_unicode_type.pyx           | 44 ++++++++++++++++
 10 files changed, 147 insertions(+), 45 deletions(-)
 create mode 100644 tests/errors/py_unicode_type_errors.pyx
 create mode 100644 tests/run/py_unicode_type.pyx

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index dda284c7..c8d733ca 100755
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -860,7 +860,10 @@ class BytesNode(ConstNode):
     def coerce_to(self, dst_type, env):
         if dst_type.is_int:
             if not self.can_coerce_to_char_literal():
-                error(self.pos, "Only single-character strings can be coerced into ints.")
+                error(self.pos, "Only single-character string literals can be coerced into ints.")
+                return self
+            if dst_type is PyrexTypes.c_py_unicode_type:
+                error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
                 return self
             return CharNode(self.pos, value=self.value)
 
@@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode):
     def coerce_to(self, dst_type, env):
         if dst_type is self.type:
             pass
+        elif dst_type is PyrexTypes.c_py_unicode_type:
+            if not self.can_coerce_to_char_literal():
+                error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
+                return self
+            int_value = ord(self.value)
+            return IntNode(self.pos, value=int_value, constant_result=int_value)
         elif not dst_type.is_pyobject:
-            error(self.pos, "Unicode objects do not support coercion to C types.")
+            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
         elif dst_type is not py_object_type:
             if not self.check_for_coercion_error(dst_type):
                 self.fail_assignment(dst_type)
         return self
 
+    def can_coerce_to_char_literal(self):
+        return len(self.value) == 1
+
     def generate_evaluation_code(self, code):
         self.result_code = code.get_py_string_const(self.value)
 
@@ -5426,10 +5438,10 @@ class CmpNode(object):
         type1_can_be_int = False
         type2_can_be_int = False
 
-        if isinstance(operand1, (StringNode, BytesNode)) \
+        if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \
                and operand1.can_coerce_to_char_literal():
             type1_can_be_int = True
-        if isinstance(operand2, (StringNode, BytesNode)) \
+        if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \
                  and operand2.can_coerce_to_char_literal():
             type2_can_be_int = True
 
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py
index e68f7bfc..4739f363 100644
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform):
         return node
 
     PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType(
-        PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE*
+        PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [
             PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None)
             ])
 
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 5966cc42..3397d771 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
 
 special_basic_c_types = {
     # name : (signed, longness)
+    "Py_UNICODE" : (0, 0),
     "Py_ssize_t" : (2, 0),
     "size_t"     : (0, 0),
 }
diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py
index 3126b333..101249a5 100755
--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -863,6 +863,20 @@ class CAnonEnumType(CIntType):
         return 'int'
 
 
+class CPyUnicodeIntType(CIntType):
+    # Py_UNICODE
+
+    # Conversion from a unicode string to Py_UNICODE at runtime is not
+    # currently supported and may never be - we only convert from and
+    # to integers here.  The maximum value for a Py_UNICODE is
+    # 1114111, so PyInt_FromLong() will do just fine here.
+
+    to_py_function = "PyInt_FromLong"
+
+    def sign_and_name(self):
+        return "Py_UNICODE"
+
+
 class CPySSizeTType(CIntType):
 
     to_py_function = "PyInt_FromSsize_t"
@@ -2075,14 +2089,15 @@ class ErrorType(PyrexType):
 rank_to_type_name = (
     "char",         # 0
     "short",        # 1
-    "int",          # 2
-    "long",         # 3
-    "Py_ssize_t",   # 4
-    "size_t",       # 5
-    "PY_LONG_LONG", # 6
-    "float",        # 7
-    "double",       # 8
-    "long double",  # 9
+    "Py_UNICODE",   # 2
+    "int",          # 3
+    "long",         # 4
+    "Py_ssize_t",   # 5
+    "size_t",       # 6
+    "PY_LONG_LONG", # 7
+    "float",        # 8
+    "double",       # 9
+    "long double",  # 10
 )
 
 py_object_type = PyObjectType()
@@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type)
 
 c_uchar_type =       CIntType(0, 0)
 c_ushort_type =      CIntType(1, 0)
-c_uint_type =        CIntType(2, 0)
-c_ulong_type =       CIntType(3, 0)
-c_ulonglong_type =   CIntType(6, 0)
+c_py_unicode_type =  CPyUnicodeIntType(2, 0)
+c_uint_type =        CIntType(3, 0)
+c_ulong_type =       CIntType(4, 0)
+c_ulonglong_type =   CIntType(7, 0)
 
 c_char_type =        CIntType(0, 1)
 c_short_type =       CIntType(1, 1)
-c_int_type =         CIntType(2, 1)
-c_long_type =        CIntType(3, 1)
-c_longlong_type =    CIntType(6, 1)
+c_int_type =         CIntType(3, 1)
+c_long_type =        CIntType(4, 1)
+c_longlong_type =    CIntType(7, 1)
 
 c_schar_type =       CIntType(0, 2)
 c_sshort_type =      CIntType(1, 2)
-c_sint_type =        CIntType(2, 2)
-c_slong_type =       CIntType(3, 2)
-c_slonglong_type =   CIntType(6, 2)
+c_sint_type =        CIntType(3, 2)
+c_slong_type =       CIntType(4, 2)
+c_slonglong_type =   CIntType(7, 2)
 
-c_bint_type =        CBIntType(2, 1)
-c_py_ssize_t_type =  CPySSizeTType(4, 2)
-c_size_t_type =      CSizeTType(5, 0)
+c_bint_type =        CBIntType(3, 1)
+c_py_ssize_t_type =  CPySSizeTType(5, 2)
+c_size_t_type =      CSizeTType(6, 0)
 
-c_float_type =       CFloatType(7, math_h_modifier='f')
-c_double_type =      CFloatType(8)
-c_longdouble_type =  CFloatType(9, math_h_modifier='l')
+c_float_type =       CFloatType(8, math_h_modifier='f')
+c_double_type =      CFloatType(9)
+c_longdouble_type =  CFloatType(10, math_h_modifier='l')
 
 c_float_complex_type =      CComplexType(c_float_type)
 c_double_complex_type =     CComplexType(c_double_type)
@@ -2131,7 +2147,7 @@ c_int_ptr_type =      CPtrType(c_int_type)
 c_py_ssize_t_ptr_type =  CPtrType(c_py_ssize_t_type)
 c_size_t_ptr_type =  CPtrType(c_size_t_type)
 
-c_returncode_type =   CIntType(2, 1, is_returncode = 1)
+c_returncode_type =   CIntType(3, 1, is_returncode = 1)
 c_anon_enum_type =    CAnonEnumType(-1, 1)
 
 # the Py_buffer type is defined in Builtin.py
@@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = {
     (1,  0, "bint"): c_bint_type,
     (0,  0, "size_t") :    c_size_t_type,
     (2,  0, "Py_ssize_t"): c_py_ssize_t_type,
+    (0,  0, "Py_UNICODE"): c_py_unicode_type,
 
     (1,  0, "float"):  c_float_type,
     (1,  0, "double"): c_double_type,
@@ -2383,6 +2400,8 @@ def parse_basic_type(name):
         signed = 2
     elif name == 'size_t':
         signed = 0
+    elif name == 'Py_UNICODE':
+        signed = 0
     else:
         if name.startswith('u'):
             name = name[1:]
diff --git a/Cython/Shadow.py b/Cython/Shadow.py
index a48d8d3f..d7dd186d 100644
--- a/Cython/Shadow.py
+++ b/Cython/Shadow.py
@@ -174,7 +174,7 @@ except ImportError:
 
 # Predefined types
 
-int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
+int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
 float_types = ['longdouble', 'double', 'float']
 complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex']
 other_types = ['bint', 'void']
@@ -183,7 +183,7 @@ gs = globals()
 
 for name in int_types:
     gs[name] = typedef(py_int)
-    if not name.endswith('size_t'):
+    if name != 'Py_UNICODE' and not name.endswith('size_t'):
         gs['u'+name] = typedef(py_int)
         gs['s'+name] = typedef(py_int)
     
diff --git a/tests/errors/e_strcoerce.pyx b/tests/errors/e_strcoerce.pyx
index cda8dd57..8de7344f 100644
--- a/tests/errors/e_strcoerce.pyx
+++ b/tests/errors/e_strcoerce.pyx
@@ -4,12 +4,14 @@ cdef int cx = "test"  # fails
 
 cdef int x1 =  "\xFF"    # works
 cdef int x2 =  "\u0FFF"  # fails
-cdef int x3 = u"\xFF"    # fails
 
+cdef Py_UNICODE u1 = u"\xFF"   # works
+cdef int u3 = u"\xFF"          # fails
 
-_ERRORS = u"""
-2:14: Only single-character strings can be coerced into ints.
-3:14: Only single-character strings can be coerced into ints.
-6:15: Only single-character strings can be coerced into ints.
-7:14: Unicode objects do not support coercion to C types.
+
+_ERRORS = """
+2:14: Only single-character string literals can be coerced into ints.
+3:14: Only single-character string literals can be coerced into ints.
+6:15: Only single-character string literals can be coerced into ints.
+9:14: Unicode literals do not support coercion to C types other than Py_UNICODE.
 """
diff --git a/tests/errors/py_unicode_type_errors.pyx b/tests/errors/py_unicode_type_errors.pyx
new file mode 100644
index 00000000..1b4e10e3
--- /dev/null
+++ b/tests/errors/py_unicode_type_errors.pyx
@@ -0,0 +1,24 @@
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UNICODE char_ASCII = u'A'
+cdef Py_UNICODE char_KLINGON = u'\uF8D2'
+
+def char_too_long_ASCII():
+    cdef Py_UNICODE c = u'AB'
+
+def char_too_long_Unicode():
+    cdef Py_UNICODE c = u'A\uF8D2'
+
+def char_too_long_bytes():
+    cdef Py_UNICODE c = b'AB'
+
+def char_too_long_latin1():
+    cdef Py_UNICODE char_bytes_latin1 = b'ö'
+
+
+_ERRORS = """
+7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+13:24: Only single-character string literals can be coerced into ints.
+16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
+"""
diff --git a/tests/errors/string_assignments.pyx b/tests/errors/string_assignments.pyx
index 5a691035..1b5a1341 100644
--- a/tests/errors/string_assignments.pyx
+++ b/tests/errors/string_assignments.pyx
@@ -50,7 +50,7 @@ cdef list  l_f2 = b1
 cdef list  l_f3 = u1
 
 _ERRORS = u"""
-25:20: Unicode objects do not support coercion to C types.
+25:20: Unicode literals do not support coercion to C types other than Py_UNICODE.
 26:22: Unicode objects do not support coercion to C types.
 27:22: 'str' objects do not support coercion to C types (use 'bytes'?).
 
diff --git a/tests/run/for_in_string.pyx b/tests/run/for_in_string.pyx
index 9e920dfe..4851ab17 100644
--- a/tests/run/for_in_string.pyx
+++ b/tests/run/for_in_string.pyx
@@ -14,7 +14,7 @@ def for_in_bytes(bytes s):
     'C'
     """
     for c in s:
-        if c == 'C':
+        if c == b'C':
             return 'C'
     else:
         return 'X'
@@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s):
     """
     cdef char c
     for c in s:
-        if c == 'C':
+        if c == b'C':
             return 'C'
     else:
         return 'X'
 
-def for_int_in_unicode(unicode s):
+def for_pyunicode_in_unicode(unicode s):
     """
-    >>> for_int_in_unicode(unicode_abc)
+    >>> for_pyunicode_in_unicode(unicode_abc)
     'X'
-    >>> for_int_in_unicode(unicode_ABC)
+    >>> for_pyunicode_in_unicode(unicode_ABC)
     'C'
     """
-    cdef int c
+    cdef Py_UNICODE c
     for c in s:
-        if c == 'C':
+        if c == u'C':
             return 'C'
     else:
         return 'X'
diff --git a/tests/run/py_unicode_type.pyx b/tests/run/py_unicode_type.pyx
new file mode 100644
index 00000000..b1bb40e8
--- /dev/null
+++ b/tests/run/py_unicode_type.pyx
@@ -0,0 +1,44 @@
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UNICODE char_ASCII = u'A'
+cdef Py_UNICODE char_KLINGON = u'\uF8D2'
+
+
+def compare_ASCII():
+    """
+    >>> compare_ASCII()
+    True
+    False
+    False
+    """
+    print(char_ASCII == u'A')
+    print(char_ASCII == u'B')
+    print(char_ASCII == u'\uF8D2')
+
+
+def compare_KLINGON():
+    """Compare the non-ASCII Py_UNICODE global with 1-character unicode literals.
+    >>> compare_KLINGON()
+    True
+    False
+    False
+    """
+    print(char_KLINGON == u'\uF8D2')
+    print(char_KLINGON == u'A')
+    print(char_KLINGON == u'B')
+
+
+def index_literal(int i):
+    """
+    >>> index_literal(0) == '1'
+    True
+    >>> index_literal(-5) == '1'
+    True
+    >>> index_literal(2) == '3'
+    True
+    >>> index_literal(4) == '5'
+    True
+    """
+    # runtime casts are not currently supported
+    #return <Py_UNICODE>(u"12345"[i])
+    return u"12345"[i]
-- 
2.26.2