From: Stefan Behnel <scoder@users.berlios.de>
Date: Sat, 29 Jan 2011 17:35:53 +0000 (+0100)
Subject: implemented Py_UCS4 type
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=e68b3b57b9c8999df250e6bcaed538aa33ea0235;p=cython.git

implemented Py_UCS4 type

--HG--
rename : tests/errors/py_unicode_type_errors.pyx => tests/errors/py_ucs4_type_errors.pyx
rename : tests/run/py_unicode_type.pyx => tests/run/py_ucs4_type.pyx
---

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index 6824d946..555588cf 100755
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -981,8 +981,8 @@ class BytesNode(ConstNode):
             if not self.can_coerce_to_char_literal():
                 error(self.pos, "Only single-character string literals can be coerced into ints.")
                 return self
-            if dst_type is PyrexTypes.c_py_unicode_type:
-                error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
+            if dst_type.is_unicode_char:
+                error(self.pos, "Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.")
                 return self
             return CharNode(self.pos, value=self.value)
 
@@ -1033,17 +1033,17 @@ class UnicodeNode(PyConstNode):
     def coerce_to(self, dst_type, env):
         if dst_type is self.type:
             pass
-        elif dst_type is PyrexTypes.c_py_unicode_type:
+        elif dst_type.is_unicode_char:
             if not self.can_coerce_to_char_literal():
-                error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
+                error(self.pos, "Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.")
                 return self
             int_value = ord(self.value)
-            return IntNode(self.pos, value=int_value, constant_result=int_value)
+            return IntNode(self.pos, type=dst_type, value=str(int_value), constant_result=int_value)
         elif not dst_type.is_pyobject:
             if dst_type.is_string and self.bytes_value is not None:
                 # special case: '-3' enforced unicode literal used in a C char* context
                 return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
-            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
+            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.")
         elif dst_type is not py_object_type:
             if not self.check_for_coercion_error(dst_type):
                 self.fail_assignment(dst_type)
@@ -1051,6 +1051,9 @@ class UnicodeNode(PyConstNode):
 
     def can_coerce_to_char_literal(self):
         return len(self.value) == 1
+            ## TODO: surrogate pairs are not accepted here yet; a possible check is
+            ##     len(self.value) == 2 and 0xD800 <= ord(self.value[0]) <= 0xDBFF
+            ##     and 0xDC00 <= ord(self.value[1]) <= 0xDFFF
 
     def contains_surrogates(self):
         # Check if the unicode string contains surrogate code points
@@ -2165,8 +2168,8 @@ class IndexNode(ExprNode):
             elif not skip_child_analysis:
                 self.index.analyse_types(env)
             self.original_index_type = self.index.type
-            if base_type is PyrexTypes.c_py_unicode_type:
-                # we infer Py_UNICODE for unicode strings in some
+            if base_type.is_unicode_char:
+                # we infer Py_UNICODE/Py_UCS4 for unicode strings in some
                 # cases, but indexing must still work for them
                 if self.index.constant_result in (0, -1):
                     # FIXME: we know that this node is redundant -
@@ -2188,7 +2191,7 @@ class IndexNode(ExprNode):
                     self.index = self.index.coerce_to_pyobject(env)
                     self.is_temp = 1
                 if self.index.type.is_int and base_type is unicode_type:
-                    # Py_UNICODE will automatically coerce to a unicode string
+                    # Py_UNICODE/Py_UCS4 will automatically coerce to a unicode string
                     # if required, so this is fast and safe
                     self.type = PyrexTypes.c_py_unicode_type
                 elif is_slice and base_type in (bytes_type, str_type, unicode_type, list_type, tuple_type):
@@ -2253,7 +2256,7 @@ class IndexNode(ExprNode):
             return "PyList_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
         elif self.base.type is tuple_type:
             return "PyTuple_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
-        elif self.base.type is unicode_type and self.type is PyrexTypes.c_py_unicode_type:
+        elif self.base.type is unicode_type and self.type.is_unicode_char:
             return "PyUnicode_AS_UNICODE(%s)[%s]" % (self.base.result(), self.index.result())
         elif (self.type.is_ptr or self.type.is_array) and self.type == self.base.type:
             error(self.pos, "Invalid use of pointer slice")
@@ -2332,7 +2335,7 @@ class IndexNode(ExprNode):
                         self.result(),
                         code.error_goto(self.pos)))
                 code.put_gotref(self.py_result())
-            elif self.type is PyrexTypes.c_py_unicode_type and self.base.type is unicode_type:
+            elif self.type.is_unicode_char and self.base.type is unicode_type:
                 assert self.index.type.is_int
                 index_code = self.index.result()
                 function = "__Pyx_GetItemInt_Unicode"
@@ -5845,8 +5848,8 @@ class NumBinopNode(BinopNode):
                 self.operand2.result())
 
     def is_py_operation_types(self, type1, type2):
-        return (type1 is PyrexTypes.c_py_unicode_type or
-                type2 is PyrexTypes.c_py_unicode_type or
+        return (type1.is_unicode_char or
+                type2.is_unicode_char or
                 BinopNode.is_py_operation_types(self, type1, type2))
 
     def py_operation_function(self):
@@ -6503,7 +6506,7 @@ class CmpNode(object):
         return self.operator in ('in', 'not_in') and \
                ((self.operand1.type.is_int
                  and (self.operand2.type.is_string or self.operand2.type is bytes_type)) or
-                (self.operand1.type is PyrexTypes.c_py_unicode_type
+                (self.operand1.type.is_unicode_char
                  and self.operand2.type is unicode_type))
 
     def is_ptr_contains(self):
@@ -7166,7 +7169,7 @@ class CoerceToPyTypeNode(CoercionNode):
             # be specific about some known types
             if arg.type.is_string:
                 self.type = bytes_type
-            elif arg.type is PyrexTypes.c_py_unicode_type:
+            elif arg.type.is_unicode_char:
                 self.type = unicode_type
             elif arg.type.is_complex:
                 self.type = Builtin.complex_type
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py
index 3139e400..a4a3ed71 100644
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -1936,7 +1936,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
                 node.pos, cfunc_name, self.PyObject_Size_func_type,
                 args = [arg],
                 is_temp = node.is_temp)
-        elif arg.type is PyrexTypes.c_py_unicode_type:
+        elif arg.type.is_unicode_char:
             return ExprNodes.IntNode(node.pos, value='1', constant_result=1,
                                      type=node.type)
         else:
@@ -2028,7 +2028,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             return node
         arg = pos_args[0]
         if isinstance(arg, ExprNodes.CoerceToPyTypeNode):
-            if arg.arg.type is PyrexTypes.c_py_unicode_type:
+            if arg.arg.type.is_unicode_char:
                 return arg.arg.coerce_to(node.type, self.current_env())
         return node
 
@@ -2191,7 +2191,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             return node
         ustring = args[0]
         if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
-               ustring.arg.type is not PyrexTypes.c_py_unicode_type:
+               not ustring.arg.type.is_unicode_char:
             return node
         uchar = ustring.arg
         method_name = node.function.attribute
@@ -2230,7 +2230,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             return node
         ustring = args[0]
         if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
-               ustring.arg.type is not PyrexTypes.c_py_unicode_type:
+               not ustring.arg.type.is_unicode_char:
             return node
         uchar = ustring.arg
         method_name = node.function.attribute
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 7f59c256..2cd437e7 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -2041,6 +2041,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
 special_basic_c_types = {
     # name : (signed, longness)
     "Py_UNICODE" : (0, 0),
+    "Py_UCS4"    : (0, 0),
     "Py_ssize_t" : (2, 0),
     "ssize_t"    : (2, 0),
     "size_t"     : (0, 0),
diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py
index 39dd9203..ca997ee8 100755
--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -49,6 +49,7 @@ class PyrexType(BaseType):
     #  is_typedef            boolean     Is a typedef type
     #  is_string             boolean     Is a C char * type
     #  is_unicode            boolean     Is a UTF-8 encoded C char * type
+    #  is_unicode_char       boolean     Is either Py_UCS4 or Py_UNICODE
     #  is_returncode         boolean     Is used only to signal exceptions
     #  is_error              boolean     Is the dummy error type
     #  is_buffer             boolean     Is buffer access type
@@ -101,6 +102,7 @@ class PyrexType(BaseType):
     is_typedef = 0
     is_string = 0
     is_unicode = 0
+    is_unicode_char = 0
     is_returncode = 0
     is_error = 0
     is_buffer = 0
@@ -924,9 +926,78 @@ class CBIntType(CIntType):
         return "<CNumericType bint>"
 
 
+class CPyUCS4IntType(CIntType):
+    # Py_UCS4
+
+    is_unicode_char = True
+
+    # Py_UCS4 coerces from and to single character unicode strings (or
+    # at most two characters on 16bit Unicode builds), but we also
+    # allow Python integers as input.  The value range for Py_UCS4
+    # is 0..1114111, which is checked when converting from an integer
+    # value.
+
+    to_py_function = "PyUnicode_FromOrdinal"
+    from_py_function = "__Pyx_PyObject_AsPy_UCS4"
+
+    def create_from_py_utility_code(self, env):
+        env.use_utility_code(pyobject_as_py_ucs4_utility_code)
+        return True
+
+    def sign_and_name(self):
+        return "Py_UCS4"
+
+
+pyobject_as_py_ucs4_utility_code = UtilityCode(
+proto='''
+static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject*);
+''',
+impl='''
+static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject* x) {
+   long ival;
+   if (PyUnicode_Check(x)) {
+       if (likely(PyUnicode_GET_SIZE(x) == 1)) {
+           return PyUnicode_AS_UNICODE(x)[0];
+       } else if (PyUnicode_GET_SIZE(x) == 2) {  /* maybe a UTF-16 surrogate pair */
+           Py_UCS4 high_val = PyUnicode_AS_UNICODE(x)[0];
+           if (high_val >= 0xD800 && high_val <= 0xDBFF) {
+               Py_UCS4 low_val = PyUnicode_AS_UNICODE(x)[1];
+               if (low_val >= 0xDC00 && low_val <= 0xDFFF) {  /* add the 0x10000 offset; OR-ing it loses bit 16 */
+                   return 0x10000 + (((high_val & ((1<<10)-1)) << 10) | (low_val & ((1<<10)-1)));
+               }
+           }
+       }
+       PyErr_Format(PyExc_ValueError,
+           "only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length "
+           #if PY_VERSION_HEX < 0x02050000
+           "%d",
+           #else
+           "%zd",
+           #endif
+           PyUnicode_GET_SIZE(x));
+       return (Py_UCS4)-1;
+   }
+   ival = __Pyx_PyInt_AsLong(x);  /* not a unicode string: convert x as an integer value */
+   if (unlikely(ival < 0)) {
+       if (!PyErr_Occurred())  /* keep an exception that is already set */
+           PyErr_SetString(PyExc_OverflowError,
+                           "cannot convert negative value to Py_UCS4");
+       return (Py_UCS4)-1;
+   } else if (unlikely(ival > 1114111)) {  /* 1114111 == 0x10FFFF */
+       PyErr_SetString(PyExc_OverflowError,
+                       "value too large to convert to Py_UCS4");
+       return (Py_UCS4)-1;
+   }
+   return (Py_UCS4)ival;
+}
+''')
+
+
 class CPyUnicodeIntType(CIntType):
     # Py_UNICODE
 
+    is_unicode_char = True
+
     # Py_UNICODE coerces from and to single character unicode strings,
     # but we also allow Python integers as input.  The value range for
     # Py_UNICODE is 0..1114111, which is checked when converting from
@@ -2306,6 +2377,7 @@ c_anon_enum_type =   CAnonEnumType(-1)
 c_returncode_type =  CReturnCodeType(RANK_INT)
 c_bint_type =        CBIntType(RANK_INT)
 c_py_unicode_type =  CPyUnicodeIntType(RANK_INT-0.5, UNSIGNED)
+c_py_ucs4_type =     CPyUCS4IntType(RANK_LONG-0.5, UNSIGNED)
 c_py_ssize_t_type =  CPySSizeTType(RANK_LONG+0.5, SIGNED)
 c_ssize_t_type =     CSSizeTType(RANK_LONG+0.5, SIGNED)
 c_size_t_type =      CSizeTType(RANK_LONG+0.5, UNSIGNED)
@@ -2367,6 +2439,7 @@ modifiers_and_name_to_type = {
 
     (1,  0, "bint"):       c_bint_type,
     (0,  0, "Py_UNICODE"): c_py_unicode_type,
+    (0,  0, "Py_UCS4"):    c_py_ucs4_type,
     (2,  0, "Py_ssize_t"): c_py_ssize_t_type,
     (2,  0, "ssize_t") :   c_ssize_t_type,
     (0,  0, "size_t") :    c_size_t_type,
@@ -2614,6 +2687,8 @@ def parse_basic_type(name):
     longness = 0
     if name == 'Py_UNICODE':
         signed = 0
+    elif name == 'Py_UCS4':
+        signed = 0
     elif name == 'Py_ssize_t':
         signed = 2
     elif name == 'ssize_t':
diff --git a/tests/errors/py_ucs4_type_errors.pyx b/tests/errors/py_ucs4_type_errors.pyx
new file mode 100644
index 00000000..639b3324
--- /dev/null
+++ b/tests/errors/py_ucs4_type_errors.pyx
@@ -0,0 +1,24 @@
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UCS4 char_ASCII = u'A'
+cdef Py_UCS4 char_KLINGON = u'\uF8D2'
+
+def char_too_long_ASCII():
+    cdef Py_UCS4 c = u'AB'
+
+def char_too_long_Unicode():
+    cdef Py_UCS4 c = u'A\uF8D2'
+
+def char_too_long_bytes():
+    cdef Py_UCS4 c = b'AB'
+
+def char_too_long_latin1():
+    cdef Py_UCS4 char_bytes_latin1 = b'\xf6'
+
+
+_ERRORS = """
+ 7:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+10:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+13:21: Only single-character string literals can be coerced into ints.
+16:37: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
+"""
diff --git a/tests/errors/py_unicode_type_errors.pyx b/tests/errors/py_unicode_type_errors.pyx
index 3afe0258..17fdc9b0 100644
--- a/tests/errors/py_unicode_type_errors.pyx
+++ b/tests/errors/py_unicode_type_errors.pyx
@@ -17,8 +17,8 @@ def char_too_long_latin1():
 
 
 _ERRORS = """
-7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
-10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+ 7:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+10:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
 13:24: Only single-character string literals can be coerced into ints.
-16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
+16:40: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
 """
diff --git a/tests/run/py_ucs4_type.pyx b/tests/run/py_ucs4_type.pyx
new file mode 100644
index 00000000..a9548402
--- /dev/null
+++ b/tests/run/py_ucs4_type.pyx
@@ -0,0 +1,197 @@
+# -*- coding: iso-8859-1 -*-
+
+cimport cython
+
+cdef Py_UCS4 char_ASCII = u'A'
+cdef Py_UCS4 char_KLINGON = u'\uF8D2'
+
+def compare_ASCII():
+    """
+    >>> compare_ASCII()
+    True
+    False
+    False
+    """
+    print(char_ASCII == u'A')
+    print(char_ASCII == u'B')
+    print(char_ASCII == u'\uF8D2')
+
+
+def compare_klingon():
+    """
+    >>> compare_klingon()
+    True
+    False
+    False
+    """
+    print(char_KLINGON == u'\uF8D2')
+    print(char_KLINGON == u'A')
+    print(char_KLINGON == u'B')
+
+
+from cpython.unicode cimport PyUnicode_FromOrdinal
+import sys
+
+u0 = u'\x00'
+u1 = u'\x01'
+umax = PyUnicode_FromOrdinal(sys.maxunicode)
+
+def unicode_ordinal(Py_UCS4 i):
+    """
+    >>> ord(unicode_ordinal(0)) == 0
+    True
+    >>> ord(unicode_ordinal(1)) == 1
+    True
+    >>> ord(unicode_ordinal(sys.maxunicode)) == sys.maxunicode
+    True
+
+    >>> ord(unicode_ordinal(u0)) == 0
+    True
+    >>> ord(unicode_ordinal(u1)) == 1
+    True
+    >>> ord(unicode_ordinal(umax)) == sys.maxunicode
+    True
+
+    Value too small:
+    >>> unicode_ordinal(-1) #doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ...
+    OverflowError: ...
+
+    Value too large:
+    >>> unicode_ordinal(sys.maxunicode+1) #doctest: +ELLIPSIS
+    Traceback (most recent call last):
+    ...
+    OverflowError: ...
+
+    Less than one character:
+    >>> unicode_ordinal(u0[:0])
+    Traceback (most recent call last):
+    ...
+    ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 0
+
+    More than one character:
+    >>> unicode_ordinal(u0+u1)
+    Traceback (most recent call last):
+    ...
+    ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 2
+    """
+    return i
+
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_type_methods(Py_UCS4 uchar):
+    """
+    >>> unicode_type_methods(ord('A'))
+    [True, True, False, False, False, False, False, True, True]
+    >>> unicode_type_methods(ord('a'))
+    [True, True, False, False, True, False, False, False, False]
+    >>> unicode_type_methods(ord('8'))
+    [True, False, True, True, False, True, False, False, False]
+    >>> unicode_type_methods(ord('\\t'))
+    [False, False, False, False, False, False, True, False, False]
+    """
+    return [
+        # character types
+        uchar.isalnum(),
+        uchar.isalpha(),
+        uchar.isdecimal(),
+        uchar.isdigit(),
+        uchar.islower(),
+        uchar.isnumeric(),
+        uchar.isspace(),
+        uchar.istitle(),
+        uchar.isupper(),
+        ]
+
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_methods(Py_UCS4 uchar):
+    """
+    >>> unicode_methods(ord('A')) == ['a', 'A', 'A']
+    True
+    >>> unicode_methods(ord('a')) == ['a', 'A', 'A']
+    True
+    """
+    return [
+        # character conversion
+        uchar.lower(),
+        uchar.upper(),
+        uchar.title(),
+        ]
+
+@cython.test_assert_path_exists('//IntNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode',
+                                 '//PythonCapiCallNode')
+def len_uchar(Py_UCS4 uchar):
+    """
+    >>> len_uchar(ord('A'))
+    1
+    """
+    return len(uchar)
+
+def index_uchar(Py_UCS4 uchar, Py_ssize_t i):
+    """
+    >>> index_uchar(ord('A'), 0) == ('A', 'A', 'A')
+    True
+    >>> index_uchar(ord('A'), -1) == ('A', 'A', 'A')
+    True
+    >>> index_uchar(ord('A'), 1)
+    Traceback (most recent call last):
+    IndexError: string index out of range
+    """
+    return uchar[0], uchar[-1], uchar[i]
+
+mixed_ustring = u'AbcDefGhIjKlmnoP'  # 6 upper-case and 10 lower-case letters
+lower_ustring = mixed_ustring.lower()
+upper_ustring = mixed_ustring.upper()  # was .lower(): copy-paste slip
+
+@cython.test_assert_path_exists('//PythonCapiCallNode',
+                                '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode',
+                                 '//ForInStatNode')
+def count_lower_case_characters(unicode ustring):
+    """
+    >>> count_lower_case_characters(mixed_ustring)
+    10
+    >>> count_lower_case_characters(lower_ustring)
+    16
+    """
+    cdef Py_ssize_t count = 0
+    for uchar in ustring:
+         if uchar.islower():
+             count += 1
+    return count
+
+@cython.test_assert_path_exists('//SwitchStatNode',
+                                '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//ForInStatNode')
+def iter_and_in():
+    """
+    >>> iter_and_in()
+    a
+    b
+    e
+    f
+    h
+    """
+    for c in u'abcdefgh':
+        if c in u'abCDefGh':
+            print c
+
+@cython.test_assert_path_exists('//SwitchStatNode',
+                                '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//ForInStatNode')
+def index_and_in():
+    """
+    >>> index_and_in()
+    1
+    3
+    4
+    7
+    8
+    """
+    cdef int i
+    for i in range(1,9):
+        if u'abcdefgh'[-i] in u'abCDefGh':
+            print i