native support for Py_UNICODE, coercion between 1-character unicode literals and...

author Stefan Behnel <scoder@users.berlios.de>

Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)

committer Stefan Behnel <scoder@users.berlios.de>

Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)
author Stefan Behnel <scoder@users.berlios.de>
Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py

index dda284c70a77bee7d745af5400e0d23332b269bf..c8d733ca14c03d7c106352571eb291dabf1092be 100755 (executable)
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -860,7 +860,10 @@ class BytesNode(ConstNode):
      def coerce_to(self, dst_type, env):
          if dst_type.is_int:
              if not self.can_coerce_to_char_literal():
-                error(self.pos, "Only single-character strings can be coerced into ints.")
+                error(self.pos, "Only single-character string literals can be coerced into ints.")
+                return self
+            if dst_type is PyrexTypes.c_py_unicode_type:
+                error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
                  return self
              return CharNode(self.pos, value=self.value)
  
@@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode):
      def coerce_to(self, dst_type, env):
          if dst_type is self.type:
              pass
+        elif dst_type is PyrexTypes.c_py_unicode_type:
+            if not self.can_coerce_to_char_literal():
+                error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
+                return self
+            int_value = ord(self.value)
+            return IntNode(self.pos, value=int_value, constant_result=int_value)
          elif not dst_type.is_pyobject:
-            error(self.pos, "Unicode objects do not support coercion to C types.")
+            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
          elif dst_type is not py_object_type:
              if not self.check_for_coercion_error(dst_type):
                  self.fail_assignment(dst_type)
          return self
  
+    def can_coerce_to_char_literal(self):
+        return len(self.value) == 1
+
      def generate_evaluation_code(self, code):
          self.result_code = code.get_py_string_const(self.value)
  
@@ -5426,10 +5438,10 @@ class CmpNode(object):
          type1_can_be_int = False
          type2_can_be_int = False
  
-        if isinstance(operand1, (StringNode, BytesNode)) \
+        if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \
                 and operand1.can_coerce_to_char_literal():
              type1_can_be_int = True
-        if isinstance(operand2, (StringNode, BytesNode)) \
+        if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \
                   and operand2.can_coerce_to_char_literal():
              type2_can_be_int = True
  
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py

index e68f7bfc0f65a84a3cf311a4fc8348ea3d1859d4..4739f36336fb4204bc957e0a792d2b4b6704ced1 100644 (file)
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform):
          return node
  
      PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType(
-        PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE*
+        PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [
              PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None)
              ])
  
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py

index 5966cc426180c1f37529c8d6406e298ce41ec4e7..3397d7718c842ed7a720b813ae2f15f65c3cfcab 100644 (file)
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
  
  special_basic_c_types = {
      # name : (signed, longness)
+    "Py_UNICODE" : (0, 0),
      "Py_ssize_t" : (2, 0),
      "size_t"     : (0, 0),
  }
diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py

index 3126b3332ebe568330e5cfd00a76378afed51f24..101249a5b20532bc649d51fd1a8e64f5d4b8d6cd 100755 (executable)
--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -863,6 +863,20 @@ class CAnonEnumType(CIntType):
          return 'int'
  
  
+class CPyUnicodeIntType(CIntType):
+    # Py_UNICODE
+
+    # Conversion from a unicode string to Py_UNICODE at runtime is not
+    # currently supported and may never be - we only convert from and
+    # to integers here.  The maximum value for a Py_UNICODE is
+    # 1114111, so PyInt_FromLong() will do just fine here.
+
+    to_py_function = "PyInt_FromLong"
+
+    def sign_and_name(self):
+        return "Py_UNICODE"
+
+
  class CPySSizeTType(CIntType):
  
      to_py_function = "PyInt_FromSsize_t"
@@ -2075,14 +2089,15 @@ class ErrorType(PyrexType):
  rank_to_type_name = (
      "char",         # 0
      "short",        # 1
-    "int",          # 2
-    "long",         # 3
-    "Py_ssize_t",   # 4
-    "size_t",       # 5
-    "PY_LONG_LONG", # 6
-    "float",        # 7
-    "double",       # 8
-    "long double",  # 9
+    "Py_UNICODE",   # 2
+    "int",          # 3
+    "long",         # 4
+    "Py_ssize_t",   # 5
+    "size_t",       # 6
+    "PY_LONG_LONG", # 7
+    "float",        # 8
+    "double",       # 9
+    "long double",  # 10
  )
  
  py_object_type = PyObjectType()
@@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type)
  
  c_uchar_type =       CIntType(0, 0)
  c_ushort_type =      CIntType(1, 0)
-c_uint_type =        CIntType(2, 0)
-c_ulong_type =       CIntType(3, 0)
-c_ulonglong_type =   CIntType(6, 0)
+c_py_unicode_type =  CPyUnicodeIntType(2, 0)
+c_uint_type =        CIntType(3, 0)
+c_ulong_type =       CIntType(4, 0)
+c_ulonglong_type =   CIntType(7, 0)
  
  c_char_type =        CIntType(0, 1)
  c_short_type =       CIntType(1, 1)
-c_int_type =         CIntType(2, 1)
-c_long_type =        CIntType(3, 1)
-c_longlong_type =    CIntType(6, 1)
+c_int_type =         CIntType(3, 1)
+c_long_type =        CIntType(4, 1)
+c_longlong_type =    CIntType(7, 1)
  
  c_schar_type =       CIntType(0, 2)
  c_sshort_type =      CIntType(1, 2)
-c_sint_type =        CIntType(2, 2)
-c_slong_type =       CIntType(3, 2)
-c_slonglong_type =   CIntType(6, 2)
+c_sint_type =        CIntType(3, 2)
+c_slong_type =       CIntType(4, 2)
+c_slonglong_type =   CIntType(7, 2)
  
-c_bint_type =        CBIntType(2, 1)
-c_py_ssize_t_type =  CPySSizeTType(4, 2)
-c_size_t_type =      CSizeTType(5, 0)
+c_bint_type =        CBIntType(3, 1)
+c_py_ssize_t_type =  CPySSizeTType(5, 2)
+c_size_t_type =      CSizeTType(6, 0)
  
-c_float_type =       CFloatType(7, math_h_modifier='f')
-c_double_type =      CFloatType(8)
-c_longdouble_type =  CFloatType(9, math_h_modifier='l')
+c_float_type =       CFloatType(8, math_h_modifier='f')
+c_double_type =      CFloatType(9)
+c_longdouble_type =  CFloatType(10, math_h_modifier='l')
  
  c_float_complex_type =      CComplexType(c_float_type)
  c_double_complex_type =     CComplexType(c_double_type)
@@ -2131,7 +2147,7 @@ c_int_ptr_type =      CPtrType(c_int_type)
  c_py_ssize_t_ptr_type =  CPtrType(c_py_ssize_t_type)
  c_size_t_ptr_type =  CPtrType(c_size_t_type)
  
-c_returncode_type =   CIntType(2, 1, is_returncode = 1)
+c_returncode_type =   CIntType(3, 1, is_returncode = 1)
  c_anon_enum_type =    CAnonEnumType(-1, 1)
  
  # the Py_buffer type is defined in Builtin.py
@@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = {
      (1,  0, "bint"): c_bint_type,
      (0,  0, "size_t") :    c_size_t_type,
      (2,  0, "Py_ssize_t"): c_py_ssize_t_type,
+    (0,  0, "Py_UNICODE"): c_py_unicode_type,
  
      (1,  0, "float"):  c_float_type,
      (1,  0, "double"): c_double_type,
@@ -2383,6 +2400,8 @@ def parse_basic_type(name):
          signed = 2
      elif name == 'size_t':
          signed = 0
+    elif name == 'Py_UNICODE':
+        signed = 0
      else:
          if name.startswith('u'):
              name = name[1:]
diff --git a/Cython/Shadow.py b/Cython/Shadow.py

index a48d8d3fd22e6d4329b8e08cddaa3d348e2f1510..d7dd186d5b5664b5e03a0c6c49cd1042b62fbd44 100644 (file)
--- a/Cython/Shadow.py
+++ b/Cython/Shadow.py
@@ -174,7 +174,7 @@ except ImportError:
  
  # Predefined types
  
-int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
+int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
  float_types = ['longdouble', 'double', 'float']
  complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex']
  other_types = ['bint', 'void']
@@ -183,7 +183,7 @@ gs = globals()
  
  for name in int_types:
      gs[name] = typedef(py_int)
-    if not name.endswith('size_t'):
+    if name != 'Py_UNICODE' and not name.endswith('size_t'):
          gs['u'+name] = typedef(py_int)
          gs['s'+name] = typedef(py_int)
      
diff --git a/tests/errors/e_strcoerce.pyx b/tests/errors/e_strcoerce.pyx

index cda8dd57ad8353b918e34f6e5fd9dc7094871326..8de7344fe41912a5bb2d4589763b518c41a18bf7 100644 (file)
--- a/tests/errors/e_strcoerce.pyx
+++ b/tests/errors/e_strcoerce.pyx
@@ -4,12 +4,14 @@ cdef int cx = "test"  # fails
  
  cdef int x1 =  "\xFF"    # works
  cdef int x2 =  "\u0FFF"  # fails
-cdef int x3 = u"\xFF"    # fails
  
+cdef Py_UNICODE u1 = u"\xFF"   # works
+cdef int u3 = u"\xFF"          # fails
  
-_ERRORS = u"""
-2:14: Only single-character strings can be coerced into ints.
-3:14: Only single-character strings can be coerced into ints.
-6:15: Only single-character strings can be coerced into ints.
-7:14: Unicode objects do not support coercion to C types.
+
+_ERRORS = """
+2:14: Only single-character string literals can be coerced into ints.
+3:14: Only single-character string literals can be coerced into ints.
+6:15: Only single-character string literals can be coerced into ints.
+9:14: Unicode literals do not support coercion to C types other than Py_UNICODE.
  """
diff --git a/tests/errors/py_unicode_type_errors.pyx b/tests/errors/py_unicode_type_errors.pyx

new file mode 100644 (file)

index 0000000..1b4e10e
--- /dev/null
+++ b/tests/errors/py_unicode_type_errors.pyx
@@ -0,0 +1,24 @@
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UNICODE char_ASCII = u'A'
+cdef Py_UNICODE char_KLINGON = u'\uF8D2'
+
+def char_too_long_ASCII():
+    cdef Py_UNICODE c = u'AB'
+
+def char_too_long_Unicode():
+    cdef Py_UNICODE c = u'A\uF8D2'
+
+def char_too_long_bytes():
+    cdef Py_UNICODE c = b'AB'
+
+def char_too_long_latin1():
+    cdef Py_UNICODE char_bytes_latin1 = b'ö'
+
+
+_ERRORS = """
+7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+13:24: Only single-character string literals can be coerced into ints.
+16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
+"""
diff --git a/tests/errors/string_assignments.pyx b/tests/errors/string_assignments.pyx

index 5a6910356717ea12466b6a3f2eca704c94771734..1b5a13411395889fe51e93681a79d00cdd9a8d31 100644 (file)
--- a/tests/errors/string_assignments.pyx
+++ b/tests/errors/string_assignments.pyx
@@ -50,7 +50,7 @@ cdef list  l_f2 = b1
  cdef list  l_f3 = u1
  
  _ERRORS = u"""
-25:20: Unicode objects do not support coercion to C types.
+25:20: Unicode literals do not support coercion to C types other than Py_UNICODE.
  26:22: Unicode objects do not support coercion to C types.
  27:22: 'str' objects do not support coercion to C types (use 'bytes'?).
  
diff --git a/tests/run/for_in_string.pyx b/tests/run/for_in_string.pyx

index 9e920dfe0d4fc7bb987851946064d87e05efd7e8..4851ab1725e7d61a2d9d84712e0395b6df1c73b5 100644 (file)
--- a/tests/run/for_in_string.pyx
+++ b/tests/run/for_in_string.pyx
@@ -14,7 +14,7 @@ def for_in_bytes(bytes s):
      'C'
      """
      for c in s:
-        if c == 'C':
+        if c == b'C':
              return 'C'
      else:
          return 'X'
@@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s):
      """
      cdef char c
      for c in s:
-        if c == 'C':
+        if c == b'C':
              return 'C'
      else:
          return 'X'
  
-def for_int_in_unicode(unicode s):
+def for_pyunicode_in_unicode(unicode s):
      """
-    >>> for_int_in_unicode(unicode_abc)
+    >>> for_pyunicode_in_unicode(unicode_abc)
      'X'
-    >>> for_int_in_unicode(unicode_ABC)
+    >>> for_pyunicode_in_unicode(unicode_ABC)
      'C'
      """
-    cdef int c
+    cdef Py_UNICODE c
      for c in s:
-        if c == 'C':
+        if c == u'C':
              return 'C'
      else:
          return 'X'
diff --git a/tests/run/py_unicode_type.pyx b/tests/run/py_unicode_type.pyx

new file mode 100644 (file)

index 0000000..b1bb40e
--- /dev/null
+++ b/tests/run/py_unicode_type.pyx
@@ -0,0 +1,44 @@
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UNICODE char_ASCII = u'A'
+cdef Py_UNICODE char_KLINGON = u'\uF8D2'
+
+
+def compare_ASCII():
+    """
+    >>> compare_ASCII()
+    True
+    False
+    False
+    """
+    print(char_ASCII == u'A')
+    print(char_ASCII == u'B')
+    print(char_ASCII == u'\uF8D2')
+
+
+def compare_KLINGON():
+    """
+    >>> compare_KLINGON()
+    True
+    False
+    False
+    """
+    print(char_KLINGON == u'\uF8D2')  # same code point as char_KLINGON
+    print(char_KLINGON == u'A')       # ASCII literal vs. non-ASCII Py_UNICODE
+    print(char_KLINGON == u'B')
+
+
+def index_literal(int i):
+    """
+    >>> index_literal(0) == '1'
+    True
+    >>> index_literal(-5) == '1'
+    True
+    >>> index_literal(2) == '3'
+    True
+    >>> index_literal(4) == '5'
+    True
+    """
+    # runtime casts are not currently supported
+    #return <Py_UNICODE>(u"12345"[i])
+    return u"12345"[i]
author	Stefan Behnel <scoder@users.berlios.de>
	Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)
committer	Stefan Behnel <scoder@users.berlios.de>
	Mon, 19 Apr 2010 07:50:19 +0000 (09:50 +0200)
Cython/Compiler/ExprNodes.py		patch \| blob \| history
Cython/Compiler/Optimize.py		patch \| blob \| history
Cython/Compiler/Parsing.py		patch \| blob \| history
Cython/Compiler/PyrexTypes.py		patch \| blob \| history
Cython/Shadow.py		patch \| blob \| history
tests/errors/e_strcoerce.pyx		patch \| blob \| history
tests/errors/py_unicode_type_errors.pyx	[new file with mode: 0644]	patch \| blob
tests/errors/string_assignments.pyx		patch \| blob \| history
tests/run/for_in_string.pyx		patch \| blob \| history
tests/run/py_unicode_type.pyx	[new file with mode: 0644]	patch \| blob