fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms

author Stefan Behnel <scoder@users.berlios.de>

Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)

committer Stefan Behnel <scoder@users.berlios.de>

Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
author Stefan Behnel <scoder@users.berlios.de>
Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py

index cdf267018b45f5b2c87bc33238facce8609d1554..965a8ec4719dcf015360bdda2e0353dbf1ef1e38 100755 (executable)
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
      def can_coerce_to_char_literal(self):
          return len(self.value) == 1
  
+    def contains_surrogates(self):
+        # Check if the unicode string contains characters that are
+        # represented differently on wide (UCS-4) and narrow (UTF-16)
+        # CPython builds, i.e. characters that are spelled as two
+        # separate code units (a surrogate pair) on a narrow platform.
+        for c in map(ord, self.value):
+            if c > 65535: # can only happen on wide platforms
+                return True
+            # We only look for the first code unit (D800-DBFF) of a
+            # surrogate pair - if we find one, the other one
+            # (DC00-DFFF) is likely there, too.  If we don't find it,
+            # any second code unit cannot make for a surrogate pair by
+            # itself.
+            if c >= 0xD800 and c <= 0xDBFF:
+                return True
+        return False
+
      def generate_evaluation_code(self, code):
          self.result_code = code.get_py_string_const(self.value)
  
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py

index 574321b8153cb8510e6bc7f63c30bc3b01f8a926..8eb1e92d06b2de540842ad1122885c7aaeb65615 100644 (file)
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
                      not_in = cond.operator == 'not_in'
                      if not_in and not allow_not_in:
                          return self.NO_MATCH
+                    if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+                           cond.operand2.contains_surrogates():
+                        # dealing with surrogates leads to different
+                        # behaviour on wide and narrow Unicode
+                        # platforms => refuse to optimise this case
+                        return self.NO_MATCH
                      # this looks somewhat silly, but it does the right
                      # checks for NameNode and AttributeNode
                      if is_common_value(cond.operand1, cond.operand1):
diff --git a/tests/run/inop.pyx b/tests/run/inop.pyx

index 917eff714c1fc23827926050f41b6ab77e428950..ff18ef7ad746ff8723b333484c3485b1ceea9a70 100644 (file)
--- a/tests/run/inop.pyx
+++ b/tests/run/inop.pyx
@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
      cdef int result = a in u'abc\0defg\u1234\uF8D2'
      return result
  
+cdef unicode wide_unicode_character = u'\U0010FEDC'
+py_wide_unicode_character = wide_unicode_character
+cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'
+cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'
+py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
+py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2
+
+@cython.test_fail_if_path_exists("//SwitchStatNode")
+@cython.test_assert_path_exists("//PrimaryCmpNode")
+def m_wide_unicode_literal(Py_UNICODE a):
+    """
+    >>> m_wide_unicode_literal(ord('f'))
+    1
+    >>> m_wide_unicode_literal(ord('X'))
+    0
+    >>> import sys
+    >>> if sys.maxunicode == 65535:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
+    ... else:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
+    ...     1
+    1
+    1
+    """
+    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
+    return result
+
  @cython.test_assert_path_exists("//SwitchStatNode")
  @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
  def conditional_int(int a):
author	Stefan Behnel <scoder@users.berlios.de>
	Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
committer	Stefan Behnel <scoder@users.berlios.de>
	Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
Cython/Compiler/ExprNodes.py		patch \| blob \| history
Cython/Compiler/Optimize.py		patch \| blob \| history
tests/run/inop.pyx		patch \| blob \| history