From: Stefan Behnel <scoder@users.berlios.de>
Date: Sat, 3 Jul 2010 15:34:52 +0000 (+0200)
Subject: fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms
X-Git-Tag: 0.13.beta0~39
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=f180a00f4b28a29c788fd4305cb7e611339bb1cc;p=cython.git

fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms
---

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index cdf26701..965a8ec4 100755
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
     def can_coerce_to_char_literal(self):
         return len(self.value) == 1
 
+    def contains_surrogates(self):
+        # Return True if self.value contains a character above U+FFFF
+        # (seen as one item on wide/UCS-4 CPython builds) or a high
+        # surrogate (seen on narrow/UTF-16 builds), i.e. a character
+        # that takes two code units on a narrow platform.
+        for c in map(ord, self.value):
+            if c > 65535: # can only happen on wide platforms
+                return True
+            # Only the first code unit (D800-DBFF) of a surrogate
+            # pair is checked: if one is present, its partner
+            # (DC00-DFFF) is likely there, too; if none is present,
+            # a second code unit cannot form a surrogate pair on
+            # its own.
+            if c >= 0xD800 and c <= 0xDBFF:
+                return True
+        return False
+
     def generate_evaluation_code(self, code):
         self.result_code = code.get_py_string_const(self.value)
 
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py
index 574321b8..8eb1e92d 100644
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
                     not_in = cond.operator == 'not_in'
                     if not_in and not allow_not_in:
                         return self.NO_MATCH
+                    if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+                           cond.operand2.contains_surrogates():
+                        # dealing with surrogates leads to different
+                        # behaviour on wide and narrow Unicode
+                        # platforms => refuse to optimise this case
+                        return self.NO_MATCH
                     # this looks somewhat silly, but it does the right
                     # checks for NameNode and AttributeNode
                     if is_common_value(cond.operand1, cond.operand1):
diff --git a/tests/run/inop.pyx b/tests/run/inop.pyx
index 917eff71..ff18ef7a 100644
--- a/tests/run/inop.pyx
+++ b/tests/run/inop.pyx
@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
     cdef int result = a in u'abc\0defg\u1234\uF8D2'
     return result
 
+cdef unicode wide_unicode_character = u'\U0010FEDC'  # code point above U+FFFF (outside the BMP)
+py_wide_unicode_character = wide_unicode_character  # Python-level name used by the doctests
+cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'  # UTF-16 high surrogate of U+10FEDC
+cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'  # UTF-16 low surrogate of U+10FEDC
+py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1  # Python-level name used by the doctests
+py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2  # Python-level name used by the doctests
+
+@cython.test_fail_if_path_exists("//SwitchStatNode")
+@cython.test_assert_path_exists("//PrimaryCmpNode")
+def m_wide_unicode_literal(Py_UNICODE a):
+    """
+    >>> m_wide_unicode_literal(ord('f'))
+    1
+    >>> m_wide_unicode_literal(ord('X'))
+    0
+    >>> import sys
+    >>> if sys.maxunicode == 65535:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
+    ... else:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
+    ...     1
+    1
+    1
+    """
+    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'  # literal has a char above U+FFFF: must stay a comparison, not a switch
+    return result
+
 @cython.test_assert_path_exists("//SwitchStatNode")
 @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
 def conditional_int(int a):