fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms
author Stefan Behnel <scoder@users.berlios.de>
Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Sat, 3 Jul 2010 15:34:52 +0000 (17:34 +0200)
Cython/Compiler/ExprNodes.py
Cython/Compiler/Optimize.py
tests/run/inop.pyx

index cdf267018b45f5b2c87bc33238facce8609d1554..965a8ec4719dcf015360bdda2e0353dbf1ef1e38 100755 (executable)
@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
     def can_coerce_to_char_literal(self):
         return len(self.value) == 1
 
+    def contains_surrogates(self):
+        # Return True if self.value contains a character that takes
+        # two code units on a narrow (UTF-16) CPython build: either a
+        # code point above U+FFFF (as seen on a wide, UCS-4 build) or
+        # the lead surrogate of a pair.  Return False otherwise.
+        for c in map(ord, self.value):
+            if c > 65535: # can only happen on wide platforms
+                return True
+            # Only the lead code unit (D800-DBFF) of a surrogate pair
+            # is looked for - if one is found, the trail code unit
+            # (DC00-DFFF) is likely there, too.  A trail code unit
+            # without a preceding lead code unit cannot make for a
+            # surrogate pair by itself, so it is not reported.
+            if c >= 0xD800 and c <= 0xDBFF:
+                return True
+        return False
+
     def generate_evaluation_code(self, code):
         self.result_code = code.get_py_string_const(self.value)
 
index 574321b8153cb8510e6bc7f63c30bc3b01f8a926..8eb1e92d06b2de540842ad1122885c7aaeb65615 100644 (file)
@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
                     not_in = cond.operator == 'not_in'
                     if not_in and not allow_not_in:
                         return self.NO_MATCH
+                    if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+                           cond.operand2.contains_surrogates():
+                        # dealing with surrogates leads to different
+                        # behaviour on wide and narrow Unicode
+                        # platforms => refuse to optimise this case
+                        return self.NO_MATCH
                     # this looks somewhat silly, but it does the right
                     # checks for NameNode and AttributeNode
                     if is_common_value(cond.operand1, cond.operand1):
index 917eff714c1fc23827926050f41b6ab77e428950..ff18ef7ad746ff8723b333484c3485b1ceea9a70 100644 (file)
@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
     cdef int result = a in u'abc\0defg\u1234\uF8D2'
     return result
 
+cdef unicode wide_unicode_character = u'\U0010FEDC'  # code point above U+FFFF
+py_wide_unicode_character = wide_unicode_character  # plain name used by the doctest
+cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'  # lead surrogate of U+10FEDC
+cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'  # trail surrogate of U+10FEDC
+py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
+py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2
+
+@cython.test_fail_if_path_exists("//SwitchStatNode")
+@cython.test_assert_path_exists("//PrimaryCmpNode")
+def m_wide_unicode_literal(Py_UNICODE a):
+    """Membership test in a literal holding a character above U+FFFF.
+    >>> m_wide_unicode_literal(ord('f'))
+    1
+    >>> m_wide_unicode_literal(ord('X'))
+    0
+    >>> import sys
+    >>> if sys.maxunicode == 65535:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
+    ... else:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
+    ...     1
+    1
+    1
+    """
+    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
+    return result
+
 @cython.test_assert_path_exists("//SwitchStatNode")
 @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
 def conditional_int(int a):