From: Stefan Behnel
Date: Sat, 3 Jul 2010 15:34:52 +0000 (+0200)
Subject: fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms
X-Git-Tag: 0.13.beta0~39
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=f180a00f4b28a29c788fd4305cb7e611339bb1cc;p=cython.git

fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms
---

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index cdf26701..965a8ec4 100755
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
     def can_coerce_to_char_literal(self):
         return len(self.value) == 1
 
+    def contains_surrogates(self):
+        # Check if the unicode string contains surrogate code points
+        # on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+        # Unicode, i.e. characters that would be spelled as two
+        # separate code units on a narrow platform.
+        for c in map(ord, self.value):
+            if c > 65535: # can only happen on wide platforms
+                return True
+            # We only look for the first code unit (D800-DBFF) of a
+            # surrogate pair - if we find one, the other one
+            # (DC00-DFFF) is likely there, too. If we don't find it,
+            # any second code unit cannot make for a surrogate pair by
+            # itself.
+            if 0xD800 <= c <= 0xDBFF:
+                return True
+        return False
+
     def generate_evaluation_code(self, code):
         self.result_code = code.get_py_string_const(self.value)
 
diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py
index 574321b8..8eb1e92d 100644
--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
                     not_in = cond.operator == 'not_in'
                     if not_in and not allow_not_in:
                         return self.NO_MATCH
+                    if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+                           cond.operand2.contains_surrogates():
+                        # dealing with surrogates leads to different
+                        # behaviour on wide and narrow Unicode
+                        # platforms => refuse to optimise this case
+                        return self.NO_MATCH
                     # this looks somewhat silly, but it does the right
                     # checks for NameNode and AttributeNode
                     if is_common_value(cond.operand1, cond.operand1):
diff --git a/tests/run/inop.pyx b/tests/run/inop.pyx
index 917eff71..ff18ef7a 100644
--- a/tests/run/inop.pyx
+++ b/tests/run/inop.pyx
@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
     cdef int result = a in u'abc\0defg\u1234\uF8D2'
     return result
 
+cdef unicode wide_unicode_character = u'\U0010FEDC'
+py_wide_unicode_character = wide_unicode_character
+cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'
+cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'
+py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
+py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2
+
+@cython.test_fail_if_path_exists("//SwitchStatNode")
+@cython.test_assert_path_exists("//PrimaryCmpNode")
+def m_wide_unicode_literal(Py_UNICODE a):
+    """
+    >>> m_wide_unicode_literal(ord('f'))
+    1
+    >>> m_wide_unicode_literal(ord('X'))
+    0
+    >>> import sys
+    >>> if sys.maxunicode == 65535:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
+    ... else:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
+    ...     1
+    1
+    1
+    """
+    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
+    return result
+
 @cython.test_assert_path_exists("//SwitchStatNode")
 @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
 def conditional_int(int a):