def can_coerce_to_char_literal(self):
    """
    Tell whether this node's value can stand in for a single character.

    Returns True only when ``self.value`` has a length of exactly one;
    empty and multi-character values return False.
    """
    return len(self.value) == 1
def contains_surrogates(self):
    """
    Check if the unicode string contains surrogate code points.

    This covers CPython platforms with wide (UCS-4) as well as narrow
    (UTF-16) Unicode, i.e. it detects characters that would be spelled
    as two separate code units on a narrow platform.

    Returns True if ``self.value`` contains a code point above U+FFFF
    or a lead surrogate (U+D800..U+DBFF), and False otherwise
    (including for an empty value).
    """
    for c in map(ord, self.value):
        # c > 0xFFFF can only happen on wide platforms.
        #
        # On narrow platforms, we only look for the first code unit
        # (D800-DBFF) of a surrogate pair - if we find one, the other
        # one (DC00-DFFF) is likely there, too.  If we don't find it,
        # any second code unit cannot make for a surrogate pair by
        # itself, so a lone DC00-DFFF is deliberately not reported.
        if c > 0xFFFF or 0xD800 <= c <= 0xDBFF:
            return True
    return False

def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value)
not_in = cond.operator == 'not_in'
if not_in and not allow_not_in:
return self.NO_MATCH
+ if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+ cond.operand2.contains_surrogates():
+ # dealing with surrogates leads to different
+ # behaviour on wide and narrow Unicode
+ # platforms => refuse to optimise this case
+ return self.NO_MATCH
# this looks somewhat silly, but it does the right
# checks for NameNode and AttributeNode
if is_common_value(cond.operand1, cond.operand1):
cdef int result = a in u'abc\0defg\u1234\uF8D2'
return result
# Test fixtures for membership tests against a non-BMP character.
# U+10FEDC lies outside the Basic Multilingual Plane; its UTF-16
# encoding is the surrogate pair DBFF (lead) + DEDC (trail), which is
# what the two "surrogate" strings below hold individually.  Each C-level
# 'unicode' variable is also bound to a Python-visible name so that
# doctests can refer to it.
cdef unicode wide_unicode_character = u'\U0010FEDC'
py_wide_unicode_character = wide_unicode_character
cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'
cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'
py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2

@cython.test_fail_if_path_exists("//SwitchStatNode")
@cython.test_assert_path_exists("//PrimaryCmpNode")
def m_wide_unicode_literal(Py_UNICODE a):
    """
    Test membership of a character in a literal holding a non-BMP character.

    The literal ends in U+10FEDC, which is a single code point on wide
    Unicode builds and a surrogate pair on narrow builds.  Going by the
    decorator names, the test requires that no SwitchStatNode is
    generated and that a PrimaryCmpNode remains in the tree.

    Returns 1 if the character is found in the literal, 0 otherwise.

    On narrow builds (sys.maxunicode == 65535) both surrogate halves must
    be found; on wide builds the full character must be found.  The bare
    "1" in the else branch keeps the expected output at two lines on
    either kind of build.

    >>> m_wide_unicode_literal(ord('f'))
    1
    >>> m_wide_unicode_literal(ord('X'))
    0
    >>> import sys
    >>> if sys.maxunicode == 65535:
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
    ... else:
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
    ...     1
    1
    1
    """
    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
    return result

@cython.test_assert_path_exists("//SwitchStatNode")
@cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
def conditional_int(int a):