From e7336e2f02b5632c679d561fc95af65ff161b07f Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sat, 5 Feb 2011 17:24:44 +0100 Subject: [PATCH] support 'Py_UCS4 in unicode_string' also in 16-bit Unicode builds --- Cython/Compiler/ExprNodes.py | 50 ++++++++++++++++++++++++++++++++---- tests/run/py_ucs4_type.pyx | 22 ++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index 21c23b7a..97298d9c 100755 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -6662,11 +6662,43 @@ static CYTHON_INLINE int __Pyx_UnicodeContains(PyObject* unicode, Py_UNICODE cha """, impl=""" static CYTHON_INLINE int __Pyx_UnicodeContains(PyObject* unicode, Py_UNICODE character) { + Py_UNICODE* pos; const Py_ssize_t length = PyUnicode_GET_SIZE(unicode); Py_UNICODE* char_start = PyUnicode_AS_UNICODE(unicode); + + for (pos=char_start; pos < char_start+length; pos++) { + if (unlikely(character == pos[0])) return 1; + } + return 0; +} +""") + +py_ucs4_in_unicode_utility_code = UtilityCode( +proto=""" +static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 character); /*proto*/ +""", +# additionally handles surrogate pairs in 16bit Unicode builds +impl=""" +static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 character) { Py_UNICODE* pos; + Py_UNICODE uchar; + const Py_ssize_t length = PyUnicode_GET_SIZE(unicode); + Py_UNICODE* char_start = PyUnicode_AS_UNICODE(unicode); + + #if Py_UNICODE_SIZE == 2 + if (unlikely(character > 65535)) { + Py_UNICODE high_val, low_val; + high_val = (Py_UNICODE) (0xD800 | (((character - 0x10000) >> 10) & ((1<<10)-1))); + low_val = (Py_UNICODE) (0xDC00 | ( (character - 0x10000) & ((1<<10)-1))); + for (pos=char_start; pos < char_start+length-1; pos++) { + if (unlikely(high_val == pos[0]) & unlikely(low_val == pos[1])) return 1; + } + return 0; + } + #endif + uchar = (Py_UNICODE) character; for (pos=char_start; pos < 
char_start+length; pos++) { - if (character == pos[0]) return 1; + if (unlikely(uchar == pos[0])) return 1; } return 0; } @@ -6764,7 +6796,12 @@ class PrimaryCmpNode(ExprNode, CmpNode): error(self.pos, "Cascading comparison not yet supported for 'int_val in string'.") return if self.operand2.type is unicode_type: - env.use_utility_code(pyunicode_in_unicode_utility_code) + self.uchar_test_type = PyrexTypes.widest_numeric_type( + self.operand1.type, PyrexTypes.c_py_unicode_type) + if self.uchar_test_type is PyrexTypes.c_py_unicode_type: + env.use_utility_code(pyunicode_in_unicode_utility_code) + else: + env.use_utility_code(py_ucs4_in_unicode_utility_code) else: if self.operand1.type is PyrexTypes.c_uchar_type: self.operand1 = self.operand1.coerce_to(PyrexTypes.c_char_type, env) @@ -6854,10 +6891,13 @@ class PrimaryCmpNode(ExprNode, CmpNode): self.operand1.result(), self.operand2.result()) elif self.is_c_string_contains(): - if self.operand2.type is bytes_type: - method = "__Pyx_BytesContains" + if self.operand2.type is unicode_type: + if self.uchar_test_type is PyrexTypes.c_py_unicode_type: + method = "__Pyx_UnicodeContains" + else: + method = "__Pyx_UnicodeContainsUCS4" else: - method = "__Pyx_UnicodeContains" + method = "__Pyx_BytesContains" if self.operator == "not_in": negation = "!" 
else: diff --git a/tests/run/py_ucs4_type.pyx b/tests/run/py_ucs4_type.pyx index b5790dce..04253765 100644 --- a/tests/run/py_ucs4_type.pyx +++ b/tests/run/py_ucs4_type.pyx @@ -195,3 +195,25 @@ def index_and_in(): for i in range(1,9): if u'abcdefgh'[-i] in u'abCDefGh': print i + +# special test for narrow builds + +high_uchar = u'\U00012345' +high_ustring0 = u'\U00012345\U00012346abc' +high_ustring1 = u'\U00012346\U00012345abc' +high_ustring_end = u'\U00012346abc\U00012344\U00012345' +high_ustring_no = u'\U00012346\U00012346abc' + +def uchar_in(Py_UCS4 uchar, unicode ustring): + """ + >>> uchar_in(high_uchar, high_ustring0) + True + >>> uchar_in(high_uchar, high_ustring1) + True + >>> uchar_in(high_uchar, high_ustring_end) + True + >>> uchar_in(high_uchar, high_ustring_no) + False + """ + assert uchar == 0x12345, ('%X' % uchar) + return uchar in ustring -- 2.26.2