support 'Py_UCS4 in unicode_string' also in 16-bit Unicode builds
author: Stefan Behnel <scoder@users.berlios.de>
Sat, 5 Feb 2011 16:24:44 +0000 (17:24 +0100)
committer: Stefan Behnel <scoder@users.berlios.de>
Sat, 5 Feb 2011 16:24:44 +0000 (17:24 +0100)
Cython/Compiler/ExprNodes.py
tests/run/py_ucs4_type.pyx

index 21c23b7ab8f90bbfc84ff7c2abddf54f6731beae..97298d9cf296e0f518a800b6a7e790c80b64c981 100755 (executable)
@@ -6662,11 +6662,43 @@ static CYTHON_INLINE int __Pyx_UnicodeContains(PyObject* unicode, Py_UNICODE cha
 """,
 impl="""
 static CYTHON_INLINE int __Pyx_UnicodeContains(PyObject* unicode, Py_UNICODE character) {
+    Py_UNICODE* pos;
     const Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
     Py_UNICODE* char_start = PyUnicode_AS_UNICODE(unicode);
+
+    for (pos=char_start; pos < char_start+length; pos++) {
+        if (unlikely(character == pos[0])) return 1;
+    }
+    return 0;
+}
+""")
+
+py_ucs4_in_unicode_utility_code = UtilityCode(
+proto="""
+static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 character); /*proto*/
+""",
+# additionally handles surrogate pairs in 16-bit Unicode builds
+impl="""
+static CYTHON_INLINE int __Pyx_UnicodeContainsUCS4(PyObject* unicode, Py_UCS4 character) {
     Py_UNICODE* pos;
+    Py_UNICODE uchar;
+    const Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+    Py_UNICODE* char_start = PyUnicode_AS_UNICODE(unicode);
+
+    #if Py_UNICODE_SIZE == 2
+    if (unlikely(character > 65535)) {
+        Py_UNICODE high_val, low_val;
+        high_val = (Py_UNICODE) (0xD800 | (((character - 0x10000) >> 10) & ((1<<10)-1)));
+        low_val  = (Py_UNICODE) (0xDC00 | ( (character - 0x10000)        & ((1<<10)-1)));
+        for (pos=char_start; pos < char_start+length-1; pos++) {
+            if (unlikely(high_val == pos[0]) & unlikely(low_val == pos[1])) return 1;
+        }
+        return 0;
+    }
+    #endif
+    uchar = (Py_UNICODE) character;
     for (pos=char_start; pos < char_start+length; pos++) {
-        if (character == pos[0]) return 1;
+        if (unlikely(uchar == pos[0])) return 1;
     }
     return 0;
 }
@@ -6764,7 +6796,12 @@ class PrimaryCmpNode(ExprNode, CmpNode):
                     error(self.pos, "Cascading comparison not yet supported for 'int_val in string'.")
                     return
                 if self.operand2.type is unicode_type:
-                    env.use_utility_code(pyunicode_in_unicode_utility_code)
+                    self.uchar_test_type = PyrexTypes.widest_numeric_type(
+                        self.operand1.type, PyrexTypes.c_py_unicode_type)
+                    if self.uchar_test_type is PyrexTypes.c_py_unicode_type:
+                        env.use_utility_code(pyunicode_in_unicode_utility_code)
+                    else:
+                        env.use_utility_code(py_ucs4_in_unicode_utility_code)
                 else:
                     if self.operand1.type is PyrexTypes.c_uchar_type:
                         self.operand1 = self.operand1.coerce_to(PyrexTypes.c_char_type, env)
@@ -6854,10 +6891,13 @@ class PrimaryCmpNode(ExprNode, CmpNode):
                 self.operand1.result(),
                 self.operand2.result())
         elif self.is_c_string_contains():
-            if self.operand2.type is bytes_type:
-                method = "__Pyx_BytesContains"
+            if self.operand2.type is unicode_type:
+                if self.uchar_test_type is PyrexTypes.c_py_unicode_type:
+                    method = "__Pyx_UnicodeContains"
+                else:
+                    method = "__Pyx_UnicodeContainsUCS4"
             else:
-                method = "__Pyx_UnicodeContains"
+                method = "__Pyx_BytesContains"
             if self.operator == "not_in":
                 negation = "!"
             else:
index b5790dced8c667a09d7ab821751a185fc2f9224b..042537658f383f767dbee4eea987d4fda12e4574 100644 (file)
@@ -195,3 +195,25 @@ def index_and_in():
     for i in range(1,9):
         if u'abcdefgh'[-i] in u'abCDefGh':
             print i
+
+# special test for narrow (16-bit) Unicode builds, where non-BMP characters are stored as surrogate pairs
+
+high_uchar = u'\U00012345'
+high_ustring0 = u'\U00012345\U00012346abc'
+high_ustring1 = u'\U00012346\U00012345abc'
+high_ustring_end = u'\U00012346abc\U00012344\U00012345'
+high_ustring_no = u'\U00012346\U00012346abc'
+
+def uchar_in(Py_UCS4 uchar, unicode ustring):
+    """
+    >>> uchar_in(high_uchar, high_ustring0)
+    True
+    >>> uchar_in(high_uchar, high_ustring1)
+    True
+    >>> uchar_in(high_uchar, high_ustring_end)
+    True
+    >>> uchar_in(high_uchar, high_ustring_no)
+    False
+    """
+    assert uchar == 0x12345, ('%X' % uchar)
+    return uchar in ustring