optimise unicode.startswith() and unicode.endswith(), drop redundant bint->bool-...
authorStefan Behnel <scoder@users.berlios.de>
Sun, 21 Mar 2010 19:06:06 +0000 (20:06 +0100)
committerStefan Behnel <scoder@users.berlios.de>
Sun, 21 Mar 2010 19:06:06 +0000 (20:06 +0100)
Cython/Compiler/Builtin.py
Cython/Compiler/Optimize.py
tests/run/unicodemethods.pyx

index a090ecfd2e100eb0d24b3eba58481aa0444e2ed7..23f5ec71f3672ef383d590ec41081a2faf35013c 100644 (file)
@@ -414,7 +414,7 @@ def init_builtins():
     init_builtin_types()
     init_builtin_structs()
     global list_type, tuple_type, dict_type, set_type, type_type
-    global bytes_type, str_type, unicode_type, float_type
+    global bytes_type, str_type, unicode_type, float_type, bool_type
     type_type  = builtin_scope.lookup('type').type
     list_type  = builtin_scope.lookup('list').type
     tuple_type = builtin_scope.lookup('tuple').type
@@ -424,5 +424,6 @@ def init_builtins():
     str_type   = builtin_scope.lookup('str').type
     unicode_type = builtin_scope.lookup('unicode').type
     float_type = builtin_scope.lookup('float').type
+    bool_type  = builtin_scope.lookup('bool').type
 
 init_builtins()
index 1627692fac1196f23592c81ab6616f08fe5184e6..a0a82ba8f7169e8366420d8ab4e7742b26117000 100644 (file)
@@ -937,6 +937,16 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             return node
         return node.arg
 
+    def visit_CoerceToBooleanNode(self, node):
+        """Drop redundant conversion nodes after tree changes.
+        """
+        self.visitchildren(node)
+        arg = node.arg
+        if isinstance(arg, ExprNodes.CoerceToPyTypeNode):
+            if arg.type in (PyrexTypes.py_object_type, Builtin.bool_type):
+                return arg.arg.coerce_to_boolean(self.env_stack[-1])
+        return node
+
     def visit_CoerceFromPyTypeNode(self, node):
         """Drop redundant conversion nodes after tree changes.
 
@@ -1437,8 +1447,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
         if len(args) < 2:
             args.append(ExprNodes.BoolNode(node.pos, value=False))
         else:
-            args[1] = args[1].coerce_to(PyrexTypes.c_bint_type,
-                                        self.env_stack[-1])
+            args[1] = args[1].coerce_to_boolean(self.env_stack[-1])
 
         return self._substitute_method_call(
             node, "PyUnicode_Splitlines", self.PyUnicode_Splitlines_func_type,
@@ -1490,6 +1499,54 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             node, "PyUnicode_Split", self.PyUnicode_Split_func_type,
             'split', is_unbound_method, args)
 
+    PyUnicode_Tailmatch_func_type = PyrexTypes.CFuncType(
+        PyrexTypes.c_bint_type, [
+            PyrexTypes.CFuncTypeArg("str", Builtin.unicode_type, None),
+            PyrexTypes.CFuncTypeArg("substring", PyrexTypes.py_object_type, None),
+            PyrexTypes.CFuncTypeArg("start", PyrexTypes.c_py_ssize_t_type, None),
+            PyrexTypes.CFuncTypeArg("end", PyrexTypes.c_py_ssize_t_type, None),
+            PyrexTypes.CFuncTypeArg("direction", PyrexTypes.c_int_type, None),
+            ],
+        exception_value = '-1')
+
+    def _handle_simple_method_unicode_endswith(self, node, args, is_unbound_method):
+        return self._inject_unicode_tailmatch(
+            node, args, is_unbound_method, 'endswith', +1)
+
+    def _handle_simple_method_unicode_startswith(self, node, args, is_unbound_method):
+        return self._inject_unicode_tailmatch(
+            node, args, is_unbound_method, 'startswith', -1)
+
+    def _inject_unicode_tailmatch(self, node, args, is_unbound_method,
+                                  method_name, direction):
+        """Replace unicode.startswith(...) and unicode.endswith(...)
+        by a direct call to the corresponding C-API function.
+        """
+        if len(args) not in (2,3,4):
+            self._error_wrong_arg_count('unicode.%s' % method_name, node, args, "2-4")
+            return node
+        if len(args) < 3:
+            args.append(ExprNodes.IntNode(
+                node.pos, value="0", type=PyrexTypes.c_py_ssize_t_type))
+        else:
+            args[2] = args[2].coerce_to(PyrexTypes.c_py_ssize_t_type,
+                                        self.env_stack[-1])
+        if len(args) < 4:
+            args.append(ExprNodes.IntNode(
+                node.pos, value="PY_SSIZE_T_MAX", type=PyrexTypes.c_py_ssize_t_type))
+        else:
+            args[3] = args[3].coerce_to(PyrexTypes.c_py_ssize_t_type,
+                                        self.env_stack[-1])
+        args.append(ExprNodes.IntNode(
+            node.pos, value=str(direction), type=PyrexTypes.c_int_type))
+
+        method_call = self._substitute_method_call(
+            node, "__Pyx_PyUnicode_Tailmatch", self.PyUnicode_Tailmatch_func_type,
+            method_name, is_unbound_method, args,
+            utility_code = unicode_tailmatch_utility_code)
+        return ExprNodes.CoerceToPyTypeNode(
+            method_call, self.env_stack[-1], Builtin.bool_type)
+
     PyUnicode_AsEncodedString_func_type = PyrexTypes.CFuncType(
         Builtin.bytes_type, [
             PyrexTypes.CFuncTypeArg("obj", Builtin.unicode_type, None),
@@ -1740,6 +1797,34 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             )
 
 
+unicode_tailmatch_utility_code = UtilityCode(
+    # Python's unicode.startswith() and unicode.endswith() support a
+    # tuple of prefixes/suffixes, whereas it's much more common to
+    # test for a single unicode string.
+proto = '''
+static int __Pyx_PyUnicode_Tailmatch(PyObject* s, PyObject* substr, \
+Py_ssize_t start, Py_ssize_t end, int direction);
+''',
+impl = '''
+static int __Pyx_PyUnicode_Tailmatch(PyObject* s, PyObject* substr,
+                                     Py_ssize_t start, Py_ssize_t end, int direction) {
+    if (unlikely(PyTuple_Check(substr))) {
+        int result;
+        Py_ssize_t i;
+        for (i = 0; i < PyTuple_GET_SIZE(substr); i++) {
+            result = PyUnicode_Tailmatch(s, PyTuple_GET_ITEM(substr, i),
+                                         start, end, direction);
+            if (result) {
+                return result;
+            }
+        }
+        return 0;
+    }
+    return PyUnicode_Tailmatch(s, substr, start, end, direction);
+}
+''',
+)
+
 dict_getitem_default_utility_code = UtilityCode(
 proto = '''
 static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObject* default_value) {
index 11ec5618ea10c47ea2354f9e96052792b64f9306..ec8cae0dd2a3f6291ad8d7416123c4c2b5d428f0 100644 (file)
@@ -136,8 +136,7 @@ def splitlines_keep(unicode s, keep):
     return s.splitlines(keep)
 
 @cython.test_fail_if_path_exists(
-# boolean conversion isn't currently smart enough for this ...
-#    "//CoerceToPyTypeNode", "//CoerceFromPyTypeNode",
+    "//CoerceToPyTypeNode", "//CoerceFromPyTypeNode",
     "//CastNode", "//TypecastNode")
 @cython.test_assert_path_exists(
     "//PythonCapiCallNode")
@@ -208,3 +207,124 @@ def join_sep(l):
     ab|jd|sdflk|as|sa|sadas|asdas|fsdf
     """
     return u'|'.join(l)
+
+
+# unicode.startswith(s, prefix, [start, [end]])
+
+@cython.test_fail_if_path_exists(
+    "//CoerceToPyTypeNode",
+    "//CoerceFromPyTypeNode",
+    "//CastNode", "//TypecastNode")
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+def startswith(unicode s, sub):
+    """
+    >>> text.startswith('ab ')
+    True
+    >>> startswith(text, 'ab ')
+    'MATCH'
+    >>> text.startswith('ab X')
+    False
+    >>> startswith(text, 'ab X')
+    'NO MATCH'
+
+    >>> text.startswith(('ab', 'ab '))
+    True
+    >>> startswith(text, ('ab', 'ab '))
+    'MATCH'
+    >>> text.startswith((' ab', 'ab X'))
+    False
+    >>> startswith(text, (' ab', 'ab X'))
+    'NO MATCH'
+    """
+    if s.startswith(sub):
+        return 'MATCH'
+    else:
+        return 'NO MATCH'
+
+@cython.test_fail_if_path_exists(
+    "//CoerceToPyTypeNode",
+    "//CastNode", "//TypecastNode")
+@cython.test_assert_path_exists(
+    "//CoerceFromPyTypeNode",
+    "//PythonCapiCallNode")
+def startswith_start_end(unicode s, sub, start, end):
+    """
+    >>> text.startswith('b ', 1, 5)
+    True
+    >>> startswith_start_end(text, 'b ', 1, 5)
+    'MATCH'
+    >>> text.startswith('b X', 1, 5)
+    False
+    >>> startswith_start_end(text, 'b X', 1, 5)
+    'NO MATCH'
+    """
+    if s.startswith(sub, start, end):
+        return 'MATCH'
+    else:
+        return 'NO MATCH'
+
+
+# unicode.endswith(s, prefix, [start, [end]])
+
+@cython.test_fail_if_path_exists(
+    "//CoerceToPyTypeNode",
+    "//CoerceFromPyTypeNode",
+    "//CastNode", "//TypecastNode")
+@cython.test_assert_path_exists(
+    "//PythonCapiCallNode")
+def endswith(unicode s, sub):
+    """
+    >>> text.endswith('fsdf ')
+    True
+    >>> endswith(text, 'fsdf ')
+    'MATCH'
+    >>> text.endswith('fsdf X')
+    False
+    >>> endswith(text, 'fsdf X')
+    'NO MATCH'
+
+    >>> text.endswith(('fsdf', 'fsdf '))
+    True
+    >>> endswith(text, ('fsdf', 'fsdf '))
+    'MATCH'
+    >>> text.endswith(('fsdf', 'fsdf X'))
+    False
+    >>> endswith(text, ('fsdf', 'fsdf X'))
+    'NO MATCH'
+    """
+    if s.endswith(sub):
+        return 'MATCH'
+    else:
+        return 'NO MATCH'
+
+@cython.test_fail_if_path_exists(
+    "//CoerceToPyTypeNode",
+    "//CastNode", "//TypecastNode")
+@cython.test_assert_path_exists(
+    "//CoerceFromPyTypeNode",
+    "//PythonCapiCallNode")
+def endswith_start_end(unicode s, sub, start, end):
+    """
+    >>> text.endswith('fsdf', 10, len(text)-1)
+    True
+    >>> endswith_start_end(text, 'fsdf', 10, len(text)-1)
+    'MATCH'
+    >>> text.endswith('fsdf ', 10, len(text)-1)
+    False
+    >>> endswith_start_end(text, 'fsdf ', 10, len(text)-1)
+    'NO MATCH'
+
+    >>> text.endswith(('fsd', 'fsdf'), 10, len(text)-1)
+    True
+    >>> endswith_start_end(text, ('fsd', 'fsdf'), 10, len(text)-1)
+    'MATCH'
+    >>> text.endswith(('fsdf ', 'fsdf X'), 10, len(text)-1)
+    False
+    >>> endswith_start_end(text, ('fsdf ', 'fsdf X'), 10, len(text)-1)
+    'NO MATCH'
+    """
+    if s.endswith(sub, start, end):
+        return 'MATCH'
+    else:
+        return 'NO MATCH'