From 8f88561fca076f84ad8c7b1084fe4b1c6f2fea15 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 21 Mar 2010 20:06:06 +0100 Subject: [PATCH] optimise unicode.startswith() and unicode.endswith(), drop redundant bint->bool->bint coercions --- Cython/Compiler/Builtin.py | 3 +- Cython/Compiler/Optimize.py | 89 ++++++++++++++++++++++++- tests/run/unicodemethods.pyx | 124 ++++++++++++++++++++++++++++++++++- 3 files changed, 211 insertions(+), 5 deletions(-) diff --git a/Cython/Compiler/Builtin.py b/Cython/Compiler/Builtin.py index a090ecfd..23f5ec71 100644 --- a/Cython/Compiler/Builtin.py +++ b/Cython/Compiler/Builtin.py @@ -414,7 +414,7 @@ def init_builtins(): init_builtin_types() init_builtin_structs() global list_type, tuple_type, dict_type, set_type, type_type - global bytes_type, str_type, unicode_type, float_type + global bytes_type, str_type, unicode_type, float_type, bool_type type_type = builtin_scope.lookup('type').type list_type = builtin_scope.lookup('list').type tuple_type = builtin_scope.lookup('tuple').type @@ -424,5 +424,6 @@ def init_builtins(): str_type = builtin_scope.lookup('str').type unicode_type = builtin_scope.lookup('unicode').type float_type = builtin_scope.lookup('float').type + bool_type = builtin_scope.lookup('bool').type init_builtins() diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py index 1627692f..a0a82ba8 100644 --- a/Cython/Compiler/Optimize.py +++ b/Cython/Compiler/Optimize.py @@ -937,6 +937,16 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): return node return node.arg + def visit_CoerceToBooleanNode(self, node): + """Drop redundant conversion nodes after tree changes. + """ + self.visitchildren(node) + arg = node.arg + if isinstance(arg, ExprNodes.CoerceToPyTypeNode): + if arg.type in (PyrexTypes.py_object_type, Builtin.bool_type): + return arg.arg.coerce_to_boolean(self.env_stack[-1]) + return node + def visit_CoerceFromPyTypeNode(self, node): """Drop redundant conversion nodes after tree changes. @@ -1437,8 +1447,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): if len(args) < 2: args.append(ExprNodes.BoolNode(node.pos, value=False)) else: - args[1] = args[1].coerce_to(PyrexTypes.c_bint_type, - self.env_stack[-1]) + args[1] = args[1].coerce_to_boolean(self.env_stack[-1]) return self._substitute_method_call( node, "PyUnicode_Splitlines", self.PyUnicode_Splitlines_func_type, @@ -1490,6 +1499,54 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): node, "PyUnicode_Split", self.PyUnicode_Split_func_type, 'split', is_unbound_method, args) + PyUnicode_Tailmatch_func_type = PyrexTypes.CFuncType( + PyrexTypes.c_bint_type, [ + PyrexTypes.CFuncTypeArg("str", Builtin.unicode_type, None), + PyrexTypes.CFuncTypeArg("substring", PyrexTypes.py_object_type, None), + PyrexTypes.CFuncTypeArg("start", PyrexTypes.c_py_ssize_t_type, None), + PyrexTypes.CFuncTypeArg("end", PyrexTypes.c_py_ssize_t_type, None), + PyrexTypes.CFuncTypeArg("direction", PyrexTypes.c_int_type, None), + ], + exception_value = '-1') + + def _handle_simple_method_unicode_endswith(self, node, args, is_unbound_method): + return self._inject_unicode_tailmatch( + node, args, is_unbound_method, 'endswith', +1) + + def _handle_simple_method_unicode_startswith(self, node, args, is_unbound_method): + return self._inject_unicode_tailmatch( + node, args, is_unbound_method, 'startswith', -1) + + def _inject_unicode_tailmatch(self, node, args, is_unbound_method, + method_name, direction): + """Replace unicode.startswith(...) and unicode.endswith(...) + by a direct call to the corresponding C-API function. + """ + if len(args) not in (2,3,4): + self._error_wrong_arg_count('unicode.%s' % method_name, node, args, "2-4") + return node + if len(args) < 3: + args.append(ExprNodes.IntNode( + node.pos, value="0", type=PyrexTypes.c_py_ssize_t_type)) + else: + args[2] = args[2].coerce_to(PyrexTypes.c_py_ssize_t_type, + self.env_stack[-1]) + if len(args) < 4: + args.append(ExprNodes.IntNode( + node.pos, value="PY_SSIZE_T_MAX", type=PyrexTypes.c_py_ssize_t_type)) + else: + args[3] = args[3].coerce_to(PyrexTypes.c_py_ssize_t_type, + self.env_stack[-1]) + args.append(ExprNodes.IntNode( + node.pos, value=str(direction), type=PyrexTypes.c_int_type)) + + method_call = self._substitute_method_call( + node, "__Pyx_PyUnicode_Tailmatch", self.PyUnicode_Tailmatch_func_type, + method_name, is_unbound_method, args, + utility_code = unicode_tailmatch_utility_code) + return ExprNodes.CoerceToPyTypeNode( + method_call, self.env_stack[-1], Builtin.bool_type) + PyUnicode_AsEncodedString_func_type = PyrexTypes.CFuncType( Builtin.bytes_type, [ PyrexTypes.CFuncTypeArg("obj", Builtin.unicode_type, None), @@ -1740,6 +1797,34 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): ) +unicode_tailmatch_utility_code = UtilityCode( + # Python's unicode.startswith() and unicode.endswith() support a + # tuple of prefixes/suffixes, whereas it's much more common to + # test for a single unicode string. +proto = ''' +static int __Pyx_PyUnicode_Tailmatch(PyObject* s, PyObject* substr, \ +Py_ssize_t start, Py_ssize_t end, int direction); +''', +impl = ''' +static int __Pyx_PyUnicode_Tailmatch(PyObject* s, PyObject* substr, + Py_ssize_t start, Py_ssize_t end, int direction) { + if (unlikely(PyTuple_Check(substr))) { + int result; + Py_ssize_t i; + for (i = 0; i < PyTuple_GET_SIZE(substr); i++) { + result = PyUnicode_Tailmatch(s, PyTuple_GET_ITEM(substr, i), + start, end, direction); + if (result) { + return result; + } + } + return 0; + } + return PyUnicode_Tailmatch(s, substr, start, end, direction); +} +''', +) + dict_getitem_default_utility_code = UtilityCode( proto = ''' static PyObject* __Pyx_PyDict_GetItemDefault(PyObject* d, PyObject* key, PyObject* default_value) { diff --git a/tests/run/unicodemethods.pyx b/tests/run/unicodemethods.pyx index 11ec5618..ec8cae0d 100644 --- a/tests/run/unicodemethods.pyx +++ b/tests/run/unicodemethods.pyx @@ -136,8 +136,7 @@ def splitlines_keep(unicode s, keep): return s.splitlines(keep) @cython.test_fail_if_path_exists( -# boolean conversion isn't currently smart enough for this ... -# "//CoerceToPyTypeNode", "//CoerceFromPyTypeNode", + "//CoerceToPyTypeNode", "//CoerceFromPyTypeNode", "//CastNode", "//TypecastNode") @cython.test_assert_path_exists( "//PythonCapiCallNode") @@ -208,3 +207,124 @@ def join_sep(l): ab|jd|sdflk|as|sa|sadas|asdas|fsdf """ return u'|'.join(l) + + +# unicode.startswith(s, prefix, [start, [end]]) + +@cython.test_fail_if_path_exists( + "//CoerceToPyTypeNode", + "//CoerceFromPyTypeNode", + "//CastNode", "//TypecastNode") +@cython.test_assert_path_exists( + "//PythonCapiCallNode") +def startswith(unicode s, sub): + """ + >>> text.startswith('ab ') + True + >>> startswith(text, 'ab ') + 'MATCH' + >>> text.startswith('ab X') + False + >>> startswith(text, 'ab X') + 'NO MATCH' + + >>> text.startswith(('ab', 'ab ')) + True + >>> startswith(text, ('ab', 'ab ')) + 'MATCH' + >>> text.startswith((' ab', 'ab X')) + False + >>> startswith(text, (' ab', 'ab X')) + 'NO MATCH' + """ + if s.startswith(sub): + return 'MATCH' + else: + return 'NO MATCH' + +@cython.test_fail_if_path_exists( + "//CoerceToPyTypeNode", + "//CastNode", "//TypecastNode") +@cython.test_assert_path_exists( + "//CoerceFromPyTypeNode", + "//PythonCapiCallNode") +def startswith_start_end(unicode s, sub, start, end): + """ + >>> text.startswith('b ', 1, 5) + True + >>> startswith_start_end(text, 'b ', 1, 5) + 'MATCH' + >>> text.startswith('b X', 1, 5) + False + >>> startswith_start_end(text, 'b X', 1, 5) + 'NO MATCH' + """ + if s.startswith(sub, start, end): + return 'MATCH' + else: + return 'NO MATCH' + + +# unicode.endswith(s, prefix, [start, [end]]) + +@cython.test_fail_if_path_exists( + "//CoerceToPyTypeNode", + "//CoerceFromPyTypeNode", + "//CastNode", "//TypecastNode") +@cython.test_assert_path_exists( + "//PythonCapiCallNode") +def endswith(unicode s, sub): + """ + >>> text.endswith('fsdf ') + True + >>> endswith(text, 'fsdf ') + 'MATCH' + >>> text.endswith('fsdf X') + False + >>> endswith(text, 'fsdf X') + 'NO MATCH' + + >>> text.endswith(('fsdf', 'fsdf ')) + True + >>> endswith(text, ('fsdf', 'fsdf ')) + 'MATCH' + >>> text.endswith(('fsdf', 'fsdf X')) + False + >>> endswith(text, ('fsdf', 'fsdf X')) + 'NO MATCH' + """ + if s.endswith(sub): + return 'MATCH' + else: + return 'NO MATCH' + +@cython.test_fail_if_path_exists( + "//CoerceToPyTypeNode", + "//CastNode", "//TypecastNode") +@cython.test_assert_path_exists( + "//CoerceFromPyTypeNode", + "//PythonCapiCallNode") +def endswith_start_end(unicode s, sub, start, end): + """ + >>> text.endswith('fsdf', 10, len(text)-1) + True + >>> endswith_start_end(text, 'fsdf', 10, len(text)-1) + 'MATCH' + >>> text.endswith('fsdf ', 10, len(text)-1) + False + >>> endswith_start_end(text, 'fsdf ', 10, len(text)-1) + 'NO MATCH' + + >>> text.endswith(('fsd', 'fsdf'), 10, len(text)-1) + True + >>> endswith_start_end(text, ('fsd', 'fsdf'), 10, len(text)-1) + 'MATCH' + >>> text.endswith(('fsdf ', 'fsdf X'), 10, len(text)-1) + False + >>> endswith_start_end(text, ('fsdf ', 'fsdf X'), 10, len(text)-1) + 'NO MATCH' + """ + if s.endswith(sub, start, end): + return 'MATCH' + else: + return 'NO MATCH' -- 2.26.2