From 9cb1c32fb718554d37ae13a862afe4dbd517a78c Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 16 May 2010 10:09:33 +0200 Subject: [PATCH] optimise predicates and methods for single unicode characters --- Cython/Compiler/ExprNodes.py | 4 +- Cython/Compiler/Optimize.py | 77 +++++++++++++++++++++++++++++++++++ tests/run/py_unicode_type.pyx | 42 +++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index b7228de0..3eac2bb3 100755 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -6217,7 +6217,9 @@ class CoerceToPyTypeNode(CoercionNode): if type is not py_object_type: self.type = py_object_type elif arg.type.is_string: - self.type = Builtin.bytes_type + self.type = bytes_type + elif arg.type is PyrexTypes.c_py_unicode_type: + self.type = unicode_type gil_message = "Converting to Python object" diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py index 0216f54e..bfbc32c3 100644 --- a/Cython/Compiler/Optimize.py +++ b/Cython/Compiler/Optimize.py @@ -1736,6 +1736,71 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): ### unicode type methods + PyUnicode_uchar_predicate_func_type = PyrexTypes.CFuncType( + PyrexTypes.c_bint_type, [ + PyrexTypes.CFuncTypeArg("uchar", PyrexTypes.c_py_unicode_type, None), + ]) + + def _inject_unicode_predicate(self, node, args, is_unbound_method): + if is_unbound_method or len(args) != 1: + return node + ustring = args[0] + if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \ + ustring.arg.type is not PyrexTypes.c_py_unicode_type: + return node + uchar = ustring.arg + method_name = node.function.attribute + if method_name == 'istitle': + # istitle() doesn't directly map to Py_UNICODE_ISTITLE() + utility_code = py_unicode_istitle_utility_code + function_name = '__Pyx_Py_UNICODE_ISTITLE' + else: + utility_code = None + function_name = 'Py_UNICODE_%s' % method_name.upper() + func_call = 
self._substitute_method_call( + node, function_name, self.PyUnicode_uchar_predicate_func_type, + method_name, is_unbound_method, [uchar], + utility_code = utility_code) + if node.type.is_pyobject: + func_call = func_call.coerce_to_pyobject(self.current_env()) + return func_call + + _handle_simple_method_unicode_isalnum = _inject_unicode_predicate + _handle_simple_method_unicode_isalpha = _inject_unicode_predicate + _handle_simple_method_unicode_isdecimal = _inject_unicode_predicate + _handle_simple_method_unicode_isdigit = _inject_unicode_predicate + _handle_simple_method_unicode_islower = _inject_unicode_predicate + _handle_simple_method_unicode_isnumeric = _inject_unicode_predicate + _handle_simple_method_unicode_isspace = _inject_unicode_predicate + _handle_simple_method_unicode_istitle = _inject_unicode_predicate + _handle_simple_method_unicode_isupper = _inject_unicode_predicate + + PyUnicode_uchar_conversion_func_type = PyrexTypes.CFuncType( + PyrexTypes.c_py_unicode_type, [ + PyrexTypes.CFuncTypeArg("uchar", PyrexTypes.c_py_unicode_type, None), + ]) + + def _inject_unicode_character_conversion(self, node, args, is_unbound_method): + if is_unbound_method or len(args) != 1: + return node + ustring = args[0] + if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \ + ustring.arg.type is not PyrexTypes.c_py_unicode_type: + return node + uchar = ustring.arg + method_name = node.function.attribute + function_name = 'Py_UNICODE_TO%s' % method_name.upper() + func_call = self._substitute_method_call( + node, function_name, self.PyUnicode_uchar_conversion_func_type, + method_name, is_unbound_method, [uchar]) + if node.type.is_pyobject: + func_call = func_call.coerce_to_pyobject(self.current_env()) + return func_call + + _handle_simple_method_unicode_lower = _inject_unicode_character_conversion + _handle_simple_method_unicode_upper = _inject_unicode_character_conversion + _handle_simple_method_unicode_title = _inject_unicode_character_conversion + 
PyUnicode_Splitlines_func_type = PyrexTypes.CFuncType( Builtin.list_type, [ PyrexTypes.CFuncTypeArg("str", Builtin.unicode_type, None), @@ -2196,6 +2261,18 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): args[arg_index] = args[arg_index].coerce_to_boolean(self.current_env()) +py_unicode_istitle_utility_code = UtilityCode( +# Py_UNICODE_ISTITLE() doesn't match unicode.istitle() as the latter +# additionally allows characters that comply with Py_UNICODE_ISUPPER() +proto = ''' +static CYTHON_INLINE int __Pyx_Py_UNICODE_ISTITLE(Py_UNICODE uchar); /* proto */ +''', +impl = ''' +static CYTHON_INLINE int __Pyx_Py_UNICODE_ISTITLE(Py_UNICODE uchar) { + return Py_UNICODE_ISTITLE(uchar) || Py_UNICODE_ISUPPER(uchar); +} +''') + unicode_tailmatch_utility_code = UtilityCode( # Python's unicode.startswith() and unicode.endswith() support a # tuple of prefixes/suffixes, whereas it's much more common to diff --git a/tests/run/py_unicode_type.pyx b/tests/run/py_unicode_type.pyx index 6655dd86..5a93cc73 100644 --- a/tests/run/py_unicode_type.pyx +++ b/tests/run/py_unicode_type.pyx @@ -77,3 +77,45 @@ def unicode_ordinal(Py_UNICODE i): ValueError: only single character unicode strings can be converted to Py_UNICODE, got length 2 """ return i + +@cython.test_assert_path_exists('//PythonCapiCallNode') +@cython.test_fail_if_path_exists('//SimpleCallNode') +def unicode_type_methods(Py_UNICODE uchar): + """ + >>> unicode_type_methods(ord('A')) + [True, True, False, False, False, False, False, True, True] + >>> unicode_type_methods(ord('a')) + [True, True, False, False, True, False, False, False, False] + >>> unicode_type_methods(ord('8')) + [True, False, True, True, False, True, False, False, False] + >>> unicode_type_methods(ord('\\t')) + [False, False, False, False, False, False, True, False, False] + """ + return [ + # character types + uchar.isalnum(), + uchar.isalpha(), + uchar.isdecimal(), + uchar.isdigit(), + uchar.islower(), + uchar.isnumeric(), + uchar.isspace(), + 
uchar.istitle(), + uchar.isupper(), + ] + +@cython.test_assert_path_exists('//PythonCapiCallNode') +@cython.test_fail_if_path_exists('//SimpleCallNode') +def unicode_methods(Py_UNICODE uchar): + """ + >>> unicode_methods(ord('A')) == ['a', 'A', 'A'] + True + >>> unicode_methods(ord('a')) == ['a', 'A', 'A'] + True + """ + return [ + # character conversion + uchar.lower(), + uchar.upper(), + uchar.title(), + ] -- 2.26.2