optimise predicates and methods for single unicode characters
author Stefan Behnel <scoder@users.berlios.de>
Sun, 16 May 2010 08:09:33 +0000 (10:09 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Sun, 16 May 2010 08:09:33 +0000 (10:09 +0200)
Cython/Compiler/ExprNodes.py
Cython/Compiler/Optimize.py
tests/run/py_unicode_type.pyx

index b7228de058e907e48b5152e89f0feec7d3c59d94..3eac2bb39c705c0ec7cb5757762d4be3b0ed8057 100755 (executable)
@@ -6217,7 +6217,9 @@ class CoerceToPyTypeNode(CoercionNode):
         if type is not py_object_type:
             self.type = py_object_type
         elif arg.type.is_string:
-            self.type = Builtin.bytes_type
+            self.type = bytes_type
+        elif arg.type is PyrexTypes.c_py_unicode_type:
+            self.type = unicode_type
 
     gil_message = "Converting to Python object"
 
index 0216f54eb5da472f197e236708c23962f8848d8e..bfbc32c3f41a94522a5eab4b97cb2dab9e2c3d7e 100644 (file)
@@ -1736,6 +1736,71 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
 
     ### unicode type methods
 
+    # C signature used for the single-character predicate calls emitted by
+    # _inject_unicode_predicate(): one Py_UNICODE argument named "uchar",
+    # returning a C boolean (bint).
+    PyUnicode_uchar_predicate_func_type = PyrexTypes.CFuncType(
+        PyrexTypes.c_bint_type, [
+            PyrexTypes.CFuncTypeArg("uchar", PyrexTypes.c_py_unicode_type, None),
+            ])
+
+    def _inject_unicode_predicate(self, node, args, is_unbound_method):
+        """Replace a unicode.isXYZ() call on a single Py_UNICODE character
+        by a direct call to the corresponding C-level predicate.
+
+        The optimisation only applies to bound method calls with exactly
+        one argument (the object the method is called on) that is a
+        Py_UNICODE value coerced to a Python object.  In all other cases,
+        'node' is returned unchanged.
+
+        The C function name is derived from the method name, e.g.
+        isalpha() -> Py_UNICODE_ISALPHA().  If the original call node has
+        a Python object type, the C result is coerced back to a Python
+        object.
+        """
+        if is_unbound_method or len(args) != 1:
+            return node
+        ustring = args[0]
+        if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode):
+            return node
+        uchar = ustring.arg
+        if uchar.type is not PyrexTypes.c_py_unicode_type:
+            return node
+        method_name = node.function.attribute
+        if method_name == 'istitle':
+            # istitle() doesn't directly map to Py_UNICODE_ISTITLE() as it
+            # also accepts upper case characters, so use a helper function
+            utility_code = py_unicode_istitle_utility_code
+            function_name = '__Pyx_Py_UNICODE_ISTITLE'
+        else:
+            utility_code = None
+            function_name = 'Py_UNICODE_%s' % method_name.upper()
+        func_call = self._substitute_method_call(
+            node, function_name, self.PyUnicode_uchar_predicate_func_type,
+            method_name, is_unbound_method, [uchar],
+            utility_code = utility_code)
+        if node.type.is_pyobject:
+            # current_env is a method: call it to pass the actual scope,
+            # not the bound method object
+            func_call = func_call.coerce_to_pyobject(self.current_env())
+        return func_call
+
+    # All of these unicode predicate methods share the same handler, which
+    # derives the C function name from the called method's name.
+    # NOTE(review): the handlers are presumably looked up by their
+    # '_handle_simple_method_<type>_<method>' name; the dispatch code is
+    # outside this section.
+    _handle_simple_method_unicode_isalnum   = _inject_unicode_predicate
+    _handle_simple_method_unicode_isalpha   = _inject_unicode_predicate
+    _handle_simple_method_unicode_isdecimal = _inject_unicode_predicate
+    _handle_simple_method_unicode_isdigit   = _inject_unicode_predicate
+    _handle_simple_method_unicode_islower   = _inject_unicode_predicate
+    _handle_simple_method_unicode_isnumeric = _inject_unicode_predicate
+    _handle_simple_method_unicode_isspace   = _inject_unicode_predicate
+    _handle_simple_method_unicode_istitle   = _inject_unicode_predicate
+    _handle_simple_method_unicode_isupper   = _inject_unicode_predicate
+
+    # C signature used for the single-character case conversion calls
+    # emitted by _inject_unicode_character_conversion(): one Py_UNICODE
+    # argument named "uchar", returning a Py_UNICODE.
+    PyUnicode_uchar_conversion_func_type = PyrexTypes.CFuncType(
+        PyrexTypes.c_py_unicode_type, [
+            PyrexTypes.CFuncTypeArg("uchar", PyrexTypes.c_py_unicode_type, None),
+            ])
+
+    def _inject_unicode_character_conversion(self, node, args, is_unbound_method):
+        """Replace a unicode.lower()/upper()/title() call on a single
+        Py_UNICODE character by a direct call to the corresponding C-level
+        conversion.
+
+        The optimisation only applies to bound method calls with exactly
+        one argument (the object the method is called on) that is a
+        Py_UNICODE value coerced to a Python object.  In all other cases,
+        'node' is returned unchanged.
+
+        The C function name is derived from the method name, e.g.
+        lower() -> Py_UNICODE_TOLOWER().  If the original call node has a
+        Python object type, the C result is coerced back to a Python
+        object.
+        """
+        if is_unbound_method or len(args) != 1:
+            return node
+        ustring = args[0]
+        if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode):
+            return node
+        uchar = ustring.arg
+        if uchar.type is not PyrexTypes.c_py_unicode_type:
+            return node
+        method_name = node.function.attribute
+        function_name = 'Py_UNICODE_TO%s' % method_name.upper()
+        func_call = self._substitute_method_call(
+            node, function_name, self.PyUnicode_uchar_conversion_func_type,
+            method_name, is_unbound_method, [uchar])
+        if node.type.is_pyobject:
+            # current_env is a method: call it to pass the actual scope,
+            # not the bound method object
+            func_call = func_call.coerce_to_pyobject(self.current_env())
+        return func_call
+
+    # The case conversion methods share one handler, which derives the C
+    # function name ('Py_UNICODE_TO' + upper-cased method name) from the
+    # called method's name.
+    _handle_simple_method_unicode_lower = _inject_unicode_character_conversion
+    _handle_simple_method_unicode_upper = _inject_unicode_character_conversion
+    _handle_simple_method_unicode_title = _inject_unicode_character_conversion
+
     PyUnicode_Splitlines_func_type = PyrexTypes.CFuncType(
         Builtin.list_type, [
             PyrexTypes.CFuncTypeArg("str", Builtin.unicode_type, None),
@@ -2196,6 +2261,18 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
             args[arg_index] = args[arg_index].coerce_to_boolean(self.current_env())
 
 
+py_unicode_istitle_utility_code = UtilityCode(
+# Py_UNICODE_ISTITLE() doesn't match unicode.istitle(), as the latter
+# additionally accepts characters that comply with Py_UNICODE_ISUPPER()
+proto = '''
+static CYTHON_INLINE int __Pyx_Py_UNICODE_ISTITLE(Py_UNICODE uchar); /* proto */
+''',
+impl = '''
+static CYTHON_INLINE int __Pyx_Py_UNICODE_ISTITLE(Py_UNICODE uchar) {
+    return Py_UNICODE_ISTITLE(uchar) || Py_UNICODE_ISUPPER(uchar);
+}
+''')
+
 unicode_tailmatch_utility_code = UtilityCode(
     # Python's unicode.startswith() and unicode.endswith() support a
     # tuple of prefixes/suffixes, whereas it's much more common to
index 6655dd865918afc9f1b8a28af7c3f3bc72ce152d..5a93cc73c1c8dd18c024994b31ccd0f16f82a5f0 100644 (file)
@@ -77,3 +77,45 @@ def unicode_ordinal(Py_UNICODE i):
     ValueError: only single character unicode strings can be converted to Py_UNICODE, got length 2
     """
     return i
+
+# Calls each unicode predicate method on a single Py_UNICODE character and
+# returns the results as a list, in the order listed below.  The tree
+# assertions require that a PythonCapiCallNode exists and that no
+# SimpleCallNode is left in the compiled function.
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_type_methods(Py_UNICODE uchar):
+    """
+    >>> unicode_type_methods(ord('A'))
+    [True, True, False, False, False, False, False, True, True]
+    >>> unicode_type_methods(ord('a'))
+    [True, True, False, False, True, False, False, False, False]
+    >>> unicode_type_methods(ord('8'))
+    [True, False, True, True, False, True, False, False, False]
+    >>> unicode_type_methods(ord('\\t'))
+    [False, False, False, False, False, False, True, False, False]
+    """
+    return [
+        # character types
+        uchar.isalnum(),
+        uchar.isalpha(),
+        uchar.isdecimal(),
+        uchar.isdigit(),
+        uchar.islower(),
+        uchar.isnumeric(),
+        uchar.isspace(),
+        uchar.istitle(),
+        uchar.isupper(),
+        ]
+
+# Calls the case conversion methods on a single Py_UNICODE character and
+# returns the results as a list, in the order listed below.  The tree
+# assertions require that a PythonCapiCallNode exists and that no
+# SimpleCallNode is left in the compiled function.
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_methods(Py_UNICODE uchar):
+    """
+    >>> unicode_methods(ord('A')) == ['a', 'A', 'A']
+    True
+    >>> unicode_methods(ord('a')) == ['a', 'A', 'A']
+    True
+    """
+    return [
+        # character conversion
+        uchar.lower(),
+        uchar.upper(),
+        uchar.title(),
+        ]