From dc320e05583fb88ed719560983de56f230ac07ba Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 19 Apr 2010 09:50:19 +0200 Subject: [PATCH] native support for Py_UNICODE, coercion between 1-character unicode literals and Py_UNICODE, fix C iteration over unicode strings by using Py_UNICODE* --- Cython/Compiler/ExprNodes.py | 20 ++++++-- Cython/Compiler/Optimize.py | 2 +- Cython/Compiler/Parsing.py | 1 + Cython/Compiler/PyrexTypes.py | 67 ++++++++++++++++--------- Cython/Shadow.py | 4 +- tests/errors/e_strcoerce.pyx | 14 +++--- tests/errors/py_unicode_type_errors.pyx | 24 +++++++++ tests/errors/string_assignments.pyx | 2 +- tests/run/for_in_string.pyx | 14 +++--- tests/run/py_unicode_type.pyx | 44 ++++++++++++++++ 10 files changed, 147 insertions(+), 45 deletions(-) create mode 100644 tests/errors/py_unicode_type_errors.pyx create mode 100644 tests/run/py_unicode_type.pyx diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index dda284c7..c8d733ca 100755 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -860,7 +860,10 @@ class BytesNode(ConstNode): def coerce_to(self, dst_type, env): if dst_type.is_int: if not self.can_coerce_to_char_literal(): - error(self.pos, "Only single-character strings can be coerced into ints.") + error(self.pos, "Only single-character string literals can be coerced into ints.") + return self + if dst_type is PyrexTypes.c_py_unicode_type: + error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.") return self return CharNode(self.pos, value=self.value) @@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode): def coerce_to(self, dst_type, env): if dst_type is self.type: pass + elif dst_type is PyrexTypes.c_py_unicode_type: + if not self.can_coerce_to_char_literal(): + error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.") + return self + int_value = ord(self.value) + return IntNode(self.pos, value=int_value, 
constant_result=int_value) elif not dst_type.is_pyobject: - error(self.pos, "Unicode objects do not support coercion to C types.") + error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.") elif dst_type is not py_object_type: if not self.check_for_coercion_error(dst_type): self.fail_assignment(dst_type) return self + def can_coerce_to_char_literal(self): + return len(self.value) == 1 + def generate_evaluation_code(self, code): self.result_code = code.get_py_string_const(self.value) @@ -5426,10 +5438,10 @@ class CmpNode(object): type1_can_be_int = False type2_can_be_int = False - if isinstance(operand1, (StringNode, BytesNode)) \ + if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \ and operand1.can_coerce_to_char_literal(): type1_can_be_int = True - if isinstance(operand2, (StringNode, BytesNode)) \ + if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \ and operand2.can_coerce_to_char_literal(): type2_can_be_int = True diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py index e68f7bfc..4739f363 100644 --- a/Cython/Compiler/Optimize.py +++ b/Cython/Compiler/Optimize.py @@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform): return node PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType( - PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE* + PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [ PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None) ]) diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 5966cc42..3397d771 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint") special_basic_c_types = { # name : (signed, longness) + "Py_UNICODE" : (0, 0), "Py_ssize_t" : (2, 0), "size_t" : (0, 0), } diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py index 3126b333..101249a5 
100755 --- a/Cython/Compiler/PyrexTypes.py +++ b/Cython/Compiler/PyrexTypes.py @@ -863,6 +863,20 @@ class CAnonEnumType(CIntType): return 'int' +class CPyUnicodeIntType(CIntType): + # Py_UNICODE + + # Conversion from a unicode string to Py_UNICODE at runtime is not + # currently supported and may never be - we only convert from and + # to integers here. The maximum value for a Py_UNICODE is + # 1114111, so PyInt_FromLong() will do just fine here. + + to_py_function = "PyInt_FromLong" + + def sign_and_name(self): + return "Py_UNICODE" + + class CPySSizeTType(CIntType): to_py_function = "PyInt_FromSsize_t" @@ -2075,14 +2089,15 @@ class ErrorType(PyrexType): rank_to_type_name = ( "char", # 0 "short", # 1 - "int", # 2 - "long", # 3 - "Py_ssize_t", # 4 - "size_t", # 5 - "PY_LONG_LONG", # 6 - "float", # 7 - "double", # 8 - "long double", # 9 + "Py_UNICODE", # 2 + "int", # 3 + "long", # 4 + "Py_ssize_t", # 5 + "size_t", # 6 + "PY_LONG_LONG", # 7 + "float", # 8 + "double", # 9 + "long double", # 10 ) py_object_type = PyObjectType() @@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type) c_uchar_type = CIntType(0, 0) c_ushort_type = CIntType(1, 0) -c_uint_type = CIntType(2, 0) -c_ulong_type = CIntType(3, 0) -c_ulonglong_type = CIntType(6, 0) +c_py_unicode_type = CPyUnicodeIntType(2, 0) +c_uint_type = CIntType(3, 0) +c_ulong_type = CIntType(4, 0) +c_ulonglong_type = CIntType(7, 0) c_char_type = CIntType(0, 1) c_short_type = CIntType(1, 1) -c_int_type = CIntType(2, 1) -c_long_type = CIntType(3, 1) -c_longlong_type = CIntType(6, 1) +c_int_type = CIntType(3, 1) +c_long_type = CIntType(4, 1) +c_longlong_type = CIntType(7, 1) c_schar_type = CIntType(0, 2) c_sshort_type = CIntType(1, 2) -c_sint_type = CIntType(2, 2) -c_slong_type = CIntType(3, 2) -c_slonglong_type = CIntType(6, 2) +c_sint_type = CIntType(3, 2) +c_slong_type = CIntType(4, 2) +c_slonglong_type = CIntType(7, 2) -c_bint_type = CBIntType(2, 1) -c_py_ssize_t_type = CPySSizeTType(4, 2) -c_size_t_type = 
CSizeTType(5, 0) +c_bint_type = CBIntType(3, 1) +c_py_ssize_t_type = CPySSizeTType(5, 2) +c_size_t_type = CSizeTType(6, 0) -c_float_type = CFloatType(7, math_h_modifier='f') -c_double_type = CFloatType(8) -c_longdouble_type = CFloatType(9, math_h_modifier='l') +c_float_type = CFloatType(8, math_h_modifier='f') +c_double_type = CFloatType(9) +c_longdouble_type = CFloatType(10, math_h_modifier='l') c_float_complex_type = CComplexType(c_float_type) c_double_complex_type = CComplexType(c_double_type) @@ -2131,7 +2147,7 @@ c_int_ptr_type = CPtrType(c_int_type) c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type) c_size_t_ptr_type = CPtrType(c_size_t_type) -c_returncode_type = CIntType(2, 1, is_returncode = 1) +c_returncode_type = CIntType(3, 1, is_returncode = 1) c_anon_enum_type = CAnonEnumType(-1, 1) # the Py_buffer type is defined in Builtin.py @@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = { (1, 0, "bint"): c_bint_type, (0, 0, "size_t") : c_size_t_type, (2, 0, "Py_ssize_t"): c_py_ssize_t_type, + (0, 0, "Py_UNICODE"): c_py_unicode_type, (1, 0, "float"): c_float_type, (1, 0, "double"): c_double_type, @@ -2383,6 +2400,8 @@ def parse_basic_type(name): signed = 2 elif name == 'size_t': signed = 0 + elif name == 'Py_UNICODE': + signed = 0 else: if name.startswith('u'): name = name[1:] diff --git a/Cython/Shadow.py b/Cython/Shadow.py index a48d8d3f..d7dd186d 100644 --- a/Cython/Shadow.py +++ b/Cython/Shadow.py @@ -174,7 +174,7 @@ except ImportError: # Predefined types -int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t'] +int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t'] float_types = ['longdouble', 'double', 'float'] complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex'] other_types = ['bint', 'void'] @@ -183,7 +183,7 @@ gs = globals() for name in int_types: gs[name] = typedef(py_int) - if not name.endswith('size_t'): + if name != 'Py_UNICODE' and not 
name.endswith('size_t'): gs['u'+name] = typedef(py_int) gs['s'+name] = typedef(py_int) diff --git a/tests/errors/e_strcoerce.pyx b/tests/errors/e_strcoerce.pyx index cda8dd57..8de7344f 100644 --- a/tests/errors/e_strcoerce.pyx +++ b/tests/errors/e_strcoerce.pyx @@ -4,12 +4,14 @@ cdef int cx = "test" # fails cdef int x1 = "\xFF" # works cdef int x2 = "\u0FFF" # fails -cdef int x3 = u"\xFF" # fails +cdef Py_UNICODE u1 = u"\xFF" # works +cdef int u3 = u"\xFF" # fails -_ERRORS = u""" -2:14: Only single-character strings can be coerced into ints. -3:14: Only single-character strings can be coerced into ints. -6:15: Only single-character strings can be coerced into ints. -7:14: Unicode objects do not support coercion to C types. + +_ERRORS = """ +2:14: Only single-character string literals can be coerced into ints. +3:14: Only single-character string literals can be coerced into ints. +6:15: Only single-character string literals can be coerced into ints. +9:14: Unicode literals do not support coercion to C types other than Py_UNICODE. """ diff --git a/tests/errors/py_unicode_type_errors.pyx b/tests/errors/py_unicode_type_errors.pyx new file mode 100644 index 00000000..1b4e10e3 --- /dev/null +++ b/tests/errors/py_unicode_type_errors.pyx @@ -0,0 +1,24 @@ +# -*- coding: iso-8859-1 -*- + +cdef Py_UNICODE char_ASCII = u'A' +cdef Py_UNICODE char_KLINGON = u'\uF8D2' + +def char_too_long_ASCII(): + cdef Py_UNICODE c = u'AB' + +def char_too_long_Unicode(): + cdef Py_UNICODE c = u'A\uF8D2' + +def char_too_long_bytes(): + cdef Py_UNICODE c = b'AB' + +def char_too_long_latin1(): + cdef Py_UNICODE char_bytes_latin1 = b'ö' + + +_ERRORS = """ +7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. +10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. +13:24: Only single-character string literals can be coerced into ints. +16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead. 
+""" diff --git a/tests/errors/string_assignments.pyx b/tests/errors/string_assignments.pyx index 5a691035..1b5a1341 100644 --- a/tests/errors/string_assignments.pyx +++ b/tests/errors/string_assignments.pyx @@ -50,7 +50,7 @@ cdef list l_f2 = b1 cdef list l_f3 = u1 _ERRORS = u""" -25:20: Unicode objects do not support coercion to C types. +25:20: Unicode literals do not support coercion to C types other than Py_UNICODE. 26:22: Unicode objects do not support coercion to C types. 27:22: 'str' objects do not support coercion to C types (use 'bytes'?). diff --git a/tests/run/for_in_string.pyx b/tests/run/for_in_string.pyx index 9e920dfe..4851ab17 100644 --- a/tests/run/for_in_string.pyx +++ b/tests/run/for_in_string.pyx @@ -14,7 +14,7 @@ def for_in_bytes(bytes s): 'C' """ for c in s: - if c == 'C': + if c == b'C': return 'C' else: return 'X' @@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s): """ cdef char c for c in s: - if c == 'C': + if c == b'C': return 'C' else: return 'X' -def for_int_in_unicode(unicode s): +def for_pyunicode_in_unicode(unicode s): """ - >>> for_int_in_unicode(unicode_abc) + >>> for_pyunicode_in_unicode(unicode_abc) 'X' - >>> for_int_in_unicode(unicode_ABC) + >>> for_pyunicode_in_unicode(unicode_ABC) 'C' """ - cdef int c + cdef Py_UNICODE c for c in s: - if c == 'C': + if c == u'C': return 'C' else: return 'X' diff --git a/tests/run/py_unicode_type.pyx b/tests/run/py_unicode_type.pyx new file mode 100644 index 00000000..b1bb40e8 --- /dev/null +++ b/tests/run/py_unicode_type.pyx @@ -0,0 +1,44 @@ +# -*- coding: iso-8859-1 -*- + +cdef Py_UNICODE char_ASCII = u'A' +cdef Py_UNICODE char_KLINGON = u'\uF8D2' + + +def compare_ASCII(): + """ + >>> compare_ASCII() + True + False + False + """ + print(char_ASCII == u'A') + print(char_ASCII == u'B') + print(char_ASCII == u'\uF8D2') + + +def compare_KLINGON(): + """ + >>> compare_ASCII() + True + False + False + """ + print(char_KLINGON == u'\uF8D2') + print(char_KLINGON == u'A') + print(char_KLINGON == u'B') 
+ + +def index_literal(int i): + """ + >>> index_literal(0) == '1' + True + >>> index_literal(-5) == '1' + True + >>> index_literal(2) == '3' + True + >>> index_literal(4) == '5' + True + """ + # runtime casts are not currently supported + #return <Py_UNICODE>(u"12345"[i]) + return u"12345"[i] -- 2.26.2