From: Stefan Behnel Date: Sat, 29 Jan 2011 17:35:53 +0000 (+0100) Subject: implemented Py_UCS4 type X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=e68b3b57b9c8999df250e6bcaed538aa33ea0235;p=cython.git implemented Py_UCS4 type --HG-- rename : tests/errors/py_unicode_type_errors.pyx => tests/errors/py_ucs4_type_errors.pyx rename : tests/run/py_unicode_type.pyx => tests/run/py_ucs4_type.pyx --- diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index 6824d946..555588cf 100755 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -981,8 +981,8 @@ class BytesNode(ConstNode): if not self.can_coerce_to_char_literal(): error(self.pos, "Only single-character string literals can be coerced into ints.") return self - if dst_type is PyrexTypes.c_py_unicode_type: - error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.") + if dst_type.is_unicode_char: + error(self.pos, "Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.") return self return CharNode(self.pos, value=self.value) @@ -1033,17 +1033,17 @@ class UnicodeNode(PyConstNode): def coerce_to(self, dst_type, env): if dst_type is self.type: pass - elif dst_type is PyrexTypes.c_py_unicode_type: + elif dst_type.is_unicode_char: if not self.can_coerce_to_char_literal(): - error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.") + error(self.pos, "Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.") return self int_value = ord(self.value) - return IntNode(self.pos, value=int_value, constant_result=int_value) + return IntNode(self.pos, type=dst_type, value=str(int_value), constant_result=int_value) elif not dst_type.is_pyobject: if dst_type.is_string and self.bytes_value is not None: # special case: '-3' enforced unicode literal used in a C char* context return BytesNode(self.pos, 
value=self.bytes_value).coerce_to(dst_type, env) - error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.") + error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.") elif dst_type is not py_object_type: if not self.check_for_coercion_error(dst_type): self.fail_assignment(dst_type) @@ -1051,6 +1051,9 @@ class UnicodeNode(PyConstNode): def can_coerce_to_char_literal(self): return len(self.value) == 1 + ## or (len(self.value) == 2 + ## and (0xD800 <= self.value[0] <= 0xDBFF) + ## and (0xDC00 <= self.value[1] <= 0xDFFF)) def contains_surrogates(self): # Check if the unicode string contains surrogate code points @@ -2165,8 +2168,8 @@ class IndexNode(ExprNode): elif not skip_child_analysis: self.index.analyse_types(env) self.original_index_type = self.index.type - if base_type is PyrexTypes.c_py_unicode_type: - # we infer Py_UNICODE for unicode strings in some + if base_type.is_unicode_char: + # we infer Py_UNICODE/Py_UCS4 for unicode strings in some # cases, but indexing must still work for them if self.index.constant_result in (0, -1): # FIXME: we know that this node is redundant - @@ -2188,7 +2191,7 @@ class IndexNode(ExprNode): self.index = self.index.coerce_to_pyobject(env) self.is_temp = 1 if self.index.type.is_int and base_type is unicode_type: - # Py_UNICODE will automatically coerce to a unicode string + # Py_UNICODE/Py_UCS4 will automatically coerce to a unicode string # if required, so this is fast and safe self.type = PyrexTypes.c_py_unicode_type elif is_slice and base_type in (bytes_type, str_type, unicode_type, list_type, tuple_type): @@ -2253,7 +2256,7 @@ class IndexNode(ExprNode): return "PyList_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result()) elif self.base.type is tuple_type: return "PyTuple_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result()) - elif self.base.type is unicode_type and self.type is PyrexTypes.c_py_unicode_type: + elif 
self.base.type is unicode_type and self.type.is_unicode_char: return "PyUnicode_AS_UNICODE(%s)[%s]" % (self.base.result(), self.index.result()) elif (self.type.is_ptr or self.type.is_array) and self.type == self.base.type: error(self.pos, "Invalid use of pointer slice") @@ -2332,7 +2335,7 @@ class IndexNode(ExprNode): self.result(), code.error_goto(self.pos))) code.put_gotref(self.py_result()) - elif self.type is PyrexTypes.c_py_unicode_type and self.base.type is unicode_type: + elif self.type.is_unicode_char and self.base.type is unicode_type: assert self.index.type.is_int index_code = self.index.result() function = "__Pyx_GetItemInt_Unicode" @@ -5845,8 +5848,8 @@ class NumBinopNode(BinopNode): self.operand2.result()) def is_py_operation_types(self, type1, type2): - return (type1 is PyrexTypes.c_py_unicode_type or - type2 is PyrexTypes.c_py_unicode_type or + return (type1.is_unicode_char or + type2.is_unicode_char or BinopNode.is_py_operation_types(self, type1, type2)) def py_operation_function(self): @@ -6503,7 +6506,7 @@ class CmpNode(object): return self.operator in ('in', 'not_in') and \ ((self.operand1.type.is_int and (self.operand2.type.is_string or self.operand2.type is bytes_type)) or - (self.operand1.type is PyrexTypes.c_py_unicode_type + (self.operand1.type.is_unicode_char and self.operand2.type is unicode_type)) def is_ptr_contains(self): @@ -7166,7 +7169,7 @@ class CoerceToPyTypeNode(CoercionNode): # be specific about some known types if arg.type.is_string: self.type = bytes_type - elif arg.type is PyrexTypes.c_py_unicode_type: + elif arg.type.is_unicode_char: self.type = unicode_type elif arg.type.is_complex: self.type = Builtin.complex_type diff --git a/Cython/Compiler/Optimize.py b/Cython/Compiler/Optimize.py index 3139e400..a4a3ed71 100644 --- a/Cython/Compiler/Optimize.py +++ b/Cython/Compiler/Optimize.py @@ -1936,7 +1936,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): node.pos, cfunc_name, self.PyObject_Size_func_type, args = [arg], 
is_temp = node.is_temp) - elif arg.type is PyrexTypes.c_py_unicode_type: + elif arg.type.is_unicode_char: return ExprNodes.IntNode(node.pos, value='1', constant_result=1, type=node.type) else: @@ -2028,7 +2028,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): return node arg = pos_args[0] if isinstance(arg, ExprNodes.CoerceToPyTypeNode): - if arg.arg.type is PyrexTypes.c_py_unicode_type: + if arg.arg.type.is_unicode_char: return arg.arg.coerce_to(node.type, self.current_env()) return node @@ -2191,7 +2191,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): return node ustring = args[0] if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \ - ustring.arg.type is not PyrexTypes.c_py_unicode_type: + not ustring.arg.type.is_unicode_char: return node uchar = ustring.arg method_name = node.function.attribute @@ -2230,7 +2230,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform): return node ustring = args[0] if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \ - ustring.arg.type is not PyrexTypes.c_py_unicode_type: + not ustring.arg.type.is_unicode_char: return node uchar = ustring.arg method_name = node.function.attribute diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 7f59c256..2cd437e7 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -2041,6 +2041,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint") special_basic_c_types = { # name : (signed, longness) "Py_UNICODE" : (0, 0), + "Py_UCS4" : (0, 0), "Py_ssize_t" : (2, 0), "ssize_t" : (2, 0), "size_t" : (0, 0), diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py index 39dd9203..ca997ee8 100755 --- a/Cython/Compiler/PyrexTypes.py +++ b/Cython/Compiler/PyrexTypes.py @@ -49,6 +49,7 @@ class PyrexType(BaseType): # is_typedef boolean Is a typedef type # is_string boolean Is a C char * type # is_unicode boolean Is a UTF-8 encoded C char * type + # is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE # 
is_returncode boolean Is used only to signal exceptions # is_error boolean Is the dummy error type # is_buffer boolean Is buffer access type @@ -101,6 +102,7 @@ class PyrexType(BaseType): is_typedef = 0 is_string = 0 is_unicode = 0 + is_unicode_char = 0 is_returncode = 0 is_error = 0 is_buffer = 0 @@ -924,9 +926,78 @@ class CBIntType(CIntType): return "" +class CPyUCS4IntType(CIntType): + # Py_UCS4 + + is_unicode_char = True + + # Py_UCS4 coerces from and to single character unicode strings (or + # at most two characters on 16bit Unicode builds), but we also + # allow Python integers as input. The value range for Py_UCS4 + # is 0..1114111, which is checked when converting from an integer + # value. + + to_py_function = "PyUnicode_FromOrdinal" + from_py_function = "__Pyx_PyObject_AsPy_UCS4" + + def create_from_py_utility_code(self, env): + env.use_utility_code(pyobject_as_py_ucs4_utility_code) + return True + + def sign_and_name(self): + return "Py_UCS4" + + +pyobject_as_py_ucs4_utility_code = UtilityCode( +proto=''' +static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject*); +''', +impl=''' +static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject* x) { + long ival; + if (PyUnicode_Check(x)) { + if (likely(PyUnicode_GET_SIZE(x) == 1)) { + return PyUnicode_AS_UNICODE(x)[0]; + } else if (PyUnicode_GET_SIZE(x) == 2) { + Py_UCS4 high_val = PyUnicode_AS_UNICODE(x)[0]; + if (high_val >= 0xD800 && high_val <= 0xDBFF) { + Py_UCS4 low_val = PyUnicode_AS_UNICODE(x)[1]; + if (low_val >= 0xDC00 && low_val <= 0xDFFF) { + return 0x10000 + (((high_val & ((1<<10)-1)) << 10) | (low_val & ((1<<10)-1))); /* add the 0x10000 offset: the shifted high bits can reach bit 16, so OR would lose it */ + } + } + } + PyErr_Format(PyExc_ValueError, + "only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length " + #if PY_VERSION_HEX < 0x02050000 + "%d", + #else + "%zd", + #endif + PyUnicode_GET_SIZE(x)); + return (Py_UCS4)-1; + } + ival = __Pyx_PyInt_AsLong(x); + if (unlikely(ival < 0)) { + if (!PyErr_Occurred()) + 
PyErr_SetString(PyExc_OverflowError, + "cannot convert negative value to Py_UCS4"); + return (Py_UCS4)-1; + } else if (unlikely(ival > 1114111)) { + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to Py_UCS4"); + return (Py_UCS4)-1; + } + return (Py_UCS4)ival; +} +''') + + class CPyUnicodeIntType(CIntType): # Py_UNICODE + is_unicode_char = True + # Py_UNICODE coerces from and to single character unicode strings, # but we also allow Python integers as input. The value range for # Py_UNICODE is 0..1114111, which is checked when converting from @@ -2306,6 +2377,7 @@ c_anon_enum_type = CAnonEnumType(-1) c_returncode_type = CReturnCodeType(RANK_INT) c_bint_type = CBIntType(RANK_INT) c_py_unicode_type = CPyUnicodeIntType(RANK_INT-0.5, UNSIGNED) +c_py_ucs4_type = CPyUCS4IntType(RANK_LONG-0.5, UNSIGNED) c_py_ssize_t_type = CPySSizeTType(RANK_LONG+0.5, SIGNED) c_ssize_t_type = CSSizeTType(RANK_LONG+0.5, SIGNED) c_size_t_type = CSizeTType(RANK_LONG+0.5, UNSIGNED) @@ -2367,6 +2439,7 @@ modifiers_and_name_to_type = { (1, 0, "bint"): c_bint_type, (0, 0, "Py_UNICODE"): c_py_unicode_type, + (0, 0, "Py_UCS4"): c_py_ucs4_type, (2, 0, "Py_ssize_t"): c_py_ssize_t_type, (2, 0, "ssize_t") : c_ssize_t_type, (0, 0, "size_t") : c_size_t_type, @@ -2614,6 +2687,8 @@ def parse_basic_type(name): longness = 0 if name == 'Py_UNICODE': signed = 0 + elif name == 'Py_UCS4': + signed = 0 elif name == 'Py_ssize_t': signed = 2 elif name == 'ssize_t': diff --git a/tests/errors/py_ucs4_type_errors.pyx b/tests/errors/py_ucs4_type_errors.pyx new file mode 100644 index 00000000..639b3324 --- /dev/null +++ b/tests/errors/py_ucs4_type_errors.pyx @@ -0,0 +1,24 @@ +# -*- coding: iso-8859-1 -*- + +cdef Py_UCS4 char_ASCII = u'A' +cdef Py_UCS4 char_KLINGON = u'\uF8D2' + +def char_too_long_ASCII(): + cdef Py_UCS4 c = u'AB' + +def char_too_long_Unicode(): + cdef Py_UCS4 c = u'A\uF8D2' + +def char_too_long_bytes(): + cdef Py_UCS4 c = b'AB' + +def char_too_long_latin1(): + cdef Py_UCS4 
char_bytes_latin1 = b'\xf6' + + +_ERRORS = """ + 7:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE. +10:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE. +13:21: Only single-character string literals can be coerced into ints. +16:37: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead. +""" diff --git a/tests/errors/py_unicode_type_errors.pyx b/tests/errors/py_unicode_type_errors.pyx index 3afe0258..17fdc9b0 100644 --- a/tests/errors/py_unicode_type_errors.pyx +++ b/tests/errors/py_unicode_type_errors.pyx @@ -17,8 +17,8 @@ def char_too_long_latin1(): _ERRORS = """ -7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. -10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE. + 7:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE. +10:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE. 13:24: Only single-character string literals can be coerced into ints. -16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead. +16:40: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead. 
""" diff --git a/tests/run/py_ucs4_type.pyx b/tests/run/py_ucs4_type.pyx new file mode 100644 index 00000000..a9548402 --- /dev/null +++ b/tests/run/py_ucs4_type.pyx @@ -0,0 +1,197 @@ +# -*- coding: iso-8859-1 -*- + +cimport cython + +cdef Py_UCS4 char_ASCII = u'A' +cdef Py_UCS4 char_KLINGON = u'\uF8D2' + +def compare_ASCII(): + """ + >>> compare_ASCII() + True + False + False + """ + print(char_ASCII == u'A') + print(char_ASCII == u'B') + print(char_ASCII == u'\uF8D2') + + +def compare_klingon(): + """ + >>> compare_klingon() + True + False + False + """ + print(char_KLINGON == u'\uF8D2') + print(char_KLINGON == u'A') + print(char_KLINGON == u'B') + + +from cpython.unicode cimport PyUnicode_FromOrdinal +import sys + +u0 = u'\x00' +u1 = u'\x01' +umax = PyUnicode_FromOrdinal(sys.maxunicode) + +def unicode_ordinal(Py_UCS4 i): + """ + >>> ord(unicode_ordinal(0)) == 0 + True + >>> ord(unicode_ordinal(1)) == 1 + True + >>> ord(unicode_ordinal(sys.maxunicode)) == sys.maxunicode + True + + >>> ord(unicode_ordinal(u0)) == 0 + True + >>> ord(unicode_ordinal(u1)) == 1 + True + >>> ord(unicode_ordinal(umax)) == sys.maxunicode + True + + Value too small: + >>> unicode_ordinal(-1) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + OverflowError: ... + + Value too large: + >>> unicode_ordinal(sys.maxunicode+1) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + OverflowError: ... + + Less than one character: + >>> unicode_ordinal(u0[:0]) + Traceback (most recent call last): + ... + ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 0 + + More than one character: + >>> unicode_ordinal(u0+u1) + Traceback (most recent call last): + ... 
+ ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 2 + """ + return i + +@cython.test_assert_path_exists('//PythonCapiCallNode') +@cython.test_fail_if_path_exists('//SimpleCallNode') +def unicode_type_methods(Py_UCS4 uchar): + """ + >>> unicode_type_methods(ord('A')) + [True, True, False, False, False, False, False, True, True] + >>> unicode_type_methods(ord('a')) + [True, True, False, False, True, False, False, False, False] + >>> unicode_type_methods(ord('8')) + [True, False, True, True, False, True, False, False, False] + >>> unicode_type_methods(ord('\\t')) + [False, False, False, False, False, False, True, False, False] + """ + return [ + # character types + uchar.isalnum(), + uchar.isalpha(), + uchar.isdecimal(), + uchar.isdigit(), + uchar.islower(), + uchar.isnumeric(), + uchar.isspace(), + uchar.istitle(), + uchar.isupper(), + ] + +@cython.test_assert_path_exists('//PythonCapiCallNode') +@cython.test_fail_if_path_exists('//SimpleCallNode') +def unicode_methods(Py_UCS4 uchar): + """ + >>> unicode_methods(ord('A')) == ['a', 'A', 'A'] + True + >>> unicode_methods(ord('a')) == ['a', 'A', 'A'] + True + """ + return [ + # character conversion + uchar.lower(), + uchar.upper(), + uchar.title(), + ] + +@cython.test_assert_path_exists('//IntNode') +@cython.test_fail_if_path_exists('//SimpleCallNode', + '//PythonCapiCallNode') +def len_uchar(Py_UCS4 uchar): + """ + >>> len_uchar(ord('A')) + 1 + """ + return len(uchar) + +def index_uchar(Py_UCS4 uchar, Py_ssize_t i): + """ + >>> index_uchar(ord('A'), 0) == ('A', 'A', 'A') + True + >>> index_uchar(ord('A'), -1) == ('A', 'A', 'A') + True + >>> index_uchar(ord('A'), 1) + Traceback (most recent call last): + IndexError: string index out of range + """ + return uchar[0], uchar[-1], uchar[i] + +mixed_ustring = u'AbcDefGhIjKlmnoP' +lower_ustring = mixed_ustring.lower() +upper_ustring = mixed_ustring.upper() + +@cython.test_assert_path_exists('//PythonCapiCallNode', + 
'//ForFromStatNode') +@cython.test_fail_if_path_exists('//SimpleCallNode', + '//ForInStatNode') +def count_lower_case_characters(unicode ustring): + """ + >>> count_lower_case_characters(mixed_ustring) + 10 + >>> count_lower_case_characters(lower_ustring) + 16 + """ + cdef Py_ssize_t count = 0 + for uchar in ustring: + if uchar.islower(): + count += 1 + return count + +@cython.test_assert_path_exists('//SwitchStatNode', + '//ForFromStatNode') +@cython.test_fail_if_path_exists('//ForInStatNode') +def iter_and_in(): + """ + >>> iter_and_in() + a + b + e + f + h + """ + for c in u'abcdefgh': + if c in u'abCDefGh': + print c + +@cython.test_assert_path_exists('//SwitchStatNode', + '//ForFromStatNode') +@cython.test_fail_if_path_exists('//ForInStatNode') +def index_and_in(): + """ + >>> index_and_in() + 1 + 3 + 4 + 7 + 8 + """ + cdef int i + for i in range(1,9): + if u'abcdefgh'[-i] in u'abCDefGh': + print i