if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character string literals can be coerced into ints.")
return self
- if dst_type is PyrexTypes.c_py_unicode_type:
- error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
+ if dst_type.is_unicode_char:
+ error(self.pos, "Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.")
return self
return CharNode(self.pos, value=self.value)
def coerce_to(self, dst_type, env):
if dst_type is self.type:
pass
- elif dst_type is PyrexTypes.c_py_unicode_type:
+ elif dst_type.is_unicode_char:
if not self.can_coerce_to_char_literal():
- error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
+ error(self.pos, "Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.")
return self
int_value = ord(self.value)
- return IntNode(self.pos, value=int_value, constant_result=int_value)
+ return IntNode(self.pos, type=dst_type, value=str(int_value), constant_result=int_value)
elif not dst_type.is_pyobject:
if dst_type.is_string and self.bytes_value is not None:
# special case: '-3' enforced unicode literal used in a C char* context
return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
- error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
+ error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.")
elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type):
self.fail_assignment(dst_type)
def can_coerce_to_char_literal(self):
return len(self.value) == 1
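+ ## NOTE(review): surrogate pairs are NOT accepted yet, although the coercion error message mentions them.
+ ## Disabled draft (presumably needs ord() around self.value[0] / self.value[1] before it can work):
+ ## or (len(self.value) == 2 and (0xD800 <= self.value[0] <= 0xDBFF) and (0xDC00 <= self.value[1] <= 0xDFFF))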
def contains_surrogates(self):
# Check if the unicode string contains surrogate code points
elif not skip_child_analysis:
self.index.analyse_types(env)
self.original_index_type = self.index.type
- if base_type is PyrexTypes.c_py_unicode_type:
- # we infer Py_UNICODE for unicode strings in some
+ if base_type.is_unicode_char:
+ # we infer Py_UNICODE/Py_UCS4 for unicode strings in some
# cases, but indexing must still work for them
if self.index.constant_result in (0, -1):
# FIXME: we know that this node is redundant -
self.index = self.index.coerce_to_pyobject(env)
self.is_temp = 1
if self.index.type.is_int and base_type is unicode_type:
- # Py_UNICODE will automatically coerce to a unicode string
+ # Py_UNICODE/Py_UCS4 will automatically coerce to a unicode string
# if required, so this is fast and safe
self.type = PyrexTypes.c_py_unicode_type
elif is_slice and base_type in (bytes_type, str_type, unicode_type, list_type, tuple_type):
return "PyList_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
elif self.base.type is tuple_type:
return "PyTuple_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
- elif self.base.type is unicode_type and self.type is PyrexTypes.c_py_unicode_type:
+ elif self.base.type is unicode_type and self.type.is_unicode_char:
return "PyUnicode_AS_UNICODE(%s)[%s]" % (self.base.result(), self.index.result())
elif (self.type.is_ptr or self.type.is_array) and self.type == self.base.type:
error(self.pos, "Invalid use of pointer slice")
self.result(),
code.error_goto(self.pos)))
code.put_gotref(self.py_result())
- elif self.type is PyrexTypes.c_py_unicode_type and self.base.type is unicode_type:
+ elif self.type.is_unicode_char and self.base.type is unicode_type:
assert self.index.type.is_int
index_code = self.index.result()
function = "__Pyx_GetItemInt_Unicode"
self.operand2.result())
def is_py_operation_types(self, type1, type2):
- return (type1 is PyrexTypes.c_py_unicode_type or
- type2 is PyrexTypes.c_py_unicode_type or
+ return (type1.is_unicode_char or
+ type2.is_unicode_char or
BinopNode.is_py_operation_types(self, type1, type2))
def py_operation_function(self):
return self.operator in ('in', 'not_in') and \
((self.operand1.type.is_int
and (self.operand2.type.is_string or self.operand2.type is bytes_type)) or
- (self.operand1.type is PyrexTypes.c_py_unicode_type
+ (self.operand1.type.is_unicode_char
and self.operand2.type is unicode_type))
def is_ptr_contains(self):
# be specific about some known types
if arg.type.is_string:
self.type = bytes_type
- elif arg.type is PyrexTypes.c_py_unicode_type:
+ elif arg.type.is_unicode_char:
self.type = unicode_type
elif arg.type.is_complex:
self.type = Builtin.complex_type
node.pos, cfunc_name, self.PyObject_Size_func_type,
args = [arg],
is_temp = node.is_temp)
- elif arg.type is PyrexTypes.c_py_unicode_type:
+ elif arg.type.is_unicode_char:
return ExprNodes.IntNode(node.pos, value='1', constant_result=1,
type=node.type)
else:
return node
arg = pos_args[0]
if isinstance(arg, ExprNodes.CoerceToPyTypeNode):
- if arg.arg.type is PyrexTypes.c_py_unicode_type:
+ if arg.arg.type.is_unicode_char:
return arg.arg.coerce_to(node.type, self.current_env())
return node
return node
ustring = args[0]
if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
- ustring.arg.type is not PyrexTypes.c_py_unicode_type:
+ not ustring.arg.type.is_unicode_char:
return node
uchar = ustring.arg
method_name = node.function.attribute
return node
ustring = args[0]
if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
- ustring.arg.type is not PyrexTypes.c_py_unicode_type:
+ not ustring.arg.type.is_unicode_char:
return node
uchar = ustring.arg
method_name = node.function.attribute
special_basic_c_types = {
# name : (signed, longness)
"Py_UNICODE" : (0, 0),
+ "Py_UCS4" : (0, 0),
"Py_ssize_t" : (2, 0),
"ssize_t" : (2, 0),
"size_t" : (0, 0),
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
# is_unicode boolean Is a UTF-8 encoded C char * type
+ # is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE
# is_returncode boolean Is used only to signal exceptions
# is_error boolean Is the dummy error type
# is_buffer boolean Is buffer access type
is_typedef = 0
is_string = 0
is_unicode = 0
+ is_unicode_char = 0
is_returncode = 0
is_error = 0
is_buffer = 0
return "<CNumericType bint>"
+class CPyUCS4IntType(CIntType):
+    # Py_UCS4 - integer type for a single Unicode character value
+
+    is_unicode_char = True  # documented on the base type as "Is either Py_UCS4 or Py_UNICODE"
+
+    # Py_UCS4 coerces from and to single character unicode strings (or
+    # at most two characters on 16bit Unicode builds), but we also
+    # allow Python integers as input.  The value range for Py_UCS4
+    # is 0..1114111, which is checked when converting from an integer
+    # value.
+
+    to_py_function = "PyUnicode_FromOrdinal"  # presumably the C call used for Py_UCS4 -> Python object
+    from_py_function = "__Pyx_PyObject_AsPy_UCS4"  # C helper defined in pyobject_as_py_ucs4_utility_code
+
+    def create_from_py_utility_code(self, env):
+        env.use_utility_code(pyobject_as_py_ucs4_utility_code)  # hand the C helper named above to env
+        return True
+
+    def sign_and_name(self):
+        return "Py_UCS4"  # fixed type name, no sign prefix
+
+
+# C helper that converts a Python object into a Py_UCS4 code point.
+# Accepted inputs (see the C code below): a unicode string of length 1,
+# a unicode string of length 2 that forms a surrogate pair, or an
+# integer in the range 0..1114111.  On failure the helper sets a Python
+# exception (ValueError for bad strings, OverflowError for out-of-range
+# integers) and returns (Py_UCS4)-1.
+pyobject_as_py_ucs4_utility_code = UtilityCode(
+proto='''
+static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject*);
+''',
+impl='''
+static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject* x) {
+   long ival;
+   if (PyUnicode_Check(x)) {
+       if (likely(PyUnicode_GET_SIZE(x) == 1)) {
+           return PyUnicode_AS_UNICODE(x)[0];
+       } else if (PyUnicode_GET_SIZE(x) == 2) {
+           /* two code units: only valid if they form a high/low surrogate pair */
+           Py_UCS4 high_val = PyUnicode_AS_UNICODE(x)[0];
+           if (high_val >= 0xD800 && high_val <= 0xDBFF) {
+               Py_UCS4 low_val = PyUnicode_AS_UNICODE(x)[1];
+               if (low_val >= 0xDC00 && low_val <= 0xDFFF) {
+                   /* code point = 0x10000 + (high 10 bits << 10 | low 10 bits).
+                      The offset must be added, not OR-ed: the shifted high
+                      bits may already occupy bit 16 (e.g. 0xD840 0xDC00 is
+                      U+20000, which OR would collapse to U+10000). */
+                   return 0x10000 + (((high_val & ((1<<10)-1)) << 10) | (low_val & ((1<<10)-1)));
+               }
+           }
+       }
+       PyErr_Format(PyExc_ValueError,
+           "only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length "
+           #if PY_VERSION_HEX < 0x02050000
+           "%d",
+           #else
+           "%zd",
+           #endif
+           PyUnicode_GET_SIZE(x));
+       return (Py_UCS4)-1;
+   }
+   /* not a unicode string: treat the object as an integer code point */
+   ival = __Pyx_PyInt_AsLong(x);
+   if (unlikely(ival < 0)) {
+       /* keep an exception already raised by the integer conversion */
+       if (!PyErr_Occurred())
+           PyErr_SetString(PyExc_OverflowError,
+                           "cannot convert negative value to Py_UCS4");
+       return (Py_UCS4)-1;
+   } else if (unlikely(ival > 1114111)) {
+       /* 1114111 == 0x10FFFF, the largest Unicode code point */
+       PyErr_SetString(PyExc_OverflowError,
+                       "value too large to convert to Py_UCS4");
+       return (Py_UCS4)-1;
+   }
+   return (Py_UCS4)ival;
+}
+''')
+
+
class CPyUnicodeIntType(CIntType):
# Py_UNICODE
+ is_unicode_char = True
+
# Py_UNICODE coerces from and to single character unicode strings,
# but we also allow Python integers as input. The value range for
# Py_UNICODE is 0..1114111, which is checked when converting from
c_returncode_type = CReturnCodeType(RANK_INT)
c_bint_type = CBIntType(RANK_INT)
c_py_unicode_type = CPyUnicodeIntType(RANK_INT-0.5, UNSIGNED)
+c_py_ucs4_type = CPyUCS4IntType(RANK_LONG-0.5, UNSIGNED)
c_py_ssize_t_type = CPySSizeTType(RANK_LONG+0.5, SIGNED)
c_ssize_t_type = CSSizeTType(RANK_LONG+0.5, SIGNED)
c_size_t_type = CSizeTType(RANK_LONG+0.5, UNSIGNED)
(1, 0, "bint"): c_bint_type,
(0, 0, "Py_UNICODE"): c_py_unicode_type,
+ (0, 0, "Py_UCS4"): c_py_ucs4_type,
(2, 0, "Py_ssize_t"): c_py_ssize_t_type,
(2, 0, "ssize_t") : c_ssize_t_type,
(0, 0, "size_t") : c_size_t_type,
longness = 0
if name == 'Py_UNICODE':
signed = 0
+ elif name == 'Py_UCS4':
+ signed = 0
elif name == 'Py_ssize_t':
signed = 2
elif name == 'ssize_t':
--- /dev/null
+# -*- coding: iso-8859-1 -*-
+
+cdef Py_UCS4 char_ASCII = u'A'
+cdef Py_UCS4 char_KLINGON = u'\uF8D2'
+
+def char_too_long_ASCII():
+ cdef Py_UCS4 c = u'AB'
+
+def char_too_long_Unicode():
+ cdef Py_UCS4 c = u'A\uF8D2'
+
+def char_too_long_bytes():
+ cdef Py_UCS4 c = b'AB'
+
+def char_too_long_latin1():
+ cdef Py_UCS4 char_bytes_latin1 = b'\xf6'
+
+
+_ERRORS = """
+ 7:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+10:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+13:21: Only single-character string literals can be coerced into ints.
+16:37: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
+"""
_ERRORS = """
-7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
-10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
+ 7:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
+10:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
13:24: Only single-character string literals can be coerced into ints.
-16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
+16:40: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
"""
--- /dev/null
+# -*- coding: iso-8859-1 -*-
+
+cimport cython
+
+cdef Py_UCS4 char_ASCII = u'A'
+cdef Py_UCS4 char_KLINGON = u'\uF8D2'
+
+def compare_ASCII():
+ """
+ >>> compare_ASCII()
+ True
+ False
+ False
+ """
+ print(char_ASCII == u'A')
+ print(char_ASCII == u'B')
+ print(char_ASCII == u'\uF8D2')
+
+
+def compare_klingon():
+ """
+ >>> compare_klingon()
+ True
+ False
+ False
+ """
+ print(char_KLINGON == u'\uF8D2')
+ print(char_KLINGON == u'A')
+ print(char_KLINGON == u'B')
+
+
+from cpython.unicode cimport PyUnicode_FromOrdinal
+import sys
+
+u0 = u'\x00'
+u1 = u'\x01'
+umax = PyUnicode_FromOrdinal(sys.maxunicode)
+
+def unicode_ordinal(Py_UCS4 i):
+ """
+ >>> ord(unicode_ordinal(0)) == 0
+ True
+ >>> ord(unicode_ordinal(1)) == 1
+ True
+ >>> ord(unicode_ordinal(sys.maxunicode)) == sys.maxunicode
+ True
+
+ >>> ord(unicode_ordinal(u0)) == 0
+ True
+ >>> ord(unicode_ordinal(u1)) == 1
+ True
+ >>> ord(unicode_ordinal(umax)) == sys.maxunicode
+ True
+
+ Value too small:
+ >>> unicode_ordinal(-1) #doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ OverflowError: ...
+
+ Value too large:
+ >>> unicode_ordinal(sys.maxunicode+1) #doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ OverflowError: ...
+
+ Less than one character:
+ >>> unicode_ordinal(u0[:0])
+ Traceback (most recent call last):
+ ...
+ ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 0
+
+ More than one character:
+ >>> unicode_ordinal(u0+u1)
+ Traceback (most recent call last):
+ ...
+ ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 2
+ """
+ return i
+
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_type_methods(Py_UCS4 uchar):
+ """
+ >>> unicode_type_methods(ord('A'))
+ [True, True, False, False, False, False, False, True, True]
+ >>> unicode_type_methods(ord('a'))
+ [True, True, False, False, True, False, False, False, False]
+ >>> unicode_type_methods(ord('8'))
+ [True, False, True, True, False, True, False, False, False]
+ >>> unicode_type_methods(ord('\\t'))
+ [False, False, False, False, False, False, True, False, False]
+ """
+ return [
+ # character types
+ uchar.isalnum(),
+ uchar.isalpha(),
+ uchar.isdecimal(),
+ uchar.isdigit(),
+ uchar.islower(),
+ uchar.isnumeric(),
+ uchar.isspace(),
+ uchar.istitle(),
+ uchar.isupper(),
+ ]
+
+@cython.test_assert_path_exists('//PythonCapiCallNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode')
+def unicode_methods(Py_UCS4 uchar):
+ """
+ >>> unicode_methods(ord('A')) == ['a', 'A', 'A']
+ True
+ >>> unicode_methods(ord('a')) == ['a', 'A', 'A']
+ True
+ """
+ return [
+ # character conversion
+ uchar.lower(),
+ uchar.upper(),
+ uchar.title(),
+ ]
+
+@cython.test_assert_path_exists('//IntNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode',
+ '//PythonCapiCallNode')
+def len_uchar(Py_UCS4 uchar):
+    """
+    >>> len_uchar(ord('A'))
+    1
+    """
+    return len(uchar)  # len() of a single Py_UCS4 character; the doctest above expects 1
+
+def index_uchar(Py_UCS4 uchar, Py_ssize_t i):
+    """
+    >>> index_uchar(ord('A'), 0) == ('A', 'A', 'A')
+    True
+    >>> index_uchar(ord('A'), -1) == ('A', 'A', 'A')
+    True
+    >>> index_uchar(ord('A'), 1)
+    Traceback (most recent call last):
+    IndexError: string index out of range
+    """
+    return uchar[0], uchar[-1], uchar[i]  # two constant indices plus a runtime one; per the doctest only 0 and -1 succeed
+
+mixed_ustring = u'AbcDefGhIjKlmnoP'  # 16 letters, 10 of them lower case (see count_lower_case_characters)
+lower_ustring = mixed_ustring.lower()
+upper_ustring = mixed_ustring.upper()  # was .lower(), which made this a duplicate of lower_ustring
+
+@cython.test_assert_path_exists('//PythonCapiCallNode',
+ '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//SimpleCallNode',
+ '//ForInStatNode')
+def count_lower_case_characters(unicode ustring):
+ """
+ >>> count_lower_case_characters(mixed_ustring)
+ 10
+ >>> count_lower_case_characters(lower_ustring)
+ 16
+ """
+ cdef Py_ssize_t count = 0
+ for uchar in ustring:
+ if uchar.islower():
+ count += 1
+ return count
+
+@cython.test_assert_path_exists('//SwitchStatNode',
+ '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//ForInStatNode')
+def iter_and_in():
+ """
+ >>> iter_and_in()
+ a
+ b
+ e
+ f
+ h
+ """
+ for c in u'abcdefgh':
+ if c in u'abCDefGh':
+ print c
+
+@cython.test_assert_path_exists('//SwitchStatNode',
+ '//ForFromStatNode')
+@cython.test_fail_if_path_exists('//ForInStatNode')
+def index_and_in():
+ """
+ >>> index_and_in()
+ 1
+ 3
+ 4
+ 7
+ 8
+ """
+ cdef int i
+ for i in range(1,9):
+ if u'abcdefgh'[-i] in u'abCDefGh':
+ print i