From 171d69cb3fdcc1105b7deb5c184dbb04902e4d6d Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 11 Oct 2009 09:28:43 +0200 Subject: [PATCH] implement proper encoding support for new str literals --- Cython/Compiler/Code.py | 52 ++++++++++++++++++----------- Cython/Compiler/ExprNodes.py | 41 +++++++++++++++-------- Cython/Compiler/Nodes.py | 16 +++++---- Cython/Compiler/Parsing.py | 12 +++---- tests/run/str_encoding_latin1.pyx | 55 +++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 45 deletions(-) create mode 100644 tests/run/str_encoding_latin1.pyx diff --git a/Cython/Compiler/Code.py b/Cython/Compiler/Code.py index 91438395..4afdd28f 100644 --- a/Cython/Compiler/Code.py +++ b/Cython/Compiler/Code.py @@ -304,13 +304,15 @@ class StringConst(object): self.escaped_value = StringEncoding.escape_byte_string(byte_string) self.py_strings = None - def get_py_string_const(self, encoding, identifier=None): + def get_py_string_const(self, encoding, identifier=None, is_str=False): py_strings = self.py_strings text = self.text if encoding is not None: encoding = encoding.upper() - key = (bool(identifier), encoding) + is_str = identifier or bool(is_str) + + key = (is_str, encoding) if py_strings is not None and key in py_strings: py_string = py_strings[key] else: @@ -333,11 +335,11 @@ class StringConst(object): pystring_cname = "%s%s%s_%s" % ( prefix, is_unicode and 'u' or 'b', - identifier and 'i' or '', + is_str and 's' or '', self.cname[len(Naming.const_prefix):]) py_string = PyStringConst( - pystring_cname, is_unicode, bool(identifier), intern) + pystring_cname, encoding, is_unicode, is_str, intern) self.py_strings[key] = py_string return py_string @@ -346,14 +348,16 @@ class PyStringConst(object): """Global info about a Python string constant held by GlobalState. """ # cname string - # unicode boolean + # encoding string # intern boolean - # identifier boolean + # is_unicode boolean + # is_str boolean - def __init__(self, cname, is_unicode, identifier=False, intern=False): + def __init__(self, cname, encoding, is_unicode, is_str=False, intern=False): self.cname = cname - self.identifier = identifier - self.unicode = is_unicode + self.encoding = encoding + self.is_str = is_str + self.is_unicode = is_unicode self.intern = intern def __lt__(self, other): @@ -550,10 +554,10 @@ class GlobalState(object): c = self.new_string_const(text, byte_string) return c - def get_py_string_const(self, text, identifier=None): + def get_py_string_const(self, text, identifier=None, is_str=False): # return a Python string constant, creating a new one if necessary c_string = self.get_string_const(text) - py_string = c_string.get_py_string_const(text.encoding, identifier) + py_string = c_string.get_py_string_const(text.encoding, identifier, is_str) return py_string def new_string_const(self, text, byte_string): @@ -601,7 +605,7 @@ class GlobalState(object): def add_cached_builtin_decl(self, entry): if Options.cache_builtins: if self.should_declare(entry.cname, entry): - interned_cname = self.get_py_string_const(entry.name, True).cname + interned_cname = self.intern_identifier(entry.name).cname self.put_pyobject_decl(entry) w = self.parts['cached_builtins'] w.putln('%s = __Pyx_GetName(%s, %s); if (!%s) %s' % ( @@ -649,18 +653,26 @@ class GlobalState(object): w.putln("static __Pyx_StringTabEntry %s[] = {" % Naming.stringtab_cname) for c_cname, _, py_string in py_strings: + if not py_string.is_str or not py_string.encoding or \ + py_string.encoding in ('ASCII', 'USASCII', 'US-ASCII', + 'UTF8', 'UTF-8'): + encoding = '0' + else: + encoding = '"%s"' % py_string.encoding.lower() + decls_writer.putln( "static PyObject *%s;" % py_string.cname) w.putln( - "{&%s, %s, sizeof(%s), %d, %d, %d}," % ( + "{&%s, %s, sizeof(%s), %s, %d, %d, %d}," % ( py_string.cname, c_cname, c_cname, - py_string.unicode, - py_string.intern, - py_string.identifier + encoding, + py_string.is_unicode, + py_string.is_str, + py_string.intern )) - w.putln("{0, 0, 0, 0, 0, 0}") + w.putln("{0, 0, 0, 0, 0, 0, 0}") w.putln("};") init_globals = self.parts['init_globals'] @@ -894,8 +906,8 @@ class CCodeWriter(object): def get_string_const(self, text): return self.globalstate.get_string_const(text).cname - def get_py_string_const(self, text, identifier=None): - return self.globalstate.get_py_string_const(text, identifier).cname + def get_py_string_const(self, text, identifier=None, is_str=False): + return self.globalstate.get_py_string_const(text, identifier, is_str).cname def get_argument_default_const(self, type): return self.globalstate.get_py_const(type).cname @@ -904,7 +916,7 @@ class CCodeWriter(object): return self.get_py_string_const(text) def intern_identifier(self, text): - return self.get_py_string_const(text, True) + return self.get_py_string_const(text, identifier=True) # code generation diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index b4149a34..bd08d1ad 100644 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -801,6 +801,10 @@ class FloatNode(ConstNode): class BytesNode(ConstNode): + # A char* or bytes literal + # + # value BytesLiteral + type = PyrexTypes.c_char_ptr_type def compile_time_value(self, denv): @@ -899,27 +903,32 @@ class StringNode(PyConstNode): # A Python str object, i.e. a byte string in Python 2.x and a # unicode string in Python 3.x # - # Can be coerced to a BytesNode (and thus to C types), but not to - # a UnicodeNode. - # - # value BytesLiteral + # value BytesLiteral + # is_identifier boolean type = Builtin.str_type + is_identifier = False def coerce_to(self, dst_type, env): - if dst_type is Builtin.str_type: - return self -# if dst_type is Builtin.bytes_type: -# # special case: bytes = 'str literal' -# return BytesNode(self.pos, value=self.value) - if not dst_type.is_pyobject: - return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env) - if dst_type is not py_object_type: + if dst_type is not py_object_type and dst_type is not Builtin.str_type: +# if dst_type is Builtin.bytes_type: +# # special case: bytes = 'str literal' +# return BytesNode(self.pos, value=self.value) + if not dst_type.is_pyobject: + return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env) self.check_for_coercion_error(dst_type, fail=True) + + # this will be a unicode string in Py3, so make sure we can decode it + try: + self.value.decode(self.value.encoding) + except UnicodeDecodeError: + error(self.pos, "String decoding as '%s' failed. Consider using a byte string or unicode string explicitly, or adjust the source code encoding." % self.value.encoding) + return self def generate_evaluation_code(self, code): - self.result_code = code.get_py_string_const(self.value, True) + self.result_code = code.get_py_string_const( + self.value, identifier=self.is_identifier, is_str=True) def get_constant_c_result_code(self): return None @@ -931,6 +940,12 @@ class StringNode(PyConstNode): return self.value +class IdentifierStringNode(StringNode): + # A special str value that represents an identifier (bytes in Py2, + # unicode in Py3). + is_identifier = True + + class LongNode(AtomicExprNode): # Python long integer literal # diff --git a/Cython/Compiler/Nodes.py b/Cython/Compiler/Nodes.py index b8c536a4..7a6e1a2e 100644 --- a/Cython/Compiler/Nodes.py +++ b/Cython/Compiler/Nodes.py @@ -4757,7 +4757,7 @@ utility_function_predeclarations = \ #define INLINE #endif -typedef struct {PyObject **p; char *s; long n; char is_unicode; char intern; char is_identifier;} __Pyx_StringTabEntry; /*proto*/ +typedef struct {PyObject **p; char *s; const long n; const char* encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/ """ @@ -5518,7 +5518,7 @@ impl = """ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { while (t->p) { #if PY_MAJOR_VERSION < 3 - if (t->is_unicode && (!t->is_identifier)) { + if (t->is_unicode) { *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); } else if (t->intern) { *t->p = PyString_InternFromString(t->s); @@ -5526,10 +5526,14 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { *t->p = PyString_FromStringAndSize(t->s, t->n - 1); } #else /* Python 3+ has unicode identifiers */ - if (t->is_identifier || (t->is_unicode && t->intern)) { - *t->p = PyUnicode_InternFromString(t->s); - } else if (t->is_unicode) { - *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } } else { *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); } diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 3d5a66f6..175a7e71 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -348,7 +348,7 @@ def p_call(s, function): s.error("Expected an identifier before '='", pos = arg.pos) encoded_name = EncodedString(arg.name) - keyword = ExprNodes.StringNode(arg.pos, value = encoded_name) + keyword = ExprNodes.IdentifierStringNode(arg.pos, value = encoded_name) arg = p_simple_expr(s) keyword_args.append((keyword, arg)) else: @@ -1128,14 +1128,14 @@ def p_import_statement(s): else: if as_name and "." in dotted_name: name_list = ExprNodes.ListNode(pos, args = [ - ExprNodes.StringNode(pos, value = EncodedString("*"))]) + ExprNodes.IdentifierStringNode(pos, value = EncodedString("*"))]) else: name_list = None stat = Nodes.SingleAssignmentNode(pos, lhs = ExprNodes.NameNode(pos, name = as_name or target_name), rhs = ExprNodes.ImportNode(pos, - module_name = ExprNodes.StringNode( + module_name = ExprNodes.IdentifierStringNode( pos, value = dotted_name), name_list = name_list)) stats.append(stat) @@ -1193,7 +1193,7 @@ def p_from_import_statement(s, first_statement = 0): for (name_pos, name, as_name, kind) in imported_names: encoded_name = EncodedString(name) imported_name_strings.append( - ExprNodes.StringNode(name_pos, value = encoded_name)) + ExprNodes.IdentifierStringNode(name_pos, value = encoded_name)) items.append( (name, ExprNodes.NameNode(name_pos, @@ -1203,7 +1203,7 @@ def p_from_import_statement(s, first_statement = 0): dotted_name = EncodedString(dotted_name) return Nodes.FromImportStatNode(pos, module = ExprNodes.ImportNode(dotted_name_pos, - module_name = ExprNodes.StringNode(pos, value = dotted_name), + module_name = ExprNodes.IdentifierStringNode(pos, value = dotted_name), name_list = import_list), items = items) @@ -1713,7 +1713,7 @@ def p_positional_and_keyword_args(s, end_sy_set, type_positions=(), type_keyword parsed_type = True else: arg = p_simple_expr(s) - keyword_node = ExprNodes.StringNode( + keyword_node = ExprNodes.IdentifierStringNode( arg.pos, value = EncodedString(ident)) keyword_args.append((keyword_node, arg)) was_keyword = True diff --git a/tests/run/str_encoding_latin1.pyx b/tests/run/str_encoding_latin1.pyx new file mode 100644 index 00000000..93f99c07 --- /dev/null +++ b/tests/run/str_encoding_latin1.pyx @@ -0,0 +1,55 @@ +# -*- coding: latin-1 -*- + +__doc__ = (u""" +>>> a == 'abc' +True +>>> isinstance(a, str) +True + +>>> isinstance(s, str) +True +>>> len(s) +6 +>>> s == 'aäÄÖöo' +True + +>>> isinstance(add(), str) +True +>>> len(add()) +9 +>>> add() == 'abcaäÄÖöo' +True + +>>> isinstance(add_literal(), str) +True +>>> len(add_literal()) +9 +>>> add_literal() == 'abcaäÄÖöo' +True + +>>> isinstance(typed(), str) +True +>>> len(typed()) +6 +>>> typed() == 'üüääöö' +True + +""" +# recoding/escaping is required to properly pass the literals to doctest +).encode('unicode_escape').decode('ASCII') + + +a = 'abc' +s = 'aäÄÖöo' +u = u'aäÄÖöo' + +cdef str S = 'üüääöö' + +def add(): + return a+s + +def add_literal(): + return 'abc' + 'aäÄÖöo' + +def typed(): + return S -- 2.26.2