self.escaped_value = StringEncoding.escape_byte_string(byte_string)
self.py_strings = None
- def get_py_string_const(self, encoding, identifier=None):
+ def get_py_string_const(self, encoding, identifier=None, is_str=False):
py_strings = self.py_strings
text = self.text
if encoding is not None:
encoding = encoding.upper()
- key = (bool(identifier), encoding)
+ # identifiers are always 'str'; coerce to a plain bool for the cache key
+ is_str = bool(identifier or is_str)
+ key = (is_str, encoding)
if py_strings is not None and key in py_strings:
py_string = py_strings[key]
else:
pystring_cname = "%s%s%s_%s" % (
prefix,
is_unicode and 'u' or 'b',
- identifier and 'i' or '',
+ is_str and 's' or '',
self.cname[len(Naming.const_prefix):])
py_string = PyStringConst(
- pystring_cname, is_unicode, bool(identifier), intern)
+ pystring_cname, encoding, is_unicode, is_str, intern)
self.py_strings[key] = py_string
return py_string
"""Global info about a Python string constant held by GlobalState.
"""
# cname string
- # unicode boolean
+ # encoding string
# intern boolean
- # identifier boolean
+ # is_unicode boolean
+ # is_str boolean
- def __init__(self, cname, is_unicode, identifier=False, intern=False):
+ def __init__(self, cname, encoding, is_unicode, is_str=False, intern=False):
self.cname = cname
- self.identifier = identifier
- self.unicode = is_unicode
+ self.encoding = encoding
+ self.is_str = is_str
+ self.is_unicode = is_unicode
self.intern = intern
def __lt__(self, other):
c = self.new_string_const(text, byte_string)
return c
- def get_py_string_const(self, text, identifier=None):
+ def get_py_string_const(self, text, identifier=None, is_str=False):
# return a Python string constant, creating a new one if necessary
c_string = self.get_string_const(text)
- py_string = c_string.get_py_string_const(text.encoding, identifier)
+ py_string = c_string.get_py_string_const(text.encoding, identifier, is_str)
return py_string
def new_string_const(self, text, byte_string):
def add_cached_builtin_decl(self, entry):
if Options.cache_builtins:
if self.should_declare(entry.cname, entry):
- interned_cname = self.get_py_string_const(entry.name, True).cname
+ interned_cname = self.intern_identifier(entry.name).cname
self.put_pyobject_decl(entry)
w = self.parts['cached_builtins']
w.putln('%s = __Pyx_GetName(%s, %s); if (!%s) %s' % (
w.putln("static __Pyx_StringTabEntry %s[] = {" %
Naming.stringtab_cname)
for c_cname, _, py_string in py_strings:
+ if not py_string.is_str or not py_string.encoding or \
+ py_string.encoding in ('ASCII', 'USASCII', 'US-ASCII',
+ 'UTF8', 'UTF-8'):
+ encoding = '0'
+ else:
+ encoding = '"%s"' % py_string.encoding.lower()
+
decls_writer.putln(
"static PyObject *%s;" % py_string.cname)
w.putln(
- "{&%s, %s, sizeof(%s), %d, %d, %d}," % (
+ "{&%s, %s, sizeof(%s), %s, %d, %d, %d}," % (
py_string.cname,
c_cname,
c_cname,
- py_string.unicode,
- py_string.intern,
- py_string.identifier
+ encoding,
+ py_string.is_unicode,
+ py_string.is_str,
+ py_string.intern
))
- w.putln("{0, 0, 0, 0, 0, 0}")
+ w.putln("{0, 0, 0, 0, 0, 0, 0}")
w.putln("};")
init_globals = self.parts['init_globals']
def get_string_const(self, text):
return self.globalstate.get_string_const(text).cname
- def get_py_string_const(self, text, identifier=None):
- return self.globalstate.get_py_string_const(text, identifier).cname
+ def get_py_string_const(self, text, identifier=None, is_str=False):
+ return self.globalstate.get_py_string_const(text, identifier, is_str).cname
def get_argument_default_const(self, type):
return self.globalstate.get_py_const(type).cname
return self.get_py_string_const(text)
def intern_identifier(self, text):
- return self.get_py_string_const(text, True)
+ return self.get_py_string_const(text, identifier=True)
# code generation
class BytesNode(ConstNode):
+ # A char* or bytes literal
+ #
+ # value BytesLiteral
+
type = PyrexTypes.c_char_ptr_type
def compile_time_value(self, denv):
# A Python str object, i.e. a byte string in Python 2.x and a
# unicode string in Python 3.x
#
- # Can be coerced to a BytesNode (and thus to C types), but not to
- # a UnicodeNode.
- #
- # value BytesLiteral
+ # value BytesLiteral
+ # is_identifier boolean
type = Builtin.str_type
+ is_identifier = False
def coerce_to(self, dst_type, env):
- if dst_type is Builtin.str_type:
- return self
-# if dst_type is Builtin.bytes_type:
-# # special case: bytes = 'str literal'
-# return BytesNode(self.pos, value=self.value)
- if not dst_type.is_pyobject:
- return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env)
- if dst_type is not py_object_type:
+ if dst_type is not py_object_type and dst_type is not Builtin.str_type:
+# if dst_type is Builtin.bytes_type:
+# # special case: bytes = 'str literal'
+# return BytesNode(self.pos, value=self.value)
+ if not dst_type.is_pyobject:
+ return BytesNode(self.pos, value=self.value).coerce_to(dst_type, env)
self.check_for_coercion_error(dst_type, fail=True)
+
+ # this will be a unicode string in Py3, so make sure we can decode it
+ try:
+ self.value.decode(self.value.encoding)
+ except UnicodeDecodeError:
+ error(self.pos, "String decoding as '%s' failed. Consider using a byte string or unicode string explicitly, or adjust the source code encoding." % self.value.encoding)
+
return self
def generate_evaluation_code(self, code):
- self.result_code = code.get_py_string_const(self.value, True)
+ self.result_code = code.get_py_string_const(
+ self.value, identifier=self.is_identifier, is_str=True)
def get_constant_c_result_code(self):
return None
return self.value
+class IdentifierStringNode(StringNode):
+ # A special str value that represents an identifier (bytes in Py2,
+ # unicode in Py3).
+ is_identifier = True
+
+
class LongNode(AtomicExprNode):
# Python long integer literal
#
#define INLINE
#endif
-typedef struct {PyObject **p; char *s; long n; char is_unicode; char intern; char is_identifier;} __Pyx_StringTabEntry; /*proto*/
+typedef struct {PyObject **p; char *s; const long n; const char* encoding; const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/
"""
static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
while (t->p) {
#if PY_MAJOR_VERSION < 3
- if (t->is_unicode && (!t->is_identifier)) {
+ if (t->is_unicode) {
*t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
} else if (t->intern) {
*t->p = PyString_InternFromString(t->s);
*t->p = PyString_FromStringAndSize(t->s, t->n - 1);
}
#else /* Python 3+ has unicode identifiers */
- if (t->is_identifier || (t->is_unicode && t->intern)) {
- *t->p = PyUnicode_InternFromString(t->s);
- } else if (t->is_unicode) {
- *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+ if (t->is_unicode | t->is_str) {
+ if (t->intern) {
+ *t->p = PyUnicode_InternFromString(t->s);
+ } else if (t->encoding) {
+ *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
+ } else {
+ *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+ }
} else {
*t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
}
s.error("Expected an identifier before '='",
pos = arg.pos)
encoded_name = EncodedString(arg.name)
- keyword = ExprNodes.StringNode(arg.pos, value = encoded_name)
+ keyword = ExprNodes.IdentifierStringNode(arg.pos, value = encoded_name)
arg = p_simple_expr(s)
keyword_args.append((keyword, arg))
else:
else:
if as_name and "." in dotted_name:
name_list = ExprNodes.ListNode(pos, args = [
- ExprNodes.StringNode(pos, value = EncodedString("*"))])
+ ExprNodes.IdentifierStringNode(pos, value = EncodedString("*"))])
else:
name_list = None
stat = Nodes.SingleAssignmentNode(pos,
lhs = ExprNodes.NameNode(pos,
name = as_name or target_name),
rhs = ExprNodes.ImportNode(pos,
- module_name = ExprNodes.StringNode(
+ module_name = ExprNodes.IdentifierStringNode(
pos, value = dotted_name),
name_list = name_list))
stats.append(stat)
for (name_pos, name, as_name, kind) in imported_names:
encoded_name = EncodedString(name)
imported_name_strings.append(
- ExprNodes.StringNode(name_pos, value = encoded_name))
+ ExprNodes.IdentifierStringNode(name_pos, value = encoded_name))
items.append(
(name,
ExprNodes.NameNode(name_pos,
dotted_name = EncodedString(dotted_name)
return Nodes.FromImportStatNode(pos,
module = ExprNodes.ImportNode(dotted_name_pos,
- module_name = ExprNodes.StringNode(pos, value = dotted_name),
+ module_name = ExprNodes.IdentifierStringNode(pos, value = dotted_name),
name_list = import_list),
items = items)
parsed_type = True
else:
arg = p_simple_expr(s)
- keyword_node = ExprNodes.StringNode(
+ keyword_node = ExprNodes.IdentifierStringNode(
arg.pos, value = EncodedString(ident))
keyword_args.append((keyword_node, arg))
was_keyword = True
--- /dev/null
+# -*- coding: latin-1 -*-
+
+__doc__ = (u"""
+>>> a == 'abc'
+True
+>>> isinstance(a, str)
+True
+
+>>> isinstance(s, str)
+True
+>>> len(s)
+6
+>>> s == 'aäÄÖöo'
+True
+
+>>> isinstance(add(), str)
+True
+>>> len(add())
+9
+>>> add() == 'abcaäÄÖöo'
+True
+
+>>> isinstance(add_literal(), str)
+True
+>>> len(add_literal())
+9
+>>> add_literal() == 'abcaäÄÖöo'
+True
+
+>>> isinstance(typed(), str)
+True
+>>> len(typed())
+6
+>>> typed() == 'üüääöö'
+True
+
+"""
+# recoding/escaping is required to properly pass the literals to doctest
+).encode('unicode_escape').decode('ASCII')
+
+
+a = 'abc'
+s = 'aäÄÖöo'
+u = u'aäÄÖöo'
+
+cdef str S = 'üüääöö'
+
+def add():
+ return a+s
+
+def add_literal():
+ return 'abc' + 'aäÄÖöo'
+
+def typed():
+ return S