from DebugFlags import debug_disposal_code, debug_temp_alloc, \
debug_coercion
+class EncodedString(unicode):
+ # unicode string subclass to keep track of the original encoding.
+ # 'encoding' is None for unicode strings and the source encoding
+ # otherwise
+ encoding = None
+
+ def byteencode(self):
+ assert self.encoding is not None
+ return self.encode(self.encoding)
+
+ def utf8encode(self):
+ assert self.encoding is None
+ return self.encode("UTF-8")
+
+ def is_unicode(self):
+ return self.encoding is None
+ is_unicode = property(is_unicode)
+
+# def __eq__(self, other):
+# return unicode.__eq__(self, other) and \
+# getattr(other, 'encoding', '') == self.encoding
+
+
class ExprNode(Node):
# subexprs [string] Class var holding names of subexpr node attrs
# type PyrexType Type of the result
type = PyrexTypes.c_char_ptr_type
def compile_time_value(self, denv):
- return eval('"%s"' % self.value)
+ return self.value
def analyse_types(self, env):
self.entry = env.add_string_const(self.value)
def coerce_to(self, dst_type, env):
if dst_type.is_int:
- if not self.type.is_pyobject and len(self.value) == 1:
- return CharNode(self.pos, value=self.value)
+ if not self.type.is_pyobject and len(self.entry.init) == 1:
+ # we use the *encoded* value here
+ return CharNode(self.pos, value=self.entry.init)
else:
error(self.pos, "Only coerce single-character ascii strings can be used as ints.")
return self
# Cython Top Level
#
-import os, sys, re
+import os, sys, re, codecs
if sys.version_info[:2] < (2, 2):
print >>sys.stderr, "Sorry, Cython requires Python 2.2 or later"
sys.exit(1)
-import os
from time import time
import Version
from Scanning import PyrexScanner
self.modules[name] = scope
return scope
+ match_file_encoding = re.compile("coding[:=]\s*([-\w.]+)").search
+
+ def detect_file_encoding(self, source_filename):
+ # Return the encoding named by a "coding[:=]" declaration found on
+ # one of the first two lines of the file (PEPs 263 and 3120), or
+ # "UTF-8" when neither line carries one.
+ #
+ # The file is only probed here, so undecodable bytes are dropped
+ # instead of raising: the source may really be in another encoding
+ # (e.g. Latin-1), and its declaration must still be found rather
+ # than the probe failing with a UnicodeDecodeError.
+ f = codecs.open(source_filename, "rU", encoding="UTF-8",
+ errors="ignore")
+ try:
+ for line_no, line in enumerate(f):
+ encoding = self.match_file_encoding(line)
+ if encoding:
+ return encoding.group(1)
+ # only the first two lines (line_no 0 and 1) are inspected
+ if line_no == 1:
+ break
+ finally:
+ f.close()
+ return "UTF-8"
+
def parse(self, source_filename, type_names, pxd, full_module_name):
# Parse the given source file and return a parse tree.
- f = open(source_filename, "rU")
- s = PyrexScanner(f, source_filename,
+ encoding = self.detect_file_encoding(source_filename)
+ f = codecs.open(source_filename, "rU", encoding=encoding)
+ s = PyrexScanner(f, source_filename, source_encoding = encoding,
type_names = type_names, context = self)
try:
tree = Parsing.p_module(s, pxd, full_module_name)
entry.pystring_cname,
entry.cname,
entry.cname,
- isinstance(entry.init, unicode)
+ entry.type.is_unicode
))
code.putln(
"{0, 0, 0, 0}")
# args [CArgDeclNode] formal arguments
# star_arg PyArgDeclNode or None * argument
# starstar_arg PyArgDeclNode or None ** argument
- # doc string or None
+ # doc EncodedString or None
# body StatListNode
#
# The following subnode is constructed internally
entry.pymethdef_cname = \
Naming.pymethdef_prefix + prefix + name
if not Options.docstrings:
- self.entry.doc = None
+ entry.doc = None
else:
if Options.embed_pos_in_docstring:
- entry.doc = 'File: %s (starting at line %s)'%relative_position(self.pos)
+ doc = u'File: %s (starting at line %s)'%relative_position(self.pos)
if not self.doc is None:
- entry.doc = entry.doc + '\\n' + self.doc
+ doc = doc + u'\\n' + self.doc
+ doc = ExprNodes.EncodedString(doc)
+ doc.encoding = self.doc.encoding
+ entry.doc = doc
else:
entry.doc = self.doc
entry.doc_cname = \
self.dict = ExprNodes.DictNode(pos, key_value_pairs = [])
if self.doc and Options.docstrings:
if Options.embed_pos_in_docstring:
- doc = 'File: %s (starting at line %s)'%relative_position(self.pos)
- doc = doc + '\\n' + self.doc
+ doc = u'File: %s (starting at line %s)'%relative_position(self.pos)
+ # u'\\n' (unicode prefix outside the quotes) separates the position
+ # header from the original doc string
+ doc = ExprNodes.EncodedString(doc + u'\\n' + self.doc)
+ # the combined string keeps the encoding of the original doc string
+ doc.encoding = self.doc.encoding
doc_node = ExprNodes.StringNode(pos, value = doc)
else:
doc_node = None
# Definition of a property in an extension type.
#
# name string
- # doc string or None Doc string
+ # doc EncodedString or None Doc string
# body StatListNode
child_attrs = ["body"]
if not arg.is_name:
s.error("Expected an identifier before '='",
pos = arg.pos)
+ encoded_name = ExprNodes.EncodedString(arg.name)
+ encoded_name.encoding = s.source_encoding
keyword = ExprNodes.StringNode(arg.pos,
- value = arg.name)
+ value = encoded_name)
arg = p_simple_expr(s)
keyword_args.append((keyword, arg))
else:
value = s.systring[:-1]
s.next()
return ExprNodes.ImagNode(pos, value = value)
- elif sy == 'STRING' or sy == 'BEGIN_STRING':
+ elif sy == 'BEGIN_STRING':
kind, value = p_cat_string_literal(s)
if kind == 'c':
return ExprNodes.CharNode(pos, value = value)
elif isinstance(value, float):
return ExprNodes.FloatNode(pos, value = rep)
elif isinstance(value, str):
- return ExprNodes.StringNode(pos, value = rep[1:-1])
+ sval = ExprNodes.EncodedString(rep[1:-1])
+ sval.encoding = value.encoding
+ return ExprNodes.StringNode(pos, value = sval)
+ elif isinstance(value, unicode):
+ sval = ExprNodes.EncodedString(rep[2:-1])
+ return ExprNodes.StringNode(pos, value = sval)
else:
error(pos, "Invalid type for compile-time constant: %s"
% value.__class__.__name__)
def p_cat_string_literal(s):
# A sequence of one or more adjacent string literals.
- # Returns (kind, value) where kind in ('', 'c', 'r')
+ # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
kind, value = p_string_literal(s)
if kind != 'c':
strings = [value]
- while s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+ while s.sy == 'BEGIN_STRING':
next_kind, next_value = p_string_literal(s)
if next_kind == 'c':
- self.error(
+ # report through the scanner 's'; this function has no 'self'
+ s.error(
"Cannot concatenate char literal with another string or char literal")
+ elif next_kind == 'u':
+ kind = 'u'
strings.append(next_value)
- value = ''.join(strings)
+ value = ExprNodes.EncodedString( u''.join(strings) )
+ if kind != 'u':
+ value.encoding = s.source_encoding
return kind, value
def p_opt_string_literal(s):
- if s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+ if s.sy == 'BEGIN_STRING':
return p_string_literal(s)
else:
return None
def p_string_literal(s):
# A single string or char literal.
# Returns (kind, value) where kind in ('', 'c', 'r', 'u')
- if s.sy == 'STRING':
- value = unquote(s.systring)
- s.next()
- return value
# s.sy == 'BEGIN_STRING'
pos = s.position()
#is_raw = s.systring[:1].lower() == "r"
systr = s.systring
if len(systr) == 1 and systr in "'\"\n":
chars.append('\\')
- if kind == 'u' and not isinstance(systr, unicode):
- systr = systr.decode("UTF-8")
chars.append(systr)
elif sy == 'ESCAPE':
systr = s.systring
elif c in 'ux':
if kind == 'u':
try:
- chars.append(systr.decode('unicode_escape'))
+ chars.append(
+ systr.encode("ASCII").decode('unicode_escape'))
except UnicodeDecodeError:
s.error("Invalid unicode escape '%s'" % systr,
pos = pos)
"Unexpected token %r:%r in string literal" %
(sy, s.systring))
s.next()
- value = ''.join(chars)
+ value = ExprNodes.EncodedString( u''.join(chars) )
+ if kind != 'u':
+ value.encoding = s.source_encoding
#print "p_string_literal: value =", repr(value) ###
return kind, value
-def unquote(s):
- is_raw = 0
- if s[:1].lower() == "r":
- is_raw = 1
- s = s[1:]
- q = s[:3]
- if q == '"""' or q == "'''":
- s = s[3:-3]
- else:
- s = s[1:-1]
- if is_raw:
- s = s.replace('\\', '\\\\')
- s = s.replace('\n', '\\\n')
- else:
- # Split into double quotes, newlines, escape sequences
- # and spans of regular chars
- l1 = re.split(r'((?:\\[0-7]{1,3})|(?:\\x[0-9A-Fa-f]{2})|(?:\\.)|(?:\\\n)|(?:\n)|")', s)
- #print "unquote: l1 =", l1 ###
- l2 = []
- for item in l1:
- if item == '"' or item == '\n':
- l2.append('\\' + item)
- elif item == '\\\n':
- pass
- elif item[:1] == '\\':
- if len(item) == 2:
- if item[1] in '"\\abfnrtv':
- l2.append(item)
- else:
- l2.append(item[1])
- elif item[1:2] == 'x':
- l2.append('\\x0' + item[2:])
- else:
- # octal escape
- l2.append(item)
- else:
- l2.append(item)
- s = "".join(l2)
- return s
-
# list_display ::= "[" [listmaker] "]"
# listmaker ::= expression ( list_for | ( "," expression )* [","] )
# list_iter ::= list_for | list_if
ExprNodes.StringNode(pos, value = "*")])
else:
name_list = None
+ dotted_name = ExprNodes.EncodedString(dotted_name)
+ dotted_name.encoding = s.source_encoding
stat = Nodes.SingleAssignmentNode(pos,
lhs = ExprNodes.NameNode(pos,
name = as_name or target_name),
imported_name_strings = []
items = []
for (name_pos, name, as_name) in imported_names:
+ encoded_name = ExprNodes.EncodedString(name)
+ encoded_name.encoding = s.source_encoding
imported_name_strings.append(
- ExprNodes.StringNode(name_pos, value = name))
+ ExprNodes.StringNode(name_pos, value = encoded_name))
items.append(
(name,
ExprNodes.NameNode(name_pos,
name = as_name or name)))
import_list = ExprNodes.ListNode(
imported_names[0][0], args = imported_name_strings)
+ dotted_name = ExprNodes.EncodedString(dotted_name)
+ dotted_name.encoding = s.source_encoding
return Nodes.FromImportStatNode(pos,
module = ExprNodes.ImportNode(dotted_name_pos,
module_name = ExprNodes.StringNode(dotted_name_pos,
# s.sy == 'class'
pos = s.position()
s.next()
- class_name = p_ident(s)
+ class_name = ExprNodes.EncodedString( p_ident(s) )
+ class_name.encoding = s.source_encoding
if s.sy == '(':
s.next()
base_list = p_simple_expr_list(s)
return Nodes.PropertyNode(pos, name = name, doc = doc, body = body)
def p_doc_string(s):
- if s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+ if s.sy == 'BEGIN_STRING':
_, result = p_cat_string_literal(s)
if s.sy != 'EOF':
s.expect_newline("Syntax error in doc string")
# is_enum boolean Is a C enum type
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
+ # is_unicode boolean Is a UTF-8 encoded C char * type
# is_returncode boolean Is used only to signal exceptions
# is_error boolean Is the dummy error type
# has_attributes boolean Has C dot-selectable attributes
is_enum = 0
is_typedef = 0
is_string = 0
+ is_unicode = 0
is_returncode = 0
is_error = 0
has_attributes = 0
return self.base_declaration_code(public_decl(base, dll_linkage), entity_code)
+def _escape_byte_string(s):
+ # Return the byte string 's' with every byte >= 128 replaced by a
+ # C escape sequence. Pure ASCII input is returned unchanged.
+ try:
+ s.decode("ASCII")
+ return s
+ except UnicodeDecodeError:
+ pass
+ l = []
+ append = l.append
+ for c in s:
+ o = ord(c)
+ if o >= 128:
+ # Three-digit octal escape: C ends an octal escape after at most
+ # three digits, whereas a hex escape ("\xE9") has no length limit
+ # and would absorb a following hex-digit character ("\xE9a").
+ append('\\%03o' % o)
+ else:
+ append(c)
+ return ''.join(l)
+
class CStringType:
# Mixin class for C string types.
is_string = 1
+ is_unicode = 0
to_py_function = "PyString_FromString"
from_py_function = "PyString_AsString"
exception_value = "NULL"
def literal_code(self, value):
- if isinstance(value, unicode):
- value = value.encode("UTF-8")
- return '"%s"' % value
+ assert isinstance(value, str)
+ return '"%s"' % _escape_byte_string(value)
+
+
+class CUTF8StringType:
+ # Mixin class for C unicode types.
+
+ is_string = 1
+ is_unicode = 1
+
+ to_py_function = "PyUnicode_DecodeUTF8"
+ exception_value = "NULL"
+
+ def literal_code(self, value):
+ assert isinstance(value, str)
+ return '"%s"' % _escape_byte_string(value)
class CCharArrayType(CStringType, CArrayType):
def __init__(self, size):
CArrayType.__init__(self, c_char_type, size)
+
+
+class CUTF8CharArrayType(CUTF8StringType, CArrayType):
+ # C 'char []' type.
+
+ parsetuple_format = "s"
+ pymemberdef_typecode = "T_STRING_INPLACE"
+
+ def __init__(self, size):
+ CArrayType.__init__(self, c_char_type, size)
class CCharPtrType(CStringType, CPtrType):
CPtrType.__init__(self, c_char_type)
+class CUTF8CharPtrType(CUTF8StringType, CPtrType):
+ # C 'char *' type, encoded in UTF-8.
+
+ parsetuple_format = "s"
+ pymemberdef_typecode = "T_STRING"
+
+ def __init__(self):
+ CPtrType.__init__(self, c_char_type)
+
+
class ErrorType(PyrexType):
# Used to prevent propagation of error messages.
c_null_ptr_type = CNullPtrType(c_void_type)
c_char_array_type = CCharArrayType(None)
+c_utf8_char_array_type = CUTF8CharArrayType(None)
c_char_ptr_type = CCharPtrType()
+c_utf8_char_ptr_type = CUTF8CharPtrType()
c_char_ptr_ptr_type = CPtrType(c_char_ptr_type)
c_int_ptr_type = CPtrType(c_int_type)
resword_dict = build_resword_dict()
def __init__(self, file, filename, parent_scanner = None,
- type_names = None, context = None):
+ type_names = None, context = None, source_encoding=None):
Scanner.__init__(self, get_lexicon(), file, filename)
if parent_scanner:
self.context = parent_scanner.context
self.compile_time_env = initial_compile_time_env()
self.compile_time_eval = 1
self.compile_time_expr = 0
+ self.source_encoding = source_encoding
self.trace = trace_scanner
self.indentation_stack = [0]
self.indentation_char = None
if not entry:
entry = self.declare_var(name, py_object_type, None)
return entry
-
+
def add_string_const(self, value):
# Add an entry for a string constant.
cname = self.new_const_cname()
- entry = Entry("", cname, c_char_array_type, init = value)
+ if value.is_unicode:
+ c_type = c_utf8_char_array_type
+ value = value.utf8encode()
+ else:
+ c_type = c_char_array_type
+ value = value.byteencode()
+ entry = Entry("", cname, c_type, init = value)
entry.used = 1
self.const_entries.append(entry)
return entry
-
+
def get_string_const(self, value):
# Get entry for string constant. Returns an existing
# one if possible, otherwise creates a new one.
entry = self.add_string_const(value)
genv.string_to_entry[value] = entry
return entry
-
+
def add_py_string(self, entry):
# If not already done, allocate a C name for a Python version of
# a string literal, and add it to the list of Python strings to
# Python identifier, it will be interned.
if not entry.pystring_cname:
value = entry.init
- if identifier_pattern.match(value) and isinstance(value, str):
+ if not entry.type.is_unicode and identifier_pattern.match(value):
entry.pystring_cname = self.intern(value)
entry.is_interned = 1
else: