From: Stefan Behnel Date: Tue, 22 Apr 2008 14:37:33 +0000 (+0200) Subject: source code encoding support (PEP 263) and UTF-8 default source encoding (PEP 3120) X-Git-Tag: 0.9.6.14~20^2~25 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=70ea30b6d61f1ae3b25e0c285e12c2413ecb41dd;p=cython.git source code encoding support (PEP 263) and UTF-8 default source encoding (PEP 3120) --- diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py index e79ae49c..c2f5a475 100644 --- a/Cython/Compiler/ExprNodes.py +++ b/Cython/Compiler/ExprNodes.py @@ -18,6 +18,29 @@ from Cython.Debugging import print_call_chain from DebugFlags import debug_disposal_code, debug_temp_alloc, \ debug_coercion +class EncodedString(unicode): + # unicode string subclass to keep track of the original encoding. + # 'encoding' is None for unicode strings and the source encoding + # otherwise + encoding = None + + def byteencode(self): + assert self.encoding is not None + return self.encode(self.encoding) + + def utf8encode(self): + assert self.encoding is None + return self.encode("UTF-8") + + def is_unicode(self): + return self.encoding is None + is_unicode = property(is_unicode) + +# def __eq__(self, other): +# return unicode.__eq__(self, other) and \ +# getattr(other, 'encoding', '') == self.encoding + + class ExprNode(Node): # subexprs [string] Class var holding names of subexpr node attrs # type PyrexType Type of the result @@ -696,15 +719,16 @@ class StringNode(ConstNode): type = PyrexTypes.c_char_ptr_type def compile_time_value(self, denv): - return eval('"%s"' % self.value) + return self.value def analyse_types(self, env): self.entry = env.add_string_const(self.value) def coerce_to(self, dst_type, env): if dst_type.is_int: - if not self.type.is_pyobject and len(self.value) == 1: - return CharNode(self.pos, value=self.value) + if not self.type.is_pyobject and len(self.entry.init) == 1: + # we use the *encoded* value here + return CharNode(self.pos, value=self.entry.init) else: error(self.pos, "Only coerce single-character ascii strings can be used as ints.") return self diff --git a/Cython/Compiler/Main.py b/Cython/Compiler/Main.py index ca8dc88d..c49d170e 100644 --- a/Cython/Compiler/Main.py +++ b/Cython/Compiler/Main.py @@ -2,12 +2,11 @@ # Cython Top Level # -import os, sys, re +import os, sys, re, codecs if sys.version_info[:2] < (2, 2): print >>sys.stderr, "Sorry, Cython requires Python 2.2 or later" sys.exit(1) -import os from time import time import Version from Scanning import PyrexScanner @@ -138,10 +137,27 @@ class Context: self.modules[name] = scope return scope + match_file_encoding = re.compile("coding[:=]\s*([-\w.]+)").search + + def detect_file_encoding(self, source_filename): + # PEPs 263 and 3120 + f = codecs.open(source_filename, "rU", encoding="UTF-8") + try: + for line_no, line in enumerate(f): + encoding = self.match_file_encoding(line) + if encoding: + return encoding.group(1) + if line_no == 1: + break + finally: + f.close() + return "UTF-8" + def parse(self, source_filename, type_names, pxd, full_module_name): # Parse the given source file and return a parse tree. - f = open(source_filename, "rU") - s = PyrexScanner(f, source_filename, + encoding = self.detect_file_encoding(source_filename) + f = codecs.open(source_filename, "rU", encoding=encoding) + s = PyrexScanner(f, source_filename, source_encoding = encoding, type_names = type_names, context = self) try: tree = Parsing.p_module(s, pxd, full_module_name) diff --git a/Cython/Compiler/ModuleNode.py b/Cython/Compiler/ModuleNode.py index 73ec489a..4a58885a 100644 --- a/Cython/Compiler/ModuleNode.py +++ b/Cython/Compiler/ModuleNode.py @@ -1270,7 +1270,7 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode): entry.pystring_cname, entry.cname, entry.cname, - isinstance(entry.init, unicode) + entry.type.is_unicode )) code.putln( "{0, 0, 0, 0}") diff --git a/Cython/Compiler/Nodes.py b/Cython/Compiler/Nodes.py index eb593d2e..c17f332a 100644 --- a/Cython/Compiler/Nodes.py +++ b/Cython/Compiler/Nodes.py @@ -1199,7 +1199,7 @@ class DefNode(FuncDefNode): # args [CArgDeclNode] formal arguments # star_arg PyArgDeclNode or None * argument # starstar_arg PyArgDeclNode or None ** argument - # doc string or None + # doc EncodedString or None # body StatListNode # # The following subnode is constructed internally @@ -1358,12 +1358,15 @@ class DefNode(FuncDefNode): entry.pymethdef_cname = \ Naming.pymethdef_prefix + prefix + name if not Options.docstrings: - self.entry.doc = None + entry.doc = None else: if Options.embed_pos_in_docstring: - entry.doc = 'File: %s (starting at line %s)'%relative_position(self.pos) + doc = u'File: %s (starting at line %s)'%relative_position(self.pos) if not self.doc is None: - entry.doc = entry.doc + '\\n' + self.doc + doc = doc + u'\\n' + self.doc + doc = ExprNodes.EncodedString(doc) + doc.encoding = self.doc.encoding + entry.doc = doc else: entry.doc = self.doc entry.doc_cname = \ @@ -1920,8 +1923,9 @@ class PyClassDefNode(StatNode, BlockNode): self.dict = ExprNodes.DictNode(pos, key_value_pairs = []) if self.doc and Options.docstrings: if Options.embed_pos_in_docstring: - doc = 'File: %s (starting at line %s)'%relative_position(self.pos) - doc = doc + '\\n' + self.doc + doc = u'File: %s (starting at line %s)'%relative_position(self.pos) + doc = ExprNodes.EncodedString(doc + 'u\\n' + self.doc) + doc.encoding = self.doc.encoding doc_node = ExprNodes.StringNode(pos, value = doc) else: doc_node = None @@ -2073,7 +2077,7 @@ class PropertyNode(StatNode): # Definition of a property in an extension type. # # name string - # doc string or None Doc string + # doc EncodedString or None Doc string # body StatListNode child_attrs = ["body"] diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 93491c9e..c817ec37 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -281,8 +281,10 @@ def p_call(s, function): if not arg.is_name: s.error("Expected an identifier before '='", pos = arg.pos) + encoded_name = ExprNodes.EncodedString(arg.name) + encoded_name.encoding = s.source_encoding keyword = ExprNodes.StringNode(arg.pos, - value = arg.name) + value = encoded_name) arg = p_simple_expr(s) keyword_args.append((keyword, arg)) else: @@ -459,7 +461,7 @@ def p_atom(s): value = s.systring[:-1] s.next() return ExprNodes.ImagNode(pos, value = value) - elif sy == 'STRING' or sy == 'BEGIN_STRING': + elif sy == 'BEGIN_STRING': kind, value = p_cat_string_literal(s) if kind == 'c': return ExprNodes.CharNode(pos, value = value) @@ -500,7 +502,12 @@ def p_name(s, name): elif isinstance(value, float): return ExprNodes.FloatNode(pos, value = rep) elif isinstance(value, str): - return ExprNodes.StringNode(pos, value = rep[1:-1]) + sval = ExprNodes.EncodedString(rep[1:-1]) + sval.encoding = value.encoding + return ExprNodes.StringNode(pos, value = sval) + elif isinstance(value, unicode): + sval = ExprNodes.EncodedString(rep[2:-1]) + return ExprNodes.StringNode(pos, value = sval) else: error(pos, "Invalid type for compile-time constant: %s" % value.__class__.__name__) @@ -508,21 +515,25 @@ def p_name(s, name): def p_cat_string_literal(s): # A sequence of one or more adjacent string literals. - # Returns (kind, value) where kind in ('', 'c', 'r') + # Returns (kind, value) where kind in ('', 'c', 'r', 'u') kind, value = p_string_literal(s) if kind != 'c': strings = [value] - while s.sy == 'STRING' or s.sy == 'BEGIN_STRING': + while s.sy == 'BEGIN_STRING': next_kind, next_value = p_string_literal(s) if next_kind == 'c': self.error( "Cannot concatenate char literal with another string or char literal") + elif next_kind == 'u': + kind = 'u' strings.append(next_value) - value = ''.join(strings) + value = ExprNodes.EncodedString( u''.join(strings) ) + if kind != 'u': + value.encoding = s.source_encoding return kind, value def p_opt_string_literal(s): - if s.sy == 'STRING' or s.sy == 'BEGIN_STRING': + if s.sy == 'BEGIN_STRING': return p_string_literal(s) else: return None @@ -530,10 +541,6 @@ def p_opt_string_literal(s): def p_string_literal(s): # A single string or char literal. # Returns (kind, value) where kind in ('', 'c', 'r', 'u') - if s.sy == 'STRING': - value = unquote(s.systring) - s.next() - return value # s.sy == 'BEGIN_STRING' pos = s.position() #is_raw = s.systring[:1].lower() == "r" @@ -549,8 +556,6 @@ def p_string_literal(s): systr = s.systring if len(systr) == 1 and systr in "'\"\n": chars.append('\\') - if kind == 'u' and not isinstance(systr, unicode): - systr = systr.decode("UTF-8") chars.append(systr) elif sy == 'ESCAPE': systr = s.systring @@ -572,7 +577,8 @@ def p_string_literal(s): elif c in 'ux': if kind == 'u': try: - chars.append(systr.decode('unicode_escape')) + chars.append( + systr.encode("ASCII").decode('unicode_escape')) except UnicodeDecodeError: s.error("Invalid unicode escape '%s'" % systr, pos = pos) @@ -593,50 +599,12 @@ def p_string_literal(s): "Unexpected token %r:%r in string literal" % (sy, s.systring)) s.next() - value = ''.join(chars) + value = ExprNodes.EncodedString( u''.join(chars) ) + if kind != 'u': + value.encoding = s.source_encoding #print "p_string_literal: value =", repr(value) ### return kind, value -def unquote(s): - is_raw = 0 - if s[:1].lower() == "r": - is_raw = 1 - s = s[1:] - q = s[:3] - if q == '"""' or q == "'''": - s = s[3:-3] - else: - s = s[1:-1] - if is_raw: - s = s.replace('\\', '\\\\') - s = s.replace('\n', '\\\n') - else: - # Split into double quotes, newlines, escape sequences - # and spans of regular chars - l1 = re.split(r'((?:\\[0-7]{1,3})|(?:\\x[0-9A-Fa-f]{2})|(?:\\.)|(?:\\\n)|(?:\n)|")', s) - #print "unquote: l1 =", l1 ### - l2 = [] - for item in l1: - if item == '"' or item == '\n': - l2.append('\\' + item) - elif item == '\\\n': - pass - elif item[:1] == '\\': - if len(item) == 2: - if item[1] in '"\\abfnrtv': - l2.append(item) - else: - l2.append(item[1]) - elif item[1:2] == 'x': - l2.append('\\x0' + item[2:]) - else: - # octal escape - l2.append(item) - else: - l2.append(item) - s = "".join(l2) - return s - # list_display ::= "[" [listmaker] "]" # listmaker ::= expression ( list_for | ( "," expression )* [","] ) # list_iter ::= list_for | list_if @@ -946,6 +914,8 @@ def p_import_statement(s): ExprNodes.StringNode(pos, value = "*")]) else: name_list = None + dotted_name = ExprNodes.EncodedString(dotted_name) + dotted_name.encoding = s.source_encoding stat = Nodes.SingleAssignmentNode(pos, lhs = ExprNodes.NameNode(pos, name = as_name or target_name), @@ -984,14 +954,18 @@ def p_from_import_statement(s): imported_name_strings = [] items = [] for (name_pos, name, as_name) in imported_names: + encoded_name = ExprNodes.EncodedString(name) + encoded_name.encoding = s.source_encoding imported_name_strings.append( - ExprNodes.StringNode(name_pos, value = name)) + ExprNodes.StringNode(name_pos, value = encoded_name)) items.append( (name, ExprNodes.NameNode(name_pos, name = as_name or name))) import_list = ExprNodes.ListNode( imported_names[0][0], args = imported_name_strings) + dotted_name = ExprNodes.EncodedString(dotted_name) + dotted_name.encoding = s.source_encoding return Nodes.FromImportStatNode(pos, module = ExprNodes.ImportNode(dotted_name_pos, module_name = ExprNodes.StringNode(dotted_name_pos, @@ -1996,7 +1970,8 @@ def p_class_statement(s): # s.sy == 'class' pos = s.position() s.next() - class_name = p_ident(s) + class_name = ExprNodes.EncodedString( p_ident(s) ) + class_name.encoding = s.source_encoding if s.sy == '(': s.next() base_list = p_simple_expr_list(s) @@ -2113,7 +2088,7 @@ def p_property_decl(s): return Nodes.PropertyNode(pos, name = name, doc = doc, body = body) def p_doc_string(s): - if s.sy == 'STRING' or s.sy == 'BEGIN_STRING': + if s.sy == 'BEGIN_STRING': _, result = p_cat_string_literal(s) if s.sy != 'EOF': s.expect_newline("Syntax error in doc string") diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py index bf3e6f9f..d7427d4c 100644 --- a/Cython/Compiler/PyrexTypes.py +++ b/Cython/Compiler/PyrexTypes.py @@ -37,6 +37,7 @@ class PyrexType(BaseType): # is_enum boolean Is a C enum type # is_typedef boolean Is a typedef type # is_string boolean Is a C char * type + # is_unicode boolean Is a UTF-8 encoded C char * type # is_returncode boolean Is used only to signal exceptions # is_error boolean Is the dummy error type # has_attributes boolean Has C dot-selectable attributes @@ -83,6 +84,7 @@ class PyrexType(BaseType): is_enum = 0 is_typedef = 0 is_string = 0 + is_unicode = 0 is_returncode = 0 is_error = 0 has_attributes = 0 @@ -875,19 +877,49 @@ class CEnumType(CType): return self.base_declaration_code(public_decl(base, dll_linkage), entity_code) +def _escape_byte_string(s): + try: + s.decode("ASCII") + return s + except UnicodeDecodeError: + pass + l = [] + append = l.append + for c in s: + o = ord(c) + if o >= 128: + append('\\x%X' % o) + else: + append(c) + return ''.join(l) + class CStringType: # Mixin class for C string types. is_string = 1 + is_unicode = 0 to_py_function = "PyString_FromString" from_py_function = "PyString_AsString" exception_value = "NULL" def literal_code(self, value): - if isinstance(value, unicode): - value = value.encode("UTF-8") - return '"%s"' % value + assert isinstance(value, str) + return '"%s"' % _escape_byte_string(value) + + +class CUTF8StringType: + # Mixin class for C unicode types. + + is_string = 1 + is_unicode = 1 + + to_py_function = "PyUnicode_DecodeUTF8" + exception_value = "NULL" + + def literal_code(self, value): + assert isinstance(value, str) + return '"%s"' % _escape_byte_string(value) class CCharArrayType(CStringType, CArrayType): @@ -898,6 +930,16 @@ class CCharArrayType(CStringType, CArrayType): def __init__(self, size): CArrayType.__init__(self, c_char_type, size) + + +class CUTF8CharArrayType(CUTF8StringType, CArrayType): + # C 'char []' type. + + parsetuple_format = "s" + pymemberdef_typecode = "T_STRING_INPLACE" + + def __init__(self, size): + CArrayType.__init__(self, c_char_type, size) class CCharPtrType(CStringType, CPtrType): @@ -910,6 +952,16 @@ class CCharPtrType(CStringType, CPtrType): CPtrType.__init__(self, c_char_type) +class CUTF8CharPtrType(CUTF8StringType, CPtrType): + # C 'char *' type, encoded in UTF-8. + + parsetuple_format = "s" + pymemberdef_typecode = "T_STRING" + + def __init__(self): + CPtrType.__init__(self, c_char_type) + + class ErrorType(PyrexType): # Used to prevent propagation of error messages. @@ -974,7 +1026,9 @@ c_longdouble_type = CFloatType(8) c_null_ptr_type = CNullPtrType(c_void_type) c_char_array_type = CCharArrayType(None) +c_utf8_char_array_type = CUTF8CharArrayType(None) c_char_ptr_type = CCharPtrType() +c_utf8_char_ptr_type = CUTF8CharPtrType() c_char_ptr_ptr_type = CPtrType(c_char_ptr_type) c_int_ptr_type = CPtrType(c_int_type) diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py index e48c8dce..e91e343a 100644 --- a/Cython/Compiler/Scanning.py +++ b/Cython/Compiler/Scanning.py @@ -212,7 +212,7 @@ class PyrexScanner(Scanner): resword_dict = build_resword_dict() def __init__(self, file, filename, parent_scanner = None, - type_names = None, context = None): + type_names = None, context = None, source_encoding=None): Scanner.__init__(self, get_lexicon(), file, filename) if parent_scanner: self.context = parent_scanner.context @@ -226,6 +226,7 @@ class PyrexScanner(Scanner): self.compile_time_env = initial_compile_time_env() self.compile_time_eval = 1 self.compile_time_expr = 0 + self.source_encoding = source_encoding self.trace = trace_scanner self.indentation_stack = [0] self.indentation_char = None diff --git a/Cython/Compiler/Symtab.py b/Cython/Compiler/Symtab.py index 0a6e4293..4140d80f 100644 --- a/Cython/Compiler/Symtab.py +++ b/Cython/Compiler/Symtab.py @@ -434,15 +434,21 @@ class Scope: if not entry: entry = self.declare_var(name, py_object_type, None) return entry - + def add_string_const(self, value): # Add an entry for a string constant. cname = self.new_const_cname() - entry = Entry("", cname, c_char_array_type, init = value) + if value.is_unicode: + c_type = c_utf8_char_array_type + value = value.utf8encode() + else: + c_type = c_char_array_type + value = value.byteencode() + entry = Entry("", cname, c_type, init = value) entry.used = 1 self.const_entries.append(entry) return entry - + def get_string_const(self, value): # Get entry for string constant. Returns an existing # one if possible, otherwise creates a new one. @@ -452,7 +458,7 @@ class Scope: entry = self.add_string_const(value) genv.string_to_entry[value] = entry return entry - + def add_py_string(self, entry): # If not already done, allocate a C name for a Python version of # a string literal, and add it to the list of Python strings to @@ -460,7 +466,7 @@ class Scope: # Python identifier, it will be interned. if not entry.pystring_cname: value = entry.init - if identifier_pattern.match(value) and isinstance(value, str): + if not entry.type.is_unicode and identifier_pattern.match(value): entry.pystring_cname = self.intern(value) entry.is_interned = 1 else: