From: Stefan Behnel <scoder@users.berlios.de>
Date: Fri, 15 Aug 2008 02:41:09 +0000 (+0200)
Subject: Rewrite of the string literal handling code
X-Git-Tag: 0.9.8.1~47
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=2e8a0084bb1aa86b27877bfb117cbdad023bbdd9;p=cython.git

Rewrite of the string literal handling code

String literals pass through the compiler as follows:
- unicode string literals are stored as unicode strings and encoded to UTF-8 on the way out
- byte string literals are stored as correctly encoded byte strings by unescaping the source string literal into the corresponding byte sequence. No further encoding is done later on!
- char literals are stored as byte strings of length 1. This can be verified by the parser now, e.g. a non-ASCII char literal in UTF-8 source code will result in an error, as it would end up as two or more bytes in the C code, which can no longer be represented as a C char.

Storing byte strings is necessary as we otherwise lose the ability to encode byte string literals on the way out. They do not necessarily contain only bytes that fit into the source code encoding as the source can use escape sequences to represent them. Previously, ASCII encoded source code could not contain byte string literals with properly escaped non-ASCII bytes.

Another bug that was fixed: in Python, escape sequences behave differently in unicode strings (where they represent the character code) and byte strings (where they represent a byte value). Previously, they resulted in the same byte value in Cython code. This is only a problem for non-ASCII escapes, since the character code and the byte value of ASCII characters are identical.
---

diff --git a/Cython/Compiler/Buffer.py b/Cython/Compiler/Buffer.py
index 7beeca23..637ff9a5 100644
--- a/Cython/Compiler/Buffer.py
+++ b/Cython/Compiler/Buffer.py
@@ -3,7 +3,7 @@ from Cython.Compiler.ModuleNode import ModuleNode
 from Cython.Compiler.Nodes import *
 from Cython.Compiler.ExprNodes import *
 from Cython.Compiler.TreeFragment import TreeFragment
-from Cython.Utils import EncodedString
+from Cython.Compiler.StringEncoding import EncodedString
 from Cython.Compiler.Errors import CompileError
 import Interpreter
 import PyrexTypes
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index 9ab4d664..a1b7982e 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -6,6 +6,7 @@ import operator
 from string import join
 
 from Errors import error, warning, InternalError
+import StringEncoding
 import Naming
 from Nodes import Node
 import PyrexTypes
@@ -14,7 +15,6 @@ from Builtin import list_type, tuple_type, dict_type, unicode_type
 import Symtab
 import Options
 from Annotate import AnnotationItem
-from Cython import Utils
 
 from Cython.Debugging import print_call_chain
 from DebugFlags import debug_disposal_code, debug_temp_alloc, \
@@ -640,10 +640,10 @@ class CharNode(ConstNode):
     type = PyrexTypes.c_char_type
     
     def compile_time_value(self, denv):
-        return ord(self.value.byteencode())
+        return ord(self.value)
     
     def calculate_result_code(self):
-        return "'%s'" % Utils.escape_character(self.value.byteencode())
+        return "'%s'" % StringEncoding.escape_character(self.value)
 
 
 class IntNode(ConstNode):
diff --git a/Cython/Compiler/Main.py b/Cython/Compiler/Main.py
index 2d8c3dd6..723d2547 100644
--- a/Cython/Compiler/Main.py
+++ b/Cython/Compiler/Main.py
@@ -397,6 +397,8 @@ class Context:
             finally:
                 f.close()
         except UnicodeDecodeError, msg:
+            import traceback
+            traceback.print_exc()
             error((source_desc, 0, 0), "Decoding error, missing or incorrect coding=<encoding-name> at top of source (%s)" % msg)
         if Errors.num_errors > 0:
             raise CompileError
diff --git a/Cython/Compiler/ModuleNode.py b/Cython/Compiler/ModuleNode.py
index 60071619..efc56c56 100644
--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
@@ -23,7 +23,8 @@ import Version
 
 from Errors import error, warning
 from PyrexTypes import py_object_type
-from Cython.Utils import open_new_file, replace_suffix, escape_byte_string, EncodedString
+from Cython.Utils import open_new_file, replace_suffix
+from StringEncoding import escape_byte_string, EncodedString
 
 
 def check_c_classes(module_node):
@@ -514,9 +515,12 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
         code.putln('static const char *%s;' % Naming.filename_cname)
         code.putln('static const char **%s;' % Naming.filetable_cname)
         if env.doc:
+            docstr = env.doc
+            if not isinstance(docstr, str):
+                docstr = docstr.utf8encode()
             code.putln('')
             code.putln('static char %s[] = "%s";' % (
-                    env.doc_cname, escape_byte_string(env.doc.utf8encode())))
+                    env.doc_cname, escape_byte_string(docstr)))
     
     def generate_extern_c_macro_definition(self, code):
         name = Naming.extern_c_macro
diff --git a/Cython/Compiler/Nodes.py b/Cython/Compiler/Nodes.py
index 2b299a23..f10d278d 100644
--- a/Cython/Compiler/Nodes.py
+++ b/Cython/Compiler/Nodes.py
@@ -13,7 +13,7 @@ from PyrexTypes import py_object_type, error_type, CTypedefType, CFuncType
 from Symtab import ModuleScope, LocalScope, GeneratorLocalScope, \
     StructOrUnionScope, PyClassScope, CClassScope
 from Cython.Utils import open_new_file, replace_suffix
-from Cython.Utils import EncodedString, escape_byte_string
+from StringEncoding import EncodedString, escape_byte_string
 import Options
 import ControlFlow
 
@@ -1516,10 +1516,13 @@ class DefNode(FuncDefNode):
         if proto_only:
             return
         if self.entry.doc and Options.docstrings:
+            docstr = self.entry.doc
+            if not isinstance(docstr, str):
+                docstr = docstr.utf8encode()
             code.putln(
                 'static char %s[] = "%s";' % (
                     self.entry.doc_cname,
-                    escape_byte_string(self.entry.doc.utf8encode())))
+                    escape_byte_string(docstr)))
         if with_pymethdef:
             code.put(
                 "static PyMethodDef %s = " % 
diff --git a/Cython/Compiler/ParseTreeTransforms.py b/Cython/Compiler/ParseTreeTransforms.py
index b6bb6509..5150797c 100644
--- a/Cython/Compiler/ParseTreeTransforms.py
+++ b/Cython/Compiler/ParseTreeTransforms.py
@@ -3,7 +3,7 @@ from Cython.Compiler.ModuleNode import ModuleNode
 from Cython.Compiler.Nodes import *
 from Cython.Compiler.ExprNodes import *
 from Cython.Compiler.TreeFragment import TreeFragment
-from Cython.Utils import EncodedString
+from Cython.Compiler.StringEncoding import EncodedString
 from Cython.Compiler.Errors import CompileError
 try:
     set
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index d2863db3..5d3009a6 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -9,6 +9,8 @@ from types import ListType, TupleType
 from Scanning import PyrexScanner, FileSourceDescriptor
 import Nodes
 import ExprNodes
+import StringEncoding
+from StringEncoding import EncodedString, BytesLiteral
 from ModuleNode import ModuleNode
 from Errors import error, warning, InternalError
 from Cython import Utils
@@ -280,7 +282,7 @@ def p_trailer(s, node1):
         return p_index(s, node1)
     else: # s.sy == '.'
         s.next()
-        name = Utils.EncodedString( p_ident(s) )
+        name = EncodedString( p_ident(s) )
         return ExprNodes.AttributeNode(pos, 
             obj = node1, attribute = name)
 
@@ -302,7 +304,7 @@ def p_call(s, function):
             if not arg.is_name:
                 s.error("Expected an identifier before '='",
                     pos = arg.pos)
-            encoded_name = Utils.EncodedString(arg.name)
+            encoded_name = EncodedString(arg.name)
             keyword = ExprNodes.IdentifierStringNode(arg.pos, 
                 value = encoded_name)
             arg = p_simple_expr(s)
@@ -498,7 +500,7 @@ def p_atom(s):
         else:
             return ExprNodes.StringNode(pos, value = value)
     elif sy == 'IDENT':
-        name = Utils.EncodedString( s.systring )
+        name = EncodedString( s.systring )
         s.next()
         if name == "None":
             return ExprNodes.NoneNode(pos)
@@ -533,6 +535,8 @@ def p_name(s, name):
                 return ExprNodes.FloatNode(pos, value = rep)
             elif isinstance(value, unicode):
                 return ExprNodes.StringNode(pos, value = value)
+            elif isinstance(value, str):
+                return ExprNodes.StringNode(pos, value = value)
             else:
                 error(pos, "Invalid type for compile-time constant: %s"
                     % value.__class__.__name__)
@@ -549,11 +553,21 @@ def p_cat_string_literal(s):
             if next_kind == 'c':
                 error(s.position(),
                       "Cannot concatenate char literal with another string or char literal")
-            elif next_kind == 'u':
+            elif next_kind != kind:
+                # we have to switch to unicode now
+                if kind == 'b':
+                    # concatenating a unicode string to byte strings
+                    strings = [u''.join([s.decode(s.encoding) for s in strings])]
+                elif kind == 'u':
+                    # concatenating a byte string to unicode strings
+                    strings.append(next_value.decode(next_value.encoding))
                 kind = 'u'
-            strings.append(next_value)
-        value = Utils.EncodedString( u''.join(strings) )
-        if kind != 'u':
+            else:
+                strings.append(next_value)
+        if kind == 'u':
+            value = EncodedString( u''.join(strings) )
+        else:
+            value = BytesLiteral( ''.join(strings) )
             value.encoding = s.source_encoding
     return kind, value
 
@@ -582,7 +596,10 @@ def p_string_literal(s):
             kind = 'u'
     elif kind == '':
         kind = 'b'
-    chars = []
+    if kind == 'u':
+        chars = StringEncoding.UnicodeLiteralBuilder()
+    else:
+        chars = StringEncoding.BytesLiteralBuilder(s.source_encoding)
     while 1:
         s.next()
         sy = s.sy
@@ -590,41 +607,46 @@ def p_string_literal(s):
         if sy == 'CHARS':
             chars.append(s.systring)
         elif sy == 'ESCAPE':
+            has_escape = True
             systr = s.systring
             if is_raw:
-                if systr == '\\\n':
-                    chars.append('\\\n')
-                elif systr == '\\\"':
-                    chars.append('"')
-                elif systr == '\\\'':
-                    chars.append("'")
+                if systr == u'\\\n':
+                    chars.append(u'\\\n')
+                elif systr == u'\\\"':
+                    chars.append(u'"')
+                elif systr == u'\\\'':
+                    chars.append(u"'")
                 else:
                     chars.append(systr)
             else:
                 c = systr[1]
-                if c in "01234567":
-                    chars.append(chr(int(systr[1:], 8)))
-                elif c in "'\"\\":
+                if c in u"01234567":
+                    chars.append_charval( int(systr[1:], 8) )
+                elif c in u"'\"\\":
                     chars.append(c)
-                elif c in "abfnrtv":
-                    chars.append(Utils.char_from_escape_sequence(systr))
-                elif c == '\n':
+                elif c in u"abfnrtv":
+                    chars.append(
+                        StringEncoding.char_from_escape_sequence(systr))
+                elif c == u'\n':
                     pass
-                elif c in 'Uux':
+                elif c in u'Uux':
                     if kind == 'u' or c == 'x':
                         chrval = int(systr[2:], 16)
                         if chrval > 1114111: # sys.maxunicode:
                             s.error("Invalid unicode escape '%s'" % systr,
                                     pos = pos)
-                        strval = unichr(chrval)
+                        elif chrval > 65535:
+                            warning(s.position(),
+                                    "Unicode characters above 65535 are not "
+                                    "necessarily portable across Python installations", 1)
+                        chars.append_charval(chrval)
                     else:
                         # unicode escapes in plain byte strings are not unescaped
-                        strval = systr
-                    chars.append(strval)
+                        chars.append(systr)
                 else:
-                    chars.append('\\' + systr[1:])
+                    chars.append(u'\\' + systr[1:])
         elif sy == 'NEWLINE':
-            chars.append('\n')
+            chars.append(u'\n')
         elif sy == 'END_STRING':
             break
         elif sy == 'EOF':
@@ -633,13 +655,13 @@ def p_string_literal(s):
             s.error(
                 "Unexpected token %r:%r in string literal" %
                     (sy, s.systring))
-    string = u''.join(chars)
-    if kind == 'c' and len(string) != 1:
-        error(pos, u"invalid character literal: %r" % string)
+    if kind == 'c':
+        value = chars.getchar()
+        if len(value) != 1:
+            error(pos, u"invalid character literal: %r" % value)
+    else:
+        value = chars.getstring()
     s.next()
-    value = Utils.EncodedString(string)
-    if kind != 'u':
-        value.encoding = s.source_encoding
     #print "p_string_literal: value =", repr(value) ###
     return kind, value
 
@@ -943,7 +965,7 @@ def p_import_statement(s):
         items.append(p_dotted_name(s, as_allowed = 1))
     stats = []
     for pos, target_name, dotted_name, as_name in items:
-        dotted_name = Utils.EncodedString(dotted_name)
+        dotted_name = EncodedString(dotted_name)
         if kind == 'cimport':
             stat = Nodes.CImportStatNode(pos, 
                 module_name = dotted_name,
@@ -951,7 +973,7 @@ def p_import_statement(s):
         else:
             if as_name and "." in dotted_name:
                 name_list = ExprNodes.ListNode(pos, args = [
-                    ExprNodes.StringNode(pos, value = Utils.EncodedString("*"))])
+                    ExprNodes.StringNode(pos, value = EncodedString("*"))])
             else:
                 name_list = None
             stat = Nodes.SingleAssignmentNode(pos,
@@ -984,7 +1006,7 @@ def p_from_import_statement(s, first_statement = 0):
     while s.sy == ',':
         s.next()
         imported_names.append(p_imported_name(s, is_cimport))
-    dotted_name = Utils.EncodedString(dotted_name)
+    dotted_name = EncodedString(dotted_name)
     if dotted_name == '__future__':
         if not first_statement:
             s.error("from __future__ imports must occur at the beginning of the file")
@@ -1011,7 +1033,7 @@ def p_from_import_statement(s, first_statement = 0):
         imported_name_strings = []
         items = []
         for (name_pos, name, as_name, kind) in imported_names:
-            encoded_name = Utils.EncodedString(name)
+            encoded_name = EncodedString(name)
             imported_name_strings.append(
                 ExprNodes.IdentifierStringNode(name_pos, value = encoded_name))
             items.append(
@@ -1020,7 +1042,7 @@ def p_from_import_statement(s, first_statement = 0):
                                     name = as_name or name)))
         import_list = ExprNodes.ListNode(
             imported_names[0][0], args = imported_name_strings)
-        dotted_name = Utils.EncodedString(dotted_name)
+        dotted_name = EncodedString(dotted_name)
         return Nodes.FromImportStatNode(pos,
             module = ExprNodes.ImportNode(dotted_name_pos,
                 module_name = ExprNodes.IdentifierStringNode(pos, value = dotted_name),
@@ -1520,7 +1542,7 @@ def p_positional_and_keyword_args(s, end_sy_set, type_positions=(), type_keyword
                 else:
                     arg = p_simple_expr(s)
                 keyword_node = ExprNodes.IdentifierStringNode(arg.pos,
-                                value = Utils.EncodedString(ident))
+                                value = EncodedString(ident))
                 keyword_args.append((keyword_node, arg))
                 was_keyword = True
             else:
@@ -2136,10 +2158,10 @@ def p_decorators(s):
         s.next()
         decstring = p_dotted_name(s, as_allowed=0)[2]
         names = decstring.split('.')
-        decorator = ExprNodes.NameNode(pos, name=Utils.EncodedString(names[0]))
+        decorator = ExprNodes.NameNode(pos, name=EncodedString(names[0]))
         for name in names[1:]:
             decorator = ExprNodes.AttributeNode(pos,
-                                           attribute=Utils.EncodedString(name),
+                                           attribute=EncodedString(name),
                                            obj=decorator)
         if s.sy == '(':
             decorator = p_call(s, decorator)
@@ -2187,7 +2209,7 @@ def p_class_statement(s):
     # s.sy == 'class'
     pos = s.position()
     s.next()
-    class_name = Utils.EncodedString( p_ident(s) )
+    class_name = EncodedString( p_ident(s) )
     class_name.encoding = s.source_encoding
     if s.sy == '(':
         s.next()
diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py
index 46df500f..e2e7b455 100644
--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -2,7 +2,7 @@
 #   Pyrex - Types
 #
 
-from Cython import Utils
+import StringEncoding
 import Naming
 import copy
 
@@ -1000,7 +1000,7 @@ class CStringType:
 
     def literal_code(self, value):
         assert isinstance(value, str)
-        return '"%s"' % Utils.escape_byte_string(value)
+        return '"%s"' % StringEncoding.escape_byte_string(value)
 
 
 class CUTF8CharArrayType(CStringType, CArrayType):
diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py
index 05850274..618168d8 100644
--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -17,7 +17,7 @@ from Cython.Plex.Errors import UnrecognizedInput
 from Errors import CompileError, error
 from Lexicon import string_prefixes, raw_prefixes, make_lexicon
 
-from Cython import Utils
+from StringEncoding import EncodedString
 
 plex_version = getattr(Plex, '_version', None)
 #print "Plex version:", plex_version ###
@@ -413,7 +413,7 @@ class PyrexScanner(Scanner):
             if systring in self.resword_dict:
                 sy = systring
             else:
-                systring = Utils.EncodedString(systring)
+                systring = EncodedString(systring)
                 systring.encoding = self.source_encoding
         self.sy = sy
         self.systring = systring
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
new file mode 100644
index 00000000..b3475802
--- /dev/null
+++ b/Cython/Compiler/StringEncoding.py
@@ -0,0 +1,144 @@
+#
+#   Cython -- encoding related tools
+#
+
+import re
+
+class UnicodeLiteralBuilder(object):
+    """Assemble a unicode string.
+    """
+    def __init__(self):
+        self.chars = []
+
+    def append(self, characters):
+        if isinstance(characters, str):
+            # this came from a Py2 string literal in the parser code
+            characters = characters.decode("ASCII")
+        assert isinstance(characters, unicode), str(type(characters))
+        self.chars.append(characters)
+
+    def append_charval(self, char_number):
+        self.chars.append( unichr(char_number) )
+
+    def getstring(self):
+        return EncodedString(u''.join(self.chars))
+
+
+class BytesLiteralBuilder(object):
+    """Assemble a byte string or char value.
+    """
+    def __init__(self, target_encoding):
+        self.chars = []
+        self.target_encoding = target_encoding
+
+    def append(self, characters):
+        if isinstance(characters, unicode):
+            characters = characters.encode(self.target_encoding)
+        assert isinstance(characters, str), str(type(characters))
+        self.chars.append(characters)
+
+    def append_charval(self, char_number):
+        self.chars.append( chr(char_number) )
+
+    def getstring(self):
+        # this *must* return a byte string! => fix it in Py3k!!
+        s = BytesLiteral(''.join(self.chars))
+        s.encoding = self.target_encoding
+        return s
+
+    def getchar(self):
+        # this *must* return a byte string! => fix it in Py3k!!
+        return self.getstring()
+
+class EncodedString(unicode):
+    # unicode string subclass to keep track of the original encoding.
+    # 'encoding' is None for unicode strings and the source encoding
+    # otherwise
+    encoding = None
+
+    def byteencode(self):
+        assert self.encoding is not None
+        return self.encode(self.encoding)
+
+    def utf8encode(self):
+        assert self.encoding is None
+        return self.encode("UTF-8")
+
+    def is_unicode(self):
+        return self.encoding is None
+    is_unicode = property(is_unicode)
+
+class BytesLiteral(str):
+    # str subclass that is compatible with EncodedString
+    encoding = None
+
+    def byteencode(self):
+        return str(self)
+
+    def utf8encode(self):
+        assert False, "this is not a unicode string: %r" % self
+
+    is_unicode = False
+
+char_from_escape_sequence = {
+    r'\a' : u'\a',
+    r'\b' : u'\b',
+    r'\f' : u'\f',
+    r'\n' : u'\n',
+    r'\r' : u'\r',
+    r'\t' : u'\t',
+    r'\v' : u'\v',
+    }.get
+
+def _to_escape_sequence(s):
+    if s in '\n\r\t':
+        return repr(s)[1:-1]
+    elif s == '"':
+        return r'\"'
+    else:
+        # within a character sequence, oct passes much better than hex
+        return ''.join(['\\%03o' % ord(c) for c in s])
+
+_c_special = ('\0', '\n', '\r', '\t', '??', '"')
+_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
+
+def _build_specials_test():
+    subexps = []
+    for special in _c_special:
+        regexp = ''.join(['[%s]' % c for c in special])
+        subexps.append(regexp)
+    return re.compile('|'.join(subexps)).search
+
+_has_specials = _build_specials_test()
+
+def escape_character(c):
+    if c in '\n\r\t\\':
+        return repr(c)[1:-1]
+    elif c == "'":
+        return "\\'"
+    n = ord(c)
+    if n < 32 or n > 127:
+        # hex works well for characters
+        return "\\x%02X" % n
+    else:
+        return c
+
+def escape_byte_string(s):
+    s = s.replace('\\', '\\\\')
+    if _has_specials(s):
+        for special, replacement in _c_special_replacements:
+            s = s.replace(special, replacement)
+    try:
+        s.decode("ASCII")
+        return s
+    except UnicodeDecodeError:
+        pass
+    l = []
+    append = l.append
+    for c in s:
+        o = ord(c)
+        if o >= 128:
+            append('\\%3o' % o)
+        else:
+            append(c)
+    return ''.join(l)
diff --git a/Cython/Compiler/Symtab.py b/Cython/Compiler/Symtab.py
index 5c6de053..5c70dcb3 100644
--- a/Cython/Compiler/Symtab.py
+++ b/Cython/Compiler/Symtab.py
@@ -5,6 +5,7 @@
 import re
 from Cython import Utils
 from Errors import warning, error, InternalError
+from StringEncoding import EncodedString
 import Options
 import Naming
 import PyrexTypes
@@ -684,14 +685,14 @@ class BuiltinScope(Scope):
             utility_code = None):
         # If python_equiv == "*", the Python equivalent has the same name
         # as the entry, otherwise it has the name specified by python_equiv.
-        name = Utils.EncodedString(name)
+        name = EncodedString(name)
         entry = self.declare_cfunction(name, type, None, cname)
         entry.utility_code = utility_code
         if python_equiv:
             if python_equiv == "*":
                 python_equiv = name
             else:
-                python_equiv = Utils.EncodedString(python_equiv)
+                python_equiv = EncodedString(python_equiv)
             var_entry = Entry(python_equiv, python_equiv, py_object_type)
             var_entry.is_variable = 1
             var_entry.is_builtin = 1
@@ -699,7 +700,7 @@ class BuiltinScope(Scope):
         return entry
         
     def declare_builtin_type(self, name, cname):
-        name = Utils.EncodedString(name)
+        name = EncodedString(name)
         type = PyrexTypes.BuiltinObjectType(name, cname)
         type.set_scope(CClassScope(name, outer_scope=None, visibility='extern'))
         self.type_names[name] = 1
@@ -1370,7 +1371,7 @@ class CClassScope(ClassScope):
         if name == "__new__":
             warning(pos, "__new__ method of extension type will change semantics "
                 "in a future version of Pyrex and Cython. Use __cinit__ instead.")
-            name = Utils.EncodedString("__cinit__")
+            name = EncodedString("__cinit__")
         entry = self.declare_var(name, py_object_type, pos, visibility='extern')
         special_sig = get_special_method_signature(name)
         if special_sig:
@@ -1387,7 +1388,7 @@ class CClassScope(ClassScope):
     
     def lookup_here(self, name):
         if name == "__new__":
-            name = Utils.EncodedString("__cinit__")
+            name = EncodedString("__cinit__")
         return ClassScope.lookup_here(self, name)
     
     def declare_cfunction(self, name, type, pos,
diff --git a/Cython/Compiler/TypeSlots.py b/Cython/Compiler/TypeSlots.py
index c1890730..a5898c97 100644
--- a/Cython/Compiler/TypeSlots.py
+++ b/Cython/Compiler/TypeSlots.py
@@ -3,9 +3,9 @@
 #           and associated know-how.
 #
 
-from Cython import Utils
 import Naming
 import PyrexTypes
+import StringEncoding
 import sys
 
 class Signature:
@@ -311,7 +311,7 @@ class DocStringSlot(SlotDescriptor):
                 doc = scope.doc.utf8encode()
             else:
                 doc = scope.doc.byteencode()
-            return '"%s"' % Utils.escape_byte_string(doc)
+            return '"%s"' % StringEncoding.escape_byte_string(doc)
         else:
             return "0"
 
diff --git a/Cython/Compiler/Visitor.py b/Cython/Compiler/Visitor.py
index 0f6e826d..80cd3b35 100644
--- a/Cython/Compiler/Visitor.py
+++ b/Cython/Compiler/Visitor.py
@@ -5,7 +5,7 @@ import inspect
 import Nodes
 import ExprNodes
 import Naming
-from Cython.Utils import EncodedString
+from StringEncoding import EncodedString
 
 class BasicVisitor(object):
     """A generic visitor base class which can be used for visiting any kind of object."""
diff --git a/Cython/Utils.py b/Cython/Utils.py
index 480ed6d9..c1027681 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -40,7 +40,7 @@ def file_newer_than(path, time):
     ftime = modification_time(path)
     return ftime > time
 
-# support for source file encoding detection and unicode decoding
+# support for source file encoding detection
 
 def encode_filename(filename):
     if isinstance(filename, unicode):
@@ -77,90 +77,6 @@ def open_source_file(source_filename, mode="rU"):
     encoding = detect_file_encoding(source_filename)
     return codecs.open(source_filename, mode=mode, encoding=encoding)
 
-class EncodedString(unicode):
-    # unicode string subclass to keep track of the original encoding.
-    # 'encoding' is None for unicode strings and the source encoding
-    # otherwise
-    encoding = None
-
-    def byteencode(self):
-        assert self.encoding is not None
-        return self.encode(self.encoding)
-
-    def utf8encode(self):
-        assert self.encoding is None
-        return self.encode("UTF-8")
-
-    def is_unicode(self):
-        return self.encoding is None
-    is_unicode = property(is_unicode)
-
-#    def __eq__(self, other):
-#        return unicode.__eq__(self, other) and \
-#            getattr(other, 'encoding', '') == self.encoding
-
-char_from_escape_sequence = {
-    r'\a' : '\a',
-    r'\b' : '\b',
-    r'\f' : '\f',
-    r'\n' : '\n',
-    r'\r' : '\r',
-    r'\t' : '\t',
-    r'\v' : '\v',
-    }.get
-
-def _to_escape_sequence(s):
-    if s in '\n\r\t':
-        return repr(s)[1:-1]
-    elif s == '"':
-        return r'\"'
-    else:
-        # within a character sequence, oct passes much better than hex
-        return ''.join(['\\%03o' % ord(c) for c in s])
-
-_c_special = ('\0', '\n', '\r', '\t', '??', '"')
-_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
-
-def _build_specials_test():
-    subexps = []
-    for special in _c_special:
-        regexp = ''.join(['[%s]' % c for c in special])
-        subexps.append(regexp)
-    return re.compile('|'.join(subexps)).search
-
-_has_specials = _build_specials_test()
-
-def escape_character(c):
-    if c in '\n\r\t\\':
-        return repr(c)[1:-1]
-    elif c == "'":
-        return "\\'"
-    elif ord(c) < 32:
-        # hex works well for characters
-        return "\\x%02X" % ord(c)
-    else:
-        return c
-
-def escape_byte_string(s):
-    s = s.replace('\\', '\\\\')
-    if _has_specials(s):
-        for special, replacement in _c_special_replacements:
-            s = s.replace(special, replacement)
-    try:
-        s.decode("ASCII")
-        return s
-    except UnicodeDecodeError:
-        pass
-    l = []
-    append = l.append
-    for c in s:
-        o = ord(c)
-        if o >= 128:
-            append('\\%3o' % o)
-        else:
-            append(c)
-    return ''.join(l)
-
 def long_literal(value):
     if isinstance(value, basestring):
         if len(value) < 2:
diff --git a/tests/run/charencoding.pyx b/tests/run/charencoding.pyx
new file mode 100644
index 00000000..1cd1c617
--- /dev/null
+++ b/tests/run/charencoding.pyx
@@ -0,0 +1,30 @@
+# coding: ASCII
+
+__doc__ = u"""
+>>> s = test()
+>>> assert s == ''.join([chr(i) for i in range(0x10,0xFF,0x11)] + [chr(0xFF)]), repr(s)
+"""
+
+def test():
+    cdef char s[17]
+
+    s[ 0] = c'\x10'
+    s[ 1] = c'\x21'
+    s[ 2] = c'\x32'
+    s[ 3] = c'\x43'
+    s[ 4] = c'\x54'
+    s[ 5] = c'\x65'
+    s[ 6] = c'\x76'
+    s[ 7] = c'\x87'
+    s[ 8] = c'\x98'
+    s[ 9] = c'\xA9'
+    s[10] = c'\xBA'
+    s[11] = c'\xCB'
+    s[12] = c'\xDC'
+    s[13] = c'\xED'
+    s[14] = c'\xFE'
+    s[15] = c'\xFF'
+
+    s[16] = c'\x00'
+
+    return s