From: Stefan Behnel <scoder@users.berlios.de>
Date: Tue, 12 Aug 2008 10:25:22 +0000 (+0200)
Subject: unescape all string content in the parser and escape it on the way out
X-Git-Tag: 0.9.8.1~60
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=62fc87e0a0968f9c01d25f59243979878c94fada;p=cython.git

unescape all string content in the parser and escape it on the way out
Otherwise, different ways of spelling the same special character may or may not end up correctly escaped in the generated C file.
---

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
index d52f5894..e9dbdd64 100644
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -642,7 +642,13 @@ class CharNode(ConstNode):
         return ord(self.value)
     
     def calculate_result_code(self):
-        return "'%s'" % self.value
+        if self.value == "'":
+            return r"'\''"
+        char = ord(self.value)
+        if char < 32:
+            return "'\\x%02X'" % char
+        else:
+            return "'%s'" % self.value
 
 
 class IntNode(ConstNode):
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 298fc092..18da59b2 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -585,25 +585,28 @@ def p_string_literal(s):
         sy = s.sy
         #print "p_string_literal: sy =", sy, repr(s.systring) ###
         if sy == 'CHARS':
-            systr = s.systring
-            if len(systr) == 1 and systr in "'\"\n":
-                chars.append('\\')
-            chars.append(systr)
+            chars.append(s.systring)
         elif sy == 'ESCAPE':
             systr = s.systring
             if is_raw:
                 if systr == '\\\n':
-                    chars.append(r'\\\n')
-                elif systr == r'\"':
-                    chars.append(r'\\\"')
-                elif systr == r'\\':
-                    chars.append(r'\\\\')
+                    chars.append('\n')
+                elif systr == '\\\"':
+                    chars.append('"')
+                elif systr == '\\\'':
+                    chars.append("'")
+                elif systr == '\\\\':
+                    chars.append('\\')
                 else:
-                    chars.append('\\' + systr)
+                    chars.append(systr)
             else:
                 c = systr[1]
-                if c in "'\"\\abfnrtv01234567":
-                    chars.append(systr)
+                if c in "01234567":
+                    chars.append(chr(int(systr[1:])))
+                elif c in "'\"\\":
+                    chars.append(c)
+                elif c in "abfnrtv":
+                    chars.append(Utils.char_from_escape_sequence(systr))
                 elif c == '\n':
                     pass
                 elif c in 'Uux':
@@ -616,11 +619,11 @@ def p_string_literal(s):
                     else:
                         # unicode escapes in plain byte strings are not unescaped
                         strval = systr
-                    chars.append(strval.replace('\\', '\\\\'))
+                    chars.append(strval)
                 else:
                     chars.append(r'\\' + systr[1:])
         elif sy == 'NEWLINE':
-            chars.append(r'\n')
+            chars.append('\n')
         elif sy == 'END_STRING':
             break
         elif sy == 'EOF':
@@ -629,8 +632,11 @@ def p_string_literal(s):
             s.error(
                 "Unexpected token %r:%r in string literal" %
                     (sy, s.systring))
+    string = u''.join(chars)
+    if kind == 'c' and len(string) != 1:
+        error(pos, u"invalid character literal: %r" % string)
     s.next()
-    value = Utils.EncodedString( u''.join(chars) )
+    value = Utils.EncodedString(string)
     if kind != 'u':
         value.encoding = s.source_encoding
     #print "p_string_literal: value =", repr(value) ###
diff --git a/Cython/Utils.py b/Cython/Utils.py
index f17ab613..83ca46b5 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -99,25 +99,39 @@ class EncodedString(unicode):
 #        return unicode.__eq__(self, other) and \
 #            getattr(other, 'encoding', '') == self.encoding
 
-def _to_oct_sequence(s):
+char_from_escape_sequence = {
+    r'\a' : '\a',
+    r'\b' : '\b',
+    r'\f' : '\f',
+    r'\n' : '\n',
+    r'\r' : '\r',
+    r'\t' : '\t',
+    r'\v' : '\v',
+    }.get
+
+def _to_escape_sequence(s):
     if s in '\n\r\t':
         return repr(s)[1:-1]
+    elif s == '"':
+        return r'\"'
     else:
+        # oct passes much better than hex
         return ''.join(['\\%03o' % ord(c) for c in s])
 
-_c_special = ('\0', '\n','\r','\t', '??', '<:', ':>', '<%', '%>', '%:', '%:')
-_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special))
+_c_special = ('\0', '\n', '\r', '\t', '??', '"')
+_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
 
-def _build_special_test():
+def _build_specials_test():
     subexps = []
     for special in _c_special:
         regexp = ''.join(['[%s]' % c for c in special])
         subexps.append(regexp)
-    return re.compile('(' + '|'.join(subexps) + ')').search
+    return re.compile('|'.join(subexps)).search
 
-_has_specials = _build_special_test()
+_has_specials = _build_specials_test()
 
 def escape_byte_string(s):
+    s = s.replace('\\', '\\\\')
     if _has_specials(s):
         for special, replacement in _c_special_replacements:
             s = s.replace(special, replacement)
diff --git a/tests/run/charescape.pyx b/tests/run/charescape.pyx
new file mode 100644
index 00000000..1216d260
--- /dev/null
+++ b/tests/run/charescape.pyx
@@ -0,0 +1,64 @@
+__doc__ = u"""
+>>> s = test()
+>>> assert s == ''.join([chr(i) for i in range(1,49)]), s
+"""
+
+def test():
+    cdef char s[50]
+
+    s[ 0] = c'\0'
+    s[ 1] = c'\x01'
+    s[ 2] = c'\x02'
+    s[ 3] = c'\x03'
+    s[ 4] = c'\x04'
+    s[ 5] = c'\x05'
+    s[ 6] = c'\x06'
+    s[ 7] = c'\x07'
+    s[ 8] = c'\x08'
+    s[ 9] = c'\x09'
+    s[10] = c'\x0A'
+    s[11] = c'\x0B'
+    s[12] = c'\x0C'
+    s[13] = c'\x0D'
+    s[14] = c'\x0E'
+    s[15] = c'\x0F'
+    s[16] = c'\x10'
+    s[17] = c'\x11'
+    s[18] = c'\x12'
+    s[19] = c'\x13'
+    s[20] = c'\x14'
+    s[21] = c'\x15'
+    s[22] = c'\x16'
+    s[23] = c'\x17'
+    s[24] = c'\x18'
+    s[25] = c'\x19'
+    s[26] = c'\x1A'
+    s[27] = c'\x1B'
+    s[28] = c'\x1C'
+    s[29] = c'\x1D'
+    s[30] = c'\x1E'
+    s[31] = c'\x1F'
+    s[32] = c'\x20'
+    s[33] = c'\x21'
+    s[34] = c'\x22'
+    s[35] = c'\x23'
+    s[36] = c'\x24'
+    s[37] = c'\x25'
+    s[38] = c'\x26'
+    s[39] = c'\x27'
+    s[40] = c'\x28'
+    s[41] = c'\x29'
+    s[42] = c'\x2A'
+    s[43] = c'\x2B'
+    s[44] = c'\x2C'
+    s[45] = c'\x2D'
+    s[46] = c'\x2E'
+    s[47] = c'\x2F'
+    s[48] = c'\x30'
+
+    s[49] = c'\x00'
+
+    assert s[ 0] == c'\x00'
+    assert s[49] == c'\0'
+
+    return &s[1]
diff --git a/tests/run/strescapes.pyx b/tests/run/strescapes.pyx
index b5d99eed..01b9d795 100644
--- a/tests/run/strescapes.pyx
+++ b/tests/run/strescapes.pyx
@@ -4,6 +4,11 @@ __doc__ = u"""
 ... b'\\x1234',
 ... b'\\x0A12\\x0C34',
 ... b'\\x0A57',
+... b'\\x0A',
+... b'\\'',
+... b"\\'",
+... b"\\"",
+... b'\\"',
 ... b'abc\\x12def',
 ... u'\\u1234',
 ... u'\\U00001234',
@@ -28,14 +33,19 @@ __doc__ = u"""
 
 import sys
 if sys.version_info[0] < 3:
-    __doc__ = __doc__.replace(u" b'", u" '")
+    __doc__ = __doc__.replace(u" b'", u" '").replace(u' b"', u' "')
 else:
-    __doc__ = __doc__.replace(u" u'", u" '")
+    __doc__ = __doc__.replace(u" u'", u" '").replace(u' u"', u' "')
 
 c_strings = [
 (b'\x1234', 3),
 (b'\x0A12\x0C34', 6),
 (b'\x0A57', 3),
+(b'\x0A', 1),
+(b'\'', 1),
+(b"\'", 1),
+(b"\"", 1),
+(b'\"', 1),
 (b'abc\x12def', 7),
 (u'\u1234', 1),
 (u'\U00001234', 1),