unescape all string content in the parser and escape it on the way out

author Stefan Behnel <scoder@users.berlios.de>

Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)

committer Stefan Behnel <scoder@users.berlios.de>

Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)
author Stefan Behnel <scoder@users.berlios.de>
Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)
diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py

index d52f58943d12a6405807461f42d7845669e89309..e9dbdd64f9fa132e63652b75816e69896e0b4295 100644 (file)
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -642,7 +642,13 @@ class CharNode(ConstNode):
          return ord(self.value)
      
      def calculate_result_code(self):
-        return "'%s'" % self.value
+        if self.value == "'":
+            return r"'\''"
+        char = ord(self.value)
+        if char < 32:
+            return "'\\x%02X'" % char
+        else:
+            return "'%s'" % self.value
  
  
  class IntNode(ConstNode):
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py

index 298fc092f33e950154682d09920ef31d75b27994..18da59b2aeac97952f774dce70d397503eb90f8a 100644 (file)
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -585,25 +585,28 @@ def p_string_literal(s):
          sy = s.sy
          #print "p_string_literal: sy =", sy, repr(s.systring) ###
          if sy == 'CHARS':
-            systr = s.systring
-            if len(systr) == 1 and systr in "'\"\n":
-                chars.append('\\')
-            chars.append(systr)
+            chars.append(s.systring)
          elif sy == 'ESCAPE':
              systr = s.systring
              if is_raw:
                  if systr == '\\\n':
-                    chars.append(r'\\\n')
-                elif systr == r'\"':
-                    chars.append(r'\\\"')
-                elif systr == r'\\':
-                    chars.append(r'\\\\')
+                    chars.append('\n')
+                elif systr == '\\\"':
+                    chars.append('"')
+                elif systr == '\\\'':
+                    chars.append("'")
+                elif systr == '\\\\':
+                    chars.append('\\')
                  else:
-                    chars.append('\\' + systr)
+                    chars.append(systr)
              else:
                  c = systr[1]
-                if c in "'\"\\abfnrtv01234567":
-                    chars.append(systr)
+                if c in "01234567":
+                    chars.append(chr(int(systr[1:])))
+                elif c in "'\"\\":
+                    chars.append(c)
+                elif c in "abfnrtv":
+                    chars.append(Utils.char_from_escape_sequence(systr))
                  elif c == '\n':
                      pass
                  elif c in 'Uux':
@@ -616,11 +619,11 @@ def p_string_literal(s):
                      else:
                          # unicode escapes in plain byte strings are not unescaped
                          strval = systr
-                    chars.append(strval.replace('\\', '\\\\'))
+                    chars.append(strval)
                  else:
                      chars.append(r'\\' + systr[1:])
          elif sy == 'NEWLINE':
-            chars.append(r'\n')
+            chars.append('\n')
          elif sy == 'END_STRING':
              break
          elif sy == 'EOF':
@@ -629,8 +632,11 @@ def p_string_literal(s):
              s.error(
                  "Unexpected token %r:%r in string literal" %
                      (sy, s.systring))
+    string = u''.join(chars)
+    if kind == 'c' and len(string) != 1:
+        error(pos, u"invalid character literal: %r" % string)
      s.next()
-    value = Utils.EncodedString( u''.join(chars) )
+    value = Utils.EncodedString(string)
      if kind != 'u':
          value.encoding = s.source_encoding
      #print "p_string_literal: value =", repr(value) ###
diff --git a/Cython/Utils.py b/Cython/Utils.py

index f17ab613a2cf029df1d1035cc0d972fe4c655f93..83ca46b501ba9024aa7c183c68d79272caf3951e 100644 (file)
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -99,25 +99,39 @@ class EncodedString(unicode):
  #        return unicode.__eq__(self, other) and \
  #            getattr(other, 'encoding', '') == self.encoding
  
-def _to_oct_sequence(s):
+char_from_escape_sequence = {
+    r'\a' : '\a',
+    r'\b' : '\b',
+    r'\f' : '\f',
+    r'\n' : '\n',
+    r'\r' : '\r',
+    r'\t' : '\t',
+    r'\v' : '\v',
+    }.get
+
+def _to_escape_sequence(s):
      if s in '\n\r\t':
          return repr(s)[1:-1]
+    elif s == '"':
+        return r'\"'
      else:
+        # octal is safer than hex: C reads at most 3 octal digits but any number of hex digits
          return ''.join(['\\%03o' % ord(c) for c in s])
  
-_c_special = ('\0', '\n','\r','\t', '??', '<:', ':>', '<%', '%>', '%:', '%:')
-_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special))
+_c_special = ('\0', '\n', '\r', '\t', '??', '"')
+_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
  
-def _build_special_test():
+def _build_specials_test():
      subexps = []
      for special in _c_special:
          regexp = ''.join(['[%s]' % c for c in special])
          subexps.append(regexp)
-    return re.compile('(' + '|'.join(subexps) + ')').search
+    return re.compile('|'.join(subexps)).search
  
-_has_specials = _build_special_test()
+_has_specials = _build_specials_test()
  
  def escape_byte_string(s):
+    s = s.replace('\\', '\\\\')
      if _has_specials(s):
          for special, replacement in _c_special_replacements:
              s = s.replace(special, replacement)
diff --git a/tests/run/charescape.pyx b/tests/run/charescape.pyx

new file mode 100644 (file)

index 0000000..1216d26
--- /dev/null
+++ b/tests/run/charescape.pyx
@@ -0,0 +1,64 @@
+__doc__ = u"""
+>>> s = test()
+>>> assert s == ''.join([chr(i) for i in range(1,49)]), s
+"""
+
+def test():
+    cdef char s[50]
+
+    s[ 0] = c'\0'
+    s[ 1] = c'\x01'
+    s[ 2] = c'\x02'
+    s[ 3] = c'\x03'
+    s[ 4] = c'\x04'
+    s[ 5] = c'\x05'
+    s[ 6] = c'\x06'
+    s[ 7] = c'\x07'
+    s[ 8] = c'\x08'
+    s[ 9] = c'\x09'
+    s[10] = c'\x0A'
+    s[11] = c'\x0B'
+    s[12] = c'\x0C'
+    s[13] = c'\x0D'
+    s[14] = c'\x0E'
+    s[15] = c'\x0F'
+    s[16] = c'\x10'
+    s[17] = c'\x11'
+    s[18] = c'\x12'
+    s[19] = c'\x13'
+    s[20] = c'\x14'
+    s[21] = c'\x15'
+    s[22] = c'\x16'
+    s[23] = c'\x17'
+    s[24] = c'\x18'
+    s[25] = c'\x19'
+    s[26] = c'\x1A'
+    s[27] = c'\x1B'
+    s[28] = c'\x1C'
+    s[29] = c'\x1D'
+    s[30] = c'\x1E'
+    s[31] = c'\x1F'
+    s[32] = c'\x20'
+    s[33] = c'\x21'
+    s[34] = c'\x22'
+    s[35] = c'\x23'
+    s[36] = c'\x24'
+    s[37] = c'\x25'
+    s[38] = c'\x26'
+    s[39] = c'\x27'
+    s[40] = c'\x28'
+    s[41] = c'\x29'
+    s[42] = c'\x2A'
+    s[43] = c'\x2B'
+    s[44] = c'\x2C'
+    s[45] = c'\x2D'
+    s[46] = c'\x2E'
+    s[47] = c'\x2F'
+    s[48] = c'\x30'
+
+    s[49] = c'\x00'
+
+    assert s[ 0] == c'\x00'
+    assert s[49] == c'\0'
+
+    return &s[1]
diff --git a/tests/run/strescapes.pyx b/tests/run/strescapes.pyx

index b5d99eedb24ee88d61a2ad8b5625fbac8d51c926..01b9d7953ac561d9930a93c38e7838bc16c6945b 100644 (file)
--- a/tests/run/strescapes.pyx
+++ b/tests/run/strescapes.pyx
@@ -4,6 +4,11 @@ __doc__ = u"""
  ... b'\\x1234',
  ... b'\\x0A12\\x0C34',
  ... b'\\x0A57',
+... b'\\x0A',
+... b'\\'',
+... b"\\'",
+... b"\\"",
+... b'\\"',
  ... b'abc\\x12def',
  ... u'\\u1234',
  ... u'\\U00001234',
@@ -28,14 +33,19 @@ __doc__ = u"""
  
  import sys
  if sys.version_info[0] < 3:
-    __doc__ = __doc__.replace(u" b'", u" '")
+    __doc__ = __doc__.replace(u" b'", u" '").replace(u' b"', u' "')
  else:
-    __doc__ = __doc__.replace(u" u'", u" '")
+    __doc__ = __doc__.replace(u" u'", u" '").replace(u' u"', u' "')
  
  c_strings = [
  (b'\x1234', 3),
  (b'\x0A12\x0C34', 6),
  (b'\x0A57', 3),
+(b'\x0A', 1),
+(b'\'', 1),
+(b"\'", 1),
+(b"\"", 1),
+(b'\"', 1),
  (b'abc\x12def', 7),
  (u'\u1234', 1),
  (u'\U00001234', 1),
author	Stefan Behnel <scoder@users.berlios.de>
	Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)
committer	Stefan Behnel <scoder@users.berlios.de>
	Tue, 12 Aug 2008 10:25:22 +0000 (12:25 +0200)
Cython/Compiler/ExprNodes.py		patch | blob | history
Cython/Compiler/Parsing.py		patch | blob | history
Cython/Utils.py		patch | blob | history
tests/run/charescape.pyx	[new file with mode: 0644]	patch | blob
tests/run/strescapes.pyx		patch | blob | history