From 5bd4e9b6df50c980ecb4769a7a93b9bf1aae9d6d Mon Sep 17 00:00:00 2001
From: Stefan Behnel <scoder@users.berlios.de>
Date: Tue, 7 Sep 2010 20:25:41 +0200
Subject: [PATCH] reject non-ASCII literal characters in Python 3 byte strings

---
 Cython/Compiler/Parsing.pxd    |  1 +
 Cython/Compiler/Parsing.py     | 27 ++++++++++++++++++++++-----
 tests/errors/cython3_bytes.pyx |  9 +++++++++
 tests/run/cython2_bytes.pyx    | 13 +++++++++++++
 4 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100644 tests/errors/cython3_bytes.pyx
 create mode 100644 tests/run/cython2_bytes.pyx

diff --git a/Cython/Compiler/Parsing.pxd b/Cython/Compiler/Parsing.pxd
index 5a0a3473..ac0ad879 100644
--- a/Cython/Compiler/Parsing.pxd
+++ b/Cython/Compiler/Parsing.pxd
@@ -47,6 +47,7 @@ cpdef p_atom(PyrexScanner s)
 cpdef p_name(PyrexScanner s, name)
 cpdef p_cat_string_literal(PyrexScanner s)
 cpdef p_opt_string_literal(PyrexScanner s, required_type=*)
+cpdef bint check_for_non_ascii_characters(unicode string)
 cpdef p_string_literal(PyrexScanner s, kind_override=*)
 cpdef p_list_maker(PyrexScanner s)
 cpdef p_comp_iter(PyrexScanner s, body)
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 93c7413b..87a8ebf7 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -661,9 +661,9 @@ def p_cat_string_literal(s):
             bstrings.append(next_bytes_value)
             ustrings.append(next_unicode_value)
     # join and rewrap the partial literals
-    if kind in ('b', 'c', '') or kind == 'u' and bstrings[0] is not None:
+    if kind in ('b', 'c', '') or kind == 'u' and None not in bstrings:
         # Py3 enforced unicode literals are parsed as bytes/unicode combination
-        bytes_value = BytesLiteral( StringEncoding.join_bytes([ b for b in bstrings if b is not None ]) )
+        bytes_value = BytesLiteral( StringEncoding.join_bytes(bstrings) )
         bytes_value.encoding = s.source_encoding
     if kind in ('u', ''):
         unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
@@ -681,6 +681,12 @@ def p_opt_string_literal(s, required_type='u'):
     else:
         return None
 
+def check_for_non_ascii_characters(string):  # -> True if any char is outside 7-bit ASCII
+    for c in string:
+        if c >= u'\x80':  # U+0080 is the first code point beyond the ASCII range
+            return True
+    return False
+
 def p_string_literal(s, kind_override=None):
     # A single string or char literal.  Returns (kind, bvalue, uvalue)
     # where kind in ('b', 'c', 'u', '').  The 'bvalue' is the source
@@ -692,6 +698,7 @@ def p_string_literal(s, kind_override=None):
     # s.sy == 'BEGIN_STRING'
     pos = s.position()
     is_raw = 0
+    has_non_ASCII_literal_characters = False
     kind = s.systring[:1].lower()
     if kind == 'r':
         kind = ''
@@ -715,12 +722,13 @@ def p_string_literal(s, kind_override=None):
     while 1:
         s.next()
         sy = s.sy
+        systr = s.systring
         #print "p_string_literal: sy =", sy, repr(s.systring) ###
         if sy == 'CHARS':
-            chars.append(s.systring)
+            chars.append(systr)
+            if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                has_non_ASCII_literal_characters = True
         elif sy == 'ESCAPE':
-            has_escape = True
-            systr = s.systring
             if is_raw:
                 if systr == u'\\\n':
                     chars.append(u'\\\n')
@@ -730,6 +738,8 @@ def p_string_literal(s, kind_override=None):
                     chars.append(u"'")
                 else:
                     chars.append(systr)
+                    if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                        has_non_ASCII_literal_characters = True
             else:
                 c = systr[1]
                 if c in u"01234567":
@@ -755,6 +765,8 @@ def p_string_literal(s, kind_override=None):
                     chars.append_uescape(chrval, systr)
                 else:
                     chars.append(u'\\' + systr[1:])
+                    if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                        has_non_ASCII_literal_characters = True
         elif sy == 'NEWLINE':
             chars.append(u'\n')
         elif sy == 'END_STRING':
@@ -772,6 +784,11 @@ def p_string_literal(s, kind_override=None):
             error(pos, u"invalid character literal: %r" % bytes_value)
     else:
         bytes_value, unicode_value = chars.getstrings()
+        if has_non_ASCII_literal_characters and s.context.language_level >= 3:
+            # Python 3 forbids literal non-ASCII characters in byte strings
+            if kind != 'u':
+                s.error("bytes can only contain ASCII literal characters.", pos = pos)
+            bytes_value = None
     s.next()
     return (kind, bytes_value, unicode_value)
 
diff --git a/tests/errors/cython3_bytes.pyx b/tests/errors/cython3_bytes.pyx
new file mode 100644
index 00000000..a8ad4f66
--- /dev/null
+++ b/tests/errors/cython3_bytes.pyx
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# cython: language_level=3
+
+escaped = b'abc\xc3\xbc\xc3\xb6\xc3\xa4'
+invalid = b'abcüöä'
+
+_ERRORS = """
+5:10: bytes can only contain ASCII literal characters.
+"""
diff --git a/tests/run/cython2_bytes.pyx b/tests/run/cython2_bytes.pyx
new file mode 100644
index 00000000..84eec1cd
--- /dev/null
+++ b/tests/run/cython2_bytes.pyx
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# cython: language_level=2
+
+b = b'abcüöä \x12'
+
+cdef char* cs = 'abcüöä \x12'
+
+def compare_cs():
+    """
+    >>> b == compare_cs()
+    True
+    """
+    return cs
-- 
2.26.2