From 5bd4e9b6df50c980ecb4769a7a93b9bf1aae9d6d Mon Sep 17 00:00:00 2001
From: Stefan Behnel
Date: Tue, 7 Sep 2010 20:25:41 +0200
Subject: [PATCH] reject non-ASCII literal characters in Python 3 byte strings

---
 Cython/Compiler/Parsing.pxd    |  1 +
 Cython/Compiler/Parsing.py     | 27 ++++++++++++++++++++++-----
 tests/errors/cython3_bytes.pyx |  9 +++++++++
 tests/run/cython2_bytes.pyx    | 13 +++++++++++++
 4 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100644 tests/errors/cython3_bytes.pyx
 create mode 100644 tests/run/cython2_bytes.pyx

diff --git a/Cython/Compiler/Parsing.pxd b/Cython/Compiler/Parsing.pxd
index 5a0a3473..ac0ad879 100644
--- a/Cython/Compiler/Parsing.pxd
+++ b/Cython/Compiler/Parsing.pxd
@@ -47,6 +47,7 @@ cpdef p_atom(PyrexScanner s)
 cpdef p_name(PyrexScanner s, name)
 cpdef p_cat_string_literal(PyrexScanner s)
 cpdef p_opt_string_literal(PyrexScanner s, required_type=*)
+cpdef bint check_for_non_ascii_characters(unicode string)
 cpdef p_string_literal(PyrexScanner s, kind_override=*)
 cpdef p_list_maker(PyrexScanner s)
 cpdef p_comp_iter(PyrexScanner s, body)
diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py
index 93c7413b..87a8ebf7 100644
--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -661,9 +661,9 @@ def p_cat_string_literal(s):
             bstrings.append(next_bytes_value)
             ustrings.append(next_unicode_value)
     # join and rewrap the partial literals
-    if kind in ('b', 'c', '') or kind == 'u' and bstrings[0] is not None:
+    if kind in ('b', 'c', '') or kind == 'u' and None not in bstrings:
         # Py3 enforced unicode literals are parsed as bytes/unicode combination
-        bytes_value = BytesLiteral( StringEncoding.join_bytes([ b for b in bstrings if b is not None ]) )
+        bytes_value = BytesLiteral( StringEncoding.join_bytes(bstrings) )
         bytes_value.encoding = s.source_encoding
     if kind in ('u', ''):
         unicode_value = EncodedString( u''.join([ u for u in ustrings if u is not None ]) )
@@ -681,6 +681,12 @@ def p_opt_string_literal(s, required_type='u'):
     else:
         return None
 
+def check_for_non_ascii_characters(string):
+    for c in string:
+        if c >= u'\x80':
+            return True
+    return False
+
 def p_string_literal(s, kind_override=None):
     # A single string or char literal. Returns (kind, bvalue, uvalue)
     # where kind in ('b', 'c', 'u', ''). The 'bvalue' is the source
@@ -692,6 +698,7 @@ def p_string_literal(s, kind_override=None):
     # s.sy == 'BEGIN_STRING'
     pos = s.position()
     is_raw = 0
+    has_non_ASCII_literal_characters = False
     kind = s.systring[:1].lower()
     if kind == 'r':
         kind = ''
@@ -715,12 +722,13 @@ def p_string_literal(s, kind_override=None):
     while 1:
         s.next()
         sy = s.sy
+        systr = s.systring
         #print "p_string_literal: sy =", sy, repr(s.systring) ###
         if sy == 'CHARS':
-            chars.append(s.systring)
+            chars.append(systr)
+            if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                has_non_ASCII_literal_characters = True
         elif sy == 'ESCAPE':
-            has_escape = True
-            systr = s.systring
             if is_raw:
                 if systr == u'\\\n':
                     chars.append(u'\\\n')
@@ -730,6 +738,8 @@ def p_string_literal(s, kind_override=None):
                     chars.append(u"'")
                 else:
                     chars.append(systr)
+                    if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                        has_non_ASCII_literal_characters = True
             else:
                 c = systr[1]
                 if c in u"01234567":
@@ -755,6 +765,8 @@ def p_string_literal(s, kind_override=None):
                     chars.append_uescape(chrval, systr)
                 else:
                     chars.append(u'\\' + systr[1:])
+                    if not has_non_ASCII_literal_characters and check_for_non_ascii_characters(systr):
+                        has_non_ASCII_literal_characters = True
         elif sy == 'NEWLINE':
             chars.append(u'\n')
         elif sy == 'END_STRING':
@@ -772,6 +784,11 @@ def p_string_literal(s, kind_override=None):
             error(pos, u"invalid character literal: %r" % bytes_value)
     else:
         bytes_value, unicode_value = chars.getstrings()
+        if has_non_ASCII_literal_characters and s.context.language_level >= 3:
+            # Python 3 forbids literal non-ASCII characters in byte strings
+            if kind != 'u':
+                s.error("bytes can only contain ASCII literal characters.", pos = pos)
+            bytes_value = None
     s.next()
     return (kind, bytes_value, unicode_value)
 
diff --git a/tests/errors/cython3_bytes.pyx b/tests/errors/cython3_bytes.pyx
new file mode 100644
index 00000000..a8ad4f66
--- /dev/null
+++ b/tests/errors/cython3_bytes.pyx
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# cython: language_level=3
+
+escaped = b'abc\xc3\xbc\xc3\xb6\xc3\xa4'
+invalid = b'abcüöä'
+
+_ERRORS = """
+5:10: bytes can only contain ASCII literal characters.
+"""
diff --git a/tests/run/cython2_bytes.pyx b/tests/run/cython2_bytes.pyx
new file mode 100644
index 00000000..84eec1cd
--- /dev/null
+++ b/tests/run/cython2_bytes.pyx
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# cython: language_level=2
+
+b = b'abcüöä \x12'
+
+cdef char* cs = 'abcüöä \x12'
+
+def compare_cs():
+    """
+    >>> b == compare_cs()
+    True
+    """
+    return cs
-- 
2.26.2