2 # Cython -- encoding related tools
7 class UnicodeLiteralBuilder(object):
8 """Assemble a unicode string.
def append(self, characters):
    """Add a chunk of text to the literal being assembled.

    A byte string (``str``) is decoded as ASCII first; whatever results
    must be a ``unicode`` object, otherwise the assertion fails with the
    offending type as its message.
    """
    if not isinstance(characters, str):
        text = characters
    else:
        # byte strings come from Py2 string literals in the parser code
        text = characters.decode("ASCII")
    assert isinstance(text, unicode), str(type(text))
    self.chars.append(text)
def append_charval(self, char_number):
    """Append the unicode character whose code point is ``char_number``."""
    char = unichr(char_number)
    self.chars.append(char)
24 return EncodedString(u''.join(self.chars))
27 class BytesLiteralBuilder(object):
28 """Assemble a byte string or char value.
30 def __init__(self, target_encoding):
32 self.target_encoding = target_encoding
def append(self, characters):
    """Add a chunk of data to the byte literal being assembled.

    A ``unicode`` value is encoded with ``self.target_encoding`` first;
    whatever results must be a ``str`` object, otherwise the assertion
    fails with the offending type as its message.
    """
    if not isinstance(characters, unicode):
        data = characters
    else:
        data = characters.encode(self.target_encoding)
    assert isinstance(data, str), str(type(data))
    self.chars.append(data)
40 def append_charval(self, char_number):
41 self.chars.append( chr(char_number) )
44 # this *must* return a byte string! => fix it in Py3k!!
45 s = BytesLiteral(''.join(self.chars))
46 s.encoding = self.target_encoding
50 # this *must* return a byte string! => fix it in Py3k!!
51 return self.getstring()
53 class EncodedString(unicode):
54 # unicode string subclass to keep track of the original encoding.
55 # 'encoding' is None for unicode strings and the source encoding
60 assert self.encoding is not None
61 return self.encode(self.encoding)
64 assert self.encoding is None
65 return self.encode("UTF-8")
68 return self.encoding is None
69 is_unicode = property(is_unicode)
71 class BytesLiteral(str):
72 # str subclass that is compatible with EncodedString
79 assert False, "this is not a unicode string: %r" % self
83 char_from_escape_sequence = {
93 def _to_escape_sequence(s):
99 # within a character sequence, oct passes much better than hex
100 return ''.join(['\\%03o' % ord(c) for c in s])
# Substrings that escape_byte_string() replaces, paired below with the
# replacement text that _to_escape_sequence() produces for each of them.
_c_special = ('\0', '\n', '\r', '\t', '??', '"')
# Materialise the pairs as a list: on Python 3 a bare zip() is a one-shot
# iterator and would be empty after the first loop over it.  On Python 2
# zip() already returns a list, so the value is unchanged there.
_c_special_replacements = list(
    zip(_c_special, map(_to_escape_sequence, _c_special)))
105 def _build_specials_test():
107 for special in _c_special:
108 regexp = ''.join(['[%s]' % c for c in special])
109 subexps.append(regexp)
110 return re.compile('|'.join(subexps)).search
# Built once when the module is loaded: _build_specials_test() returns the
# bound ``search`` method of a regex compiled from the _c_special entries.
112 _has_specials = _build_specials_test()
114 def escape_character(c):
120 if n < 32 or n > 127:
121 # hex works well for characters
126 def escape_byte_string(s):
127 s = s.replace('\\', '\\\\')
129 for special, replacement in _c_special_replacements:
130 s = s.replace(special, replacement)
134 except UnicodeDecodeError: