2 # Cython -- encoding related tools
8 if sys.version_info[0] >= 3:
9 _str, _bytes = str, bytes
12 _str, _bytes = unicode, str
15 empty_bytes = _bytes()
18 join_bytes = empty_bytes.join
20 class UnicodeLiteralBuilder(object):
21 """Assemble a unicode string.
def append(self, characters):
    """Add a chunk of text to the unicode literal being assembled.

    Byte strings are decoded as ASCII before being stored.  After that
    step the value must be a ``_str`` instance; otherwise the assertion
    fails, reporting the offending type.
    """
    text = characters
    if isinstance(text, _bytes):
        # byte strings reach us from Py2 string literals in the parser code
        text = text.decode("ASCII")
    assert isinstance(text, _str), str(type(text))
    self.chars.append(text)
def append_charval(self, char_number):
    """Add the character with code point ``char_number`` to the literal."""
    character = unichr(char_number)
    self.chars.append(character)
37 return EncodedString(u''.join(self.chars))
40 class BytesLiteralBuilder(object):
41 """Assemble a byte string or char value.
43 def __init__(self, target_encoding):
45 self.target_encoding = target_encoding
def append(self, characters):
    """Add a chunk to the byte literal being assembled.

    ``_str`` input is encoded with ``self.target_encoding`` first.  The
    resulting value must be a ``_bytes`` instance; otherwise the
    assertion fails, reporting the offending type.
    """
    chunk = characters
    if isinstance(chunk, _str):
        chunk = chunk.encode(self.target_encoding)
    assert isinstance(chunk, _bytes), str(type(chunk))
    self.chars.append(chunk)
def append_charval(self, char_number):
    """Add the character ``char_number``, encoded as ISO-8859-1."""
    encoded = unichr(char_number).encode('ISO-8859-1')
    self.chars.append(encoded)
57 # this *must* return a byte string!
58 s = BytesLiteral(join_bytes(self.chars))
59 s.encoding = self.target_encoding
63 # this *must* return a byte string!
64 return self.getstring()
66 class EncodedString(_str):
67 # unicode string subclass to keep track of the original encoding.
68 # 'encoding' is None for unicode strings and the source encoding
73 assert self.encoding is not None
74 return self.encode(self.encoding)
77 assert self.encoding is None
78 return self.encode("UTF-8")
81 return self.encoding is None
82 is_unicode = property(is_unicode)
84 class BytesLiteral(_bytes):
85 # str subclass that is compatible with EncodedString
92 # fake-recode the string to make it a plain bytes object
93 return self.decode('ISO-8859-1').encode('ISO-8859-1')
96 assert False, "this is not a unicode string: %r" % self
99 """Fake-decode the byte string to unicode to support %
100 formatting of unicode strings.
102 return self.decode('ISO-8859-1')
106 char_from_escape_sequence = {
116 def _to_escape_sequence(s):
124 # within a character sequence, oct passes much better than hex
125 return ''.join(['\\%03o' % ord(c) for c in s])
# Character sequences that get replaced by an escape sequence
# ('??' is presumably listed to avoid C trigraphs -- confirm).
_c_special = ('\\', '\0', '\n', '\r', '\t', '??', '"')

# (original, replacement) pairs, both as ASCII-encoded byte strings.
_c_special_replacements = [
    (orig.encode('ASCII'), _to_escape_sequence(orig).encode('ASCII'))
    for orig in _c_special
]
132 def _build_specials_test():
134 for special in _c_special:
135 regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
136 subexps.append(regexp)
137 return re.compile('|'.join(subexps).encode('ASCII')).search
139 _has_specials = _build_specials_test()
143 c = c.decode('ISO-8859-1')
149 if n < 32 or n > 127:
150 # hex works well for characters
155 def escape_byte_string(s):
156 """Escape a byte string so that it can be written into C code.
157 Note that this returns a Unicode string instead which, when
158 encoded as ISO-8859-1, will result in the correct byte sequence
162 for special, replacement in _c_special_replacements:
163 s = s.replace(special, replacement)
165 return s.decode("ASCII") # trial decoding: plain ASCII => done
166 except UnicodeDecodeError:
170 append, extend = s_new.append, s_new.extend
173 extend(('\\%3o' % b).encode('ASCII'))
176 return s_new.decode('ISO-8859-1')
186 return join_bytes(l).decode('ISO-8859-1')
188 def split_docstring(s):
191 return '\\n\"\"'.join(s.split(r'\n'))