2 # Cython -- encoding related tools
# Resolve the text/bytes type names once so the rest of the module does not
# have to care which major Python version it is running on.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes = str, str, bytes
else:
    # Python 2: 'unicode' is the text type and 'str' doubles as the byte type.
    _unicode, _str, _bytes = unicode, str, str

# Empty instances of the resolved byte and text types.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound join of the empty byte string: concatenates an iterable of byte strings.
join_bytes = empty_bytes.join
20 class UnicodeLiteralBuilder(object):
21 """Assemble a unicode string.
def append(self, characters):
    """Add a chunk of text to the literal being assembled.

    Byte strings are accepted as well and are decoded as ASCII first;
    whatever is left must be a unicode string or the assertion fails.
    """
    # byte strings come from Py2 string literals in the parser code
    text = characters.decode("ASCII") if isinstance(characters, _bytes) else characters
    assert isinstance(text, _unicode), str(type(text))
    self.chars.append(text)
if sys.maxunicode == 65535:
    def append_charval(self, char_number):
        """Append the character with code point ``char_number``.

        This variant is selected on narrow Unicode builds, where a code
        point above 0xFFFF cannot be held in one character and is stored
        as a surrogate pair (two entries in ``self.chars``) instead.
        """
        # 'unichr' only exists on Python 2; 'chr' is its Python 3 equivalent.
        to_char = chr if sys.version_info[0] >= 3 else unichr
        if char_number > 65535:
            # wide Unicode character on narrow platform => replace
            # by its high/low surrogate pair
            offset = char_number - 0x10000
            self.chars.append(to_char((offset // 1024) + 0xD800))
            self.chars.append(to_char((offset % 1024) + 0xDC00))
        else:
            self.chars.append(to_char(char_number))
else:
    def append_charval(self, char_number):
        """Append the character with code point ``char_number``."""
        # 'unichr' only exists on Python 2; 'chr' is its Python 3 equivalent.
        to_char = chr if sys.version_info[0] >= 3 else unichr
        self.chars.append(to_char(char_number))
def append_uescape(self, char_number, escape_string):
    """Append a character that was spelled as a unicode escape.

    Only the code point is used; ``escape_string`` is accepted but not
    looked at.
    """
    add_code_point = self.append_charval
    add_code_point(char_number)
51 return EncodedString(u''.join(self.chars))
54 return (None, self.getstring())
57 class BytesLiteralBuilder(object):
58 """Assemble a byte string or char value.
60 def __init__(self, target_encoding):
62 self.target_encoding = target_encoding
def append(self, characters):
    """Add a chunk to the literal being assembled.

    Unicode input is encoded with ``self.target_encoding`` first; after
    that the chunk must be a byte string or the assertion fails.
    """
    payload = characters
    if isinstance(payload, _unicode):
        payload = payload.encode(self.target_encoding)
    assert isinstance(payload, _bytes), str(type(payload))
    self.chars.append(payload)
def append_charval(self, char_number):
    """Append the single byte that has the value ``char_number``.

    The value is turned into a character and encoded as ISO-8859-1, which
    maps code points 0-255 to the byte of the same value; the encode call
    fails for anything larger.
    """
    # 'unichr' only exists on Python 2; 'chr' is its Python 3 equivalent.
    to_char = chr if sys.version_info[0] >= 3 else unichr
    self.chars.append(to_char(char_number).encode('ISO-8859-1'))
def append_uescape(self, char_number, escape_string):
    """Keep a unicode escape as written.

    The escape text itself is appended; ``char_number`` is not used.
    """
    escape_text = escape_string
    self.append(escape_text)
77 # this *must* return a byte string!
78 s = BytesLiteral(join_bytes(self.chars))
79 s.encoding = self.target_encoding
83 # this *must* return a byte string!
84 return self.getstring()
87 return (self.getstring(), None)
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every operation is forwarded to an internal BytesLiteralBuilder and an
    internal UnicodeLiteralBuilder, so both representations are built from
    the same sequence of calls.
    """
    def __init__(self, target_encoding):
        """Create the two underlying builders.

        ``target_encoding`` is handed to the bytes builder; the unicode
        builder takes no arguments.
        """
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append a chunk of characters to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append the character with the given numeric value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape to both representations.

        Each builder is handed both the code point and the escape text and
        picks the one it needs.
        """
        # delegate instead of repeating what each builder does with an escape
        self._bytes.append_uescape(char_number, escape_string)
        self._unicode.append_uescape(char_number, escape_string)

    def getstrings(self):
        """Return the pair ``(bytes_result, unicode_result)``."""
        return (self._bytes.getstring(), self._unicode.getstring())
class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise.
    encoding = None

    def byteencode(self):
        """Return the text encoded with its recorded source encoding.

        Only valid when ``encoding`` is set (asserted).
        """
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Return the text encoded as UTF-8.

        Only valid when no source encoding is recorded (asserted).
        """
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None
130 class BytesLiteral(_bytes):
131 # bytes subclass that is compatible with EncodedString
134 def byteencode(self):
138 # fake-recode the string to make it a plain bytes object
139 return self.decode('ISO-8859-1').encode('ISO-8859-1')
def utf8encode(self):
    """Always fail: this object is a byte string, not a unicode string.

    Raises:
        AssertionError: unconditionally, with the repr of the value in
            the message.
    """
    # raised explicitly rather than via 'assert False' so that the failure
    # is not compiled away when Python runs with -O
    raise AssertionError("this is not a unicode string: %r" % self)
145 """Fake-decode the byte string to unicode to support %
146 formatting of unicode strings.
148 return self.decode('ISO-8859-1')
152 char_from_escape_sequence = {
162 def _to_escape_sequence(s):
170 # within a character sequence, oct passes much better than hex
171 return ''.join(['\\%03o' % ord(c) for c in s])
# Character sequences that escape_byte_string() replaces before the text is
# written into C code: the backslash, '??' (presumably so that a C compiler
# cannot read a trigraph -- confirm), the double quote and the 32 control
# characters with codes 0-31.
_c_special = ('\\', '??', '"') + tuple(chr(code) for code in range(32))

# (original, replacement) pairs for the sequences above, both ASCII byte strings.
_c_special_replacements = [
    (orig.encode('ASCII'), _to_escape_sequence(orig).encode('ASCII'))
    for orig in _c_special
]
def _build_specials_test():
    """Build the function that tests a byte string for special sequences.

    Returns the bound ``search`` method of a compiled bytes pattern that
    matches any of the sequences listed in ``_c_special``.
    """
    def as_char_classes(sequence):
        # one single-character class per character, so that no regex
        # metacharacter needs escaping except the backslash, which is doubled
        return ''.join(['[%s]' % ch.replace('\\', '\\\\') for ch in sequence])

    pattern = '|'.join([as_char_classes(special) for special in _c_special])
    return re.compile(pattern.encode('ASCII')).search

# Returns a match object when the given byte string contains a special sequence.
_has_specials = _build_specials_test()
189 c = c.decode('ISO-8859-1')
195 if n < 32 or n > 127:
196 # hex works well for characters
201 def escape_byte_string(s):
202 """Escape a byte string so that it can be written into C code.
203 Note that this returns a Unicode string instead which, when
204 encoded as ISO-8859-1, will result in the correct byte sequence
208 for special, replacement in _c_special_replacements:
210 s = s.replace(special, replacement)
212 return s.decode("ASCII") # trial decoding: plain ASCII => done
213 except UnicodeDecodeError:
217 append, extend = s_new.append, s_new.extend
220 extend(('\\%3o' % b).encode('ASCII'))
223 return s_new.decode('ISO-8859-1')
233 return join_bytes(l).decode('ISO-8859-1')
235 def split_string_literal(s, limit=2000):
236 # MSVC can't handle long string literals.
242 while start < len(s):
244 if len(s) > end-4 and '\\' in s[end-4:end]:
245 end -= 4 - s[end-4:end].find('\\') # just before the backslash
246 while s[end-1] == '\\':
249 # must have been a long line of backslashes
250 end = start + limit - (limit % 2) - 4
252 chunks.append(s[start:end])
254 return '""'.join(chunks)