Cython/Compiler/StringEncoding.py

   1 #
   2 #   Cython -- encoding related tools
   3 #
   4
   5 import re
   6 import sys
   7
   8 if sys.version_info[0] >= 3:
   9     _str, _bytes = str, bytes
  10     IS_PYTHON3 = True
  11 else:
  12     _str, _bytes = unicode, str
  13     IS_PYTHON3 = False
  14
  15 empty_bytes = _bytes()
  16 empty_str = _str()
  17
  18 join_bytes = empty_bytes.join
  19
  20 class UnicodeLiteralBuilder(object):
  21     """Assemble a unicode string.
  22     """
  23     def __init__(self):
  24         self.chars = []
  25
  26     def append(self, characters):
  27         if isinstance(characters, _bytes):
  28             # this came from a Py2 string literal in the parser code
  29             characters = characters.decode("ASCII")
  30         assert isinstance(characters, _str), str(type(characters))
  31         self.chars.append(characters)
  32
  33     def append_charval(self, char_number):
  34         self.chars.append( unichr(char_number) )
  35
  36     def getstring(self):
  37         return EncodedString(u''.join(self.chars))
  38
  39
  40 class BytesLiteralBuilder(object):
  41     """Assemble a byte string or char value.
  42     """
  43     def __init__(self, target_encoding):
  44         self.chars = []
  45         self.target_encoding = target_encoding
  46
  47     def append(self, characters):
  48         if isinstance(characters, _str):
  49             characters = characters.encode(self.target_encoding)
  50         assert isinstance(characters, _bytes), str(type(characters))
  51         self.chars.append(characters)
  52
  53     def append_charval(self, char_number):
  54         self.chars.append( unichr(char_number).encode('ISO-8859-1') )
  55
  56     def getstring(self):
  57         # this *must* return a byte string!
  58         s = BytesLiteral(join_bytes(self.chars))
  59         s.encoding = self.target_encoding
  60         return s
  61
  62     def getchar(self):
  63         # this *must* return a byte string!
  64         return self.getstring()
  65
  66 class EncodedString(_str):
  67     # unicode string subclass to keep track of the original encoding.
  68     # 'encoding' is None for unicode strings and the source encoding
  69     # otherwise
  70     encoding = None
  71
  72     def byteencode(self):
  73         assert self.encoding is not None
  74         return self.encode(self.encoding)
  75
  76     def utf8encode(self):
  77         assert self.encoding is None
  78         return self.encode("UTF-8")
  79
  80     def is_unicode(self):
  81         return self.encoding is None
  82     is_unicode = property(is_unicode)
  83
  84 class BytesLiteral(_bytes):
  85     # str subclass that is compatible with EncodedString
  86     encoding = None
  87
  88     def byteencode(self):
  89         if IS_PYTHON3:
  90             return _bytes(self)
  91         else:
  92             # fake-recode the string to make it a plain bytes object
  93             return self.decode('ISO-8859-1').encode('ISO-8859-1')
  94
  95     def utf8encode(self):
  96         assert False, "this is not a unicode string: %r" % self
  97
  98     def __str__(self):
  99         """Fake-decode the byte string to unicode to support %
 100         formatting of unicode strings.
 101         """
 102         return self.decode('ISO-8859-1')
 103
 104     is_unicode = False
 105
 106 char_from_escape_sequence = {
 107     r'\a' : u'\a',
 108     r'\b' : u'\b',
 109     r'\f' : u'\f',
 110     r'\n' : u'\n',
 111     r'\r' : u'\r',
 112     r'\t' : u'\t',
 113     r'\v' : u'\v',
 114     }.get
 115
 116 def _to_escape_sequence(s):
 117     if s in '\n\r\t':
 118         return repr(s)[1:-1]
 119     elif s == '"':
 120         return r'\"'
 121     elif s == '\\':
 122         return r'\\'
 123     else:
 124         # within a character sequence, oct passes much better than hex
 125         return ''.join(['\\%03o' % ord(c) for c in s])
 126
 127 _c_special = ('\\', '\0', '\n', '\r', '\t', '??', '"')
 128 _c_special_replacements = [(orig.encode('ASCII'),
 129                             _to_escape_sequence(orig).encode('ASCII'))
 130                            for orig in _c_special ]
 131
 132 def _build_specials_test():
 133     subexps = []
 134     for special in _c_special:
 135         regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
 136         subexps.append(regexp)
 137     return re.compile('|'.join(subexps).encode('ASCII')).search
 138
# Search function built once at import time by _build_specials_test():
# a bound regular expression search() for the special sequences.
_has_specials = _build_specials_test()
 140
 141 def escape_char(c):
 142     if IS_PYTHON3:
 143         c = c.decode('ISO-8859-1')
 144     if c in '\n\r\t\\':
 145         return repr(c)[1:-1]
 146     elif c == "'":
 147         return "\\'"
 148     n = ord(c)
 149     if n < 32 or n > 127:
 150         # hex works well for characters
 151         return "\\x%02X" % n
 152     else:
 153         return c
 154
 155 def escape_byte_string(s):
 156     """Escape a byte string so that it can be written into C code.
 157     Note that this returns a Unicode string instead which, when
 158     encoded as ISO-8859-1, will result in the correct byte sequence
 159     being written.
 160     """
 161     if _has_specials(s):
 162         for special, replacement in _c_special_replacements:
 163             s = s.replace(special, replacement)
 164     try:
 165         return s.decode("ASCII") # trial decoding: plain ASCII => done
 166     except UnicodeDecodeError:
 167         pass
 168     if IS_PYTHON3:
 169         s_new = bytearray()
 170         append, extend = s_new.append, s_new.extend
 171         for b in s:
 172             if b >= 128:
 173                 extend(('\\%3o' % b).encode('ASCII'))
 174             else:
 175                 append(b)
 176         return s_new.decode('ISO-8859-1')
 177     else:
 178         l = []
 179         append = l.append
 180         for c in s:
 181             o = ord(c)
 182             if o >= 128:
 183                 append('\\%3o' % o)
 184             else:
 185                 append(c)
 186         return join_bytes(l).decode('ISO-8859-1')
 187
 188 def split_docstring(s):
 189     if len(s) < 2047:
 190         return s
 191     return '\\n\"\"'.join(s.split(r'\n'))