jinja2/lexer.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     jinja2.lexer
   4     ~~~~~~~~~~~~
   5
   6     This module implements a Jinja / Python combination lexer. The
   7     `Lexer` class provided by this module is used to do some preprocessing
   8     for Jinja.
   9
  10     On the one hand it filters out invalid operators like the bitshift
  11     operators we don't allow in templates. On the other hand it separates
  12     template code and python code in expressions.
  13
  14     :copyright: 2007-2008 by Armin Ronacher.
  15     :license: BSD, see LICENSE for more details.
  16 """
  17 import re
  18 import unicodedata
  19 from jinja2.datastructure import TokenStream, Token
  20 from jinja2.exceptions import TemplateSyntaxError
  21 from weakref import WeakValueDictionary
  22
  23
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer.  The values are only referenced
# weakly, so a cached lexer disappears from this mapping once nothing
# else holds a reference to it.  `LexerMeta.__call__` reads and fills
# the cache.
_lexer_cache = WeakValueDictionary()
  27
  28
  29 # static regular expressions
  30 whitespace_re = re.compile(r'\s+(?um)')
  31 string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
  32                        r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
  33 integer_re = re.compile(r'\d+')
  34 name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
  35 float_re = re.compile(r'\d+\.\d+')
  36
  37
  38 # set of used keywords
  39 keywords = set(['and', 'block', 'elif', 'else', 'endblock', 'print',
  40                 'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
  41                 'endtrans', 'extends', 'filter', 'for', 'if', 'in',
  42                 'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
  43                 'recursive', 'set', 'trans', 'call', 'endcall'])
  44
  45 # bind operators to token types
  46 operators = {
  47     '+':            'add',
  48     '-':            'sub',
  49     '/':            'div',
  50     '//':           'floordiv',
  51     '*':            'mul',
  52     '%':            'mod',
  53     '**':           'pow',
  54     '~':            'tilde',
  55     '[':            'lbracket',
  56     ']':            'rbracket',
  57     '(':            'lparen',
  58     ')':            'rparen',
  59     '{':            'lbrace',
  60     '}':            'rbrace',
  61     '==':           'eq',
  62     '!=':           'ne',
  63     '>':            'gt',
  64     '>=':           'gteq',
  65     '<':            'lt',
  66     '<=':           'lteq',
  67     '=':            'assign',
  68     '.':            'dot',
  69     ':':            'colon',
  70     '|':            'pipe',
  71     ',':            'comma',
  72     ';':            'semicolon'
  73 }
  74
  75 reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
  76 assert len(operators) == len(reverse_operators), 'operators dropped'
  77 operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
  78                          sorted(operators, key=lambda x: -len(x))))
  79
  80 simple_escapes = {
  81     'a':    '\a',
  82     'n':    '\n',
  83     'r':    '\r',
  84     'f':    '\f',
  85     't':    '\t',
  86     'v':    '\v',
  87     '\\':   '\\',
  88     '"':    '"',
  89     "'":    "'",
  90     '0':    '\x00'
  91 }
  92 unicode_escapes = {
  93     'x':    2,
  94     'u':    4,
  95     'U':    8
  96 }
  97
  98
  99 def unescape_string(lineno, filename, s):
 100     r"""
 101     Unescape a string. Supported escapes:
 102         \a, \n, \r\, \f, \v, \\, \", \', \0
 103
 104         \x00, \u0000, \U00000000, \N{...}
 105
 106     Not supported are \101 because imho redundant.
 107     """
 108     result = []
 109     write = result.append
 110     chariter = iter(s)
 111     next_char = chariter.next
 112
 113     # faster lookup
 114     sescapes = simple_escapes
 115     uescapes = unicode_escapes
 116
 117     try:
 118         for char in chariter:
 119             if char == '\\':
 120                 char = next_char()
 121                 if char in sescapes:
 122                     write(sescapes[char])
 123                 elif char in uescapes:
 124                     seq = [next_char() for x in xrange(uescapes[char])]
 125                     try:
 126                         write(unichr(int(''.join(seq), 16)))
 127                     except ValueError:
 128                         raise TemplateSyntaxError('invalid unicode codepoint',
 129                                                   lineno, filename)
 130                 elif char == 'N':
 131                     if next_char() != '{':
 132                         raise TemplateSyntaxError('no name for codepoint',
 133                                                   lineno, filename)
 134                     seq = []
 135                     while 1:
 136                         char = next_char()
 137                         if char == '}':
 138                             break
 139                         seq.append(char)
 140                     try:
 141                         write(unicodedata.lookup(u''.join(seq)))
 142                     except KeyError:
 143                         raise TemplateSyntaxError('unknown character name',
 144                                                   lineno, filename)
 145                 else:
 146                     write('\\' + char)
 147             else:
 148                 write(char)
 149     except StopIteration:
 150         raise TemplateSyntaxError('invalid string escape', lineno, filename)
 151     return u''.join(result)
 152
 153
def unescape_regex(s):
    """
    Unescape rules for regular expressions.

    NOTE(review): as written this function returns the characters of
    `s` joined into a unicode string without changing them.  `in_escape`
    starts out false and nothing in the loop ever sets it, so the escape
    branch below is never entered.  That branch also reads `safe_chars`,
    which is not defined anywhere in the visible module.  Confirm the
    intended behavior before relying on this function.
    """
    buffer = []
    write = buffer.append
    in_escape = False
    for char in s:
        # never true at the moment, see the note in the docstring
        if in_escape:
            in_escape = False
            if char not in safe_chars:
                write('\\' + char)
                continue
        write(char)
    return u''.join(buffer)
 169
 170
 171 class Failure(object):
 172     """
 173     Class that raises a `TemplateSyntaxError` if called.
 174     Used by the `Lexer` to specify known errors.
 175     """
 176
 177     def __init__(self, message, cls=TemplateSyntaxError):
 178         self.message = message
 179         self.error_class = cls
 180
 181     def __call__(self, lineno, filename):
 182         raise self.error_class(self.message, lineno, filename)
 183
 184
 185 class LexerMeta(type):
 186     """
 187     Metaclass for the lexer that caches instances for
 188     the same configuration in a weak value dictionary.
 189     """
 190
 191     def __call__(cls, environment):
 192         key = hash((environment.block_start_string,
 193                     environment.block_end_string,
 194                     environment.variable_start_string,
 195                     environment.variable_end_string,
 196                     environment.comment_start_string,
 197                     environment.comment_end_string,
 198                     environment.line_statement_prefix,
 199                     environment.trim_blocks))
 200
 201         # use the cached lexer if possible
 202         if key in _lexer_cache:
 203             return _lexer_cache[key]
 204
 205         # create a new lexer and cache it
 206         lexer = type.__call__(cls, environment)
 207         _lexer_cache[key] = lexer
 208         return lexer
 209
 210
 211 class Lexer(object):
 212     """
 213     Class that implements a lexer for a given environment. Automatically
 214     created by the environment class, usually you don't have to do that.
 215
 216     Note that the lexer is not automatically bound to an environment.
 217     Multiple environments can share the same lexer.
 218     """
 219
 220     __metaclass__ = LexerMeta
 221
 222     def __init__(self, environment):
 223         # shortcuts
 224         c = lambda x: re.compile(x, re.M | re.S)
 225         e = re.escape
 226
 227         # lexing rules for tags
 228         tag_rules = [
 229             (whitespace_re, None, None),
 230             (float_re, 'float', None),
 231             (integer_re, 'integer', None),
 232             (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords, key=lambda x: -len(x)))),
 233              'keyword', None),
 234             (name_re, 'name', None),
 235             (string_re, 'string', None),
 236             (operator_re, 'operator', None)
 237         ]
 238
 239         # assamble the root lexing rule. because "|" is ungreedy
 240         # we have to sort by length so that the lexer continues working
 241         # as expected when we have parsing rules like <% for block and
 242         # <%= for variables. (if someone wants asp like syntax)
 243         # variables are just part of the rules if variable processing
 244         # is required.
 245         root_tag_rules = [
 246             ('comment',     environment.comment_start_string),
 247             ('block',       environment.block_start_string),
 248             ('variable',    environment.variable_start_string)
 249         ]
 250         root_tag_rules.sort(key=lambda x: len(x[1]))
 251
 252         # now escape the rules.  This is done here so that the escape
 253         # signs don't count for the lengths of the tags.
 254         root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]
 255
 256         # if we have a line statement prefix we need an extra rule for
 257         # that.  We add this rule *after* all the others.
 258         if environment.line_statement_prefix is not None:
 259             prefix = e(environment.line_statement_prefix)
 260             root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))
 261
 262         # block suffix if trimming is enabled
 263         block_suffix_re = environment.trim_blocks and '\\n?' or ''
 264
 265         # global lexing rules
 266         self.rules = {
 267             'root': [
 268                 # directives
 269                 (c('(.*?)(?:%s)' % '|'.join(
 270                     ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
 271                         e(environment.block_start_string),
 272                         e(environment.block_start_string),
 273                         e(environment.block_end_string)
 274                     )] + [
 275                         '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
 276                         for n, r in root_tag_rules
 277                     ])), ('data', '#bygroup'), '#bygroup'),
 278                 # data
 279                 (c('.+'), 'data', None)
 280             ],
 281             # comments
 282             'comment_begin': [
 283                 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
 284                     e(environment.comment_end_string),
 285                     e(environment.comment_end_string),
 286                     block_suffix_re
 287                 )), ('comment', 'comment_end'), '#pop'),
 288                 (c('(.)'), (Failure('Missing end of comment tag'),), None)
 289             ],
 290             # blocks
 291             'block_begin': [
 292                 (c('(?:\-%s\s*|%s)%s' % (
 293                     e(environment.block_end_string),
 294                     e(environment.block_end_string),
 295                     block_suffix_re
 296                 )), 'block_end', '#pop'),
 297             ] + tag_rules,
 298             # variables
 299             'variable_begin': [
 300                 (c('\-%s\s*|%s' % (
 301                     e(environment.variable_end_string),
 302                     e(environment.variable_end_string)
 303                 )), 'variable_end', '#pop')
 304             ] + tag_rules,
 305             # raw block
 306             'raw_begin': [
 307                 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
 308                     e(environment.block_start_string),
 309                     e(environment.block_start_string),
 310                     e(environment.block_end_string),
 311                     e(environment.block_end_string),
 312                     block_suffix_re
 313                 )), ('data', 'raw_end'), '#pop'),
 314                 (c('(.)'), (Failure('Missing end of raw directive'),), None)
 315             ],
 316             # line statements
 317             'linestatement_begin': [
 318                 (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
 319             ] + tag_rules
 320         }
 321
 322     def tokenize(self, source, filename=None):
 323         """Works like `tokeniter` but returns a tokenstream of tokens and not
 324         a generator or token tuples. Additionally all token values are already
 325         converted into types and postprocessed. For example keywords are
 326         already keyword tokens, not named tokens, comments are removed,
 327         integers and floats converted, strings unescaped etc.
 328         """
 329         source = unicode(source)
 330         def generate():
 331             for lineno, token, value in self.tokeniter(source, filename):
 332                 if token in ('comment_begin', 'comment', 'comment_end'):
 333                     continue
 334                 elif token == 'linestatement_begin':
 335                     token = 'block_begin'
 336                 elif token == 'linestatement_end':
 337                     token = 'block_end'
 338                 elif token == 'data':
 339                     try:
 340                         value = str(value)
 341                     except UnicodeError:
 342                         pass
 343                 elif token == 'keyword':
 344                     token = value
 345                 elif token == 'name':
 346                     value = str(value)
 347                 elif token == 'string':
 348                     value = unescape_string(lineno, filename, value[1:-1])
 349                     try:
 350                         value = str(value)
 351                     except UnicodeError:
 352                         pass
 353                 elif token == 'integer':
 354                     value = int(value)
 355                 elif token == 'float':
 356                     value = float(value)
 357                 elif token == 'operator':
 358                     token = operators[value]
 359                 yield Token(lineno, token, value)
 360         return TokenStream(generate(), filename)
 361
 362     def tokeniter(self, source, filename=None):
 363         """
 364         This method tokenizes the text and returns the tokens in a generator.
 365         Use this method if you just want to tokenize a template. The output
 366         you get is not compatible with the input the jinja parser wants. The
 367         parser uses the `tokenize` function with returns a `TokenStream` and
 368         keywords instead of just names.
 369         """
 370         source = '\n'.join(source.splitlines())
 371         pos = 0
 372         lineno = 1
 373         stack = ['root']
 374         statetokens = self.rules['root']
 375         source_length = len(source)
 376
 377         balancing_stack = []
 378
 379         while 1:
 380             # tokenizer loop
 381             for regex, tokens, new_state in statetokens:
 382                 m = regex.match(source, pos)
 383                 # if no match we try again with the next rule
 384                 if m is None:
 385                     continue
 386
 387                 # we only match blocks and variables if brances / parentheses
 388                 # are balanced. continue parsing with the lower rule which
 389                 # is the operator rule. do this only if the end tags look
 390                 # like operators
 391                 if balancing_stack and \
 392                    tokens in ('variable_end', 'block_end',
 393                               'linestatement_end'):
 394                     continue
 395
 396                 # tuples support more options
 397                 if isinstance(tokens, tuple):
 398                     for idx, token in enumerate(tokens):
 399                         # hidden group
 400                         if token is None:
 401                             g = m.group(idx)
 402                             if g:
 403                                 lineno += g.count('\n')
 404                             continue
 405                         # failure group
 406                         elif token.__class__ is Failure:
 407                             raise token(lineno, filename)
 408                         # bygroup is a bit more complex, in that case we
 409                         # yield for the current token the first named
 410                         # group that matched
 411                         elif token == '#bygroup':
 412                             for key, value in m.groupdict().iteritems():
 413                                 if value is not None:
 414                                     yield lineno, key, value
 415                                     lineno += value.count('\n')
 416                                     break
 417                             else:
 418                                 raise RuntimeError('%r wanted to resolve '
 419                                                    'the token dynamically'
 420                                                    ' but no group matched'
 421                                                    % regex)
 422                         # normal group
 423                         else:
 424                             data = m.group(idx + 1)
 425                             if data:
 426                                 yield lineno, token, data
 427                             lineno += data.count('\n')
 428
 429                 # strings as token just are yielded as it.
 430                 else:
 431                     data = m.group()
 432                     # update brace/parentheses balance
 433                     if tokens == 'operator':
 434                         if data == '{':
 435                             balancing_stack.append('}')
 436                         elif data == '(':
 437                             balancing_stack.append(')')
 438                         elif data == '[':
 439                             balancing_stack.append(']')
 440                         elif data in ('}', ')', ']'):
 441                             if not balancing_stack:
 442                                 raise TemplateSyntaxError('unexpected "%s"' %
 443                                                           data, lineno,
 444                                                           filename)
 445                             expected_op = balancing_stack.pop()
 446                             if expected_op != data:
 447                                 raise TemplateSyntaxError('unexpected "%s", '
 448                                                           'expected "%s"' %
 449                                                           (data, expected_op),
 450                                                           lineno, filename)
 451                     # yield items
 452                     if tokens is not None:
 453                         yield lineno, tokens, data
 454                     lineno += data.count('\n')
 455
 456                 # fetch new position into new variable so that we can check
 457                 # if there is a internal parsing error which would result
 458                 # in an infinite loop
 459                 pos2 = m.end()
 460
 461                 # handle state changes
 462                 if new_state is not None:
 463                     # remove the uppermost state
 464                     if new_state == '#pop':
 465                         stack.pop()
 466                     # resolve the new state by group checking
 467                     elif new_state == '#bygroup':
 468                         for key, value in m.groupdict().iteritems():
 469                             if value is not None:
 470                                 stack.append(key)
 471                                 break
 472                         else:
 473                             raise RuntimeError('%r wanted to resolve the '
 474                                                'new state dynamically but'
 475                                                ' no group matched' %
 476                                                regex)
 477                     # direct state name given
 478                     else:
 479                         stack.append(new_state)
 480                     statetokens = self.rules[stack[-1]]
 481                 # we are still at the same position and no stack change.
 482                 # this means a loop without break condition, avoid that and
 483                 # raise error
 484                 elif pos2 == pos:
 485                     raise RuntimeError('%r yielded empty string without '
 486                                        'stack change' % regex)
 487                 # publish new function and start again
 488                 pos = pos2
 489                 break
 490             # if loop terminated without break we havn't found a single match
 491             # either we are at the end of the file or we have a problem
 492             else:
 493                 # end of text
 494                 if pos >= source_length:
 495                     return
 496                 # something went wrong
 497                 raise TemplateSyntaxError('unexpected char %r at %d' %
 498                                           (source[pos], pos), lineno,
 499                                           filename)