jinja2/lexer.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     jinja2.lexer
   4     ~~~~~~~~~~~~
   5
   6     This module implements a Jinja / Python combination lexer. The
   7     `Lexer` class provided by this module is used to do some preprocessing
   8     for Jinja.
   9
  10     On the one hand it filters out invalid operators like the bitshift
  11     operators we don't allow in templates. On the other hand it separates
  12     template code and python code in expressions.
  13
  14     :copyright: (c) 2009 by the Jinja Team.
  15     :license: BSD, see LICENSE for more details.
  16 """
  17 import re
  18 from operator import itemgetter
  19 from collections import deque
  20 from jinja2.exceptions import TemplateSyntaxError
  21 from jinja2.utils import LRUCache
  22
  23
# cache for the lexers. Exists in order to be able to have multiple
# environments with the same lexer.  `get_lexer` keys it by the
# environment's delimiter settings.
# NOTE(review): 50 is presumably the maximum number of cached lexers --
# confirm against jinja2.utils.LRUCache.
_lexer_cache = LRUCache(50)

# static regular expressions
# one or more whitespace characters (unicode aware)
whitespace_re = re.compile(r'\s+', re.U)
# a single or double quoted string literal; a backslash escapes the
# following character (including a newline because of re.S)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
# a run of decimal digits
integer_re = re.compile(r'\d+')
# an ASCII identifier: letter or underscore followed by letters, digits
# or underscores, delimited by word boundaries
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
# digits, a dot and digits again; must not directly follow another dot
float_re = re.compile(r'(?<!\.)\d+\.\d+')
# any newline sequence; "\r\n" is tried first so it matches as one unit
newline_re = re.compile(r'(\r\n|\r|\n)')
  36
# intern the token type names and keep references to them.  The stream
# classes below compare token types by identity (``is TOKEN_EOF``), which
# works because `Token` interns its type string as well.
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
# token types produced by the generic tag rules
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
# delimiters and template data; the ``*_begin`` names double as the keys
# of the lexer's rule table
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_DATA = intern('data')
# placeholder type of the token a fresh `TokenStream` starts with
TOKEN_INITIAL = intern('initial')
# type of the token a `TokenStream` ends with
TOKEN_EOF = intern('eof')
  84
  85 # bind operators to token types
  86 operators = {
  87     '+':            TOKEN_ADD,
  88     '-':            TOKEN_SUB,
  89     '/':            TOKEN_DIV,
  90     '//':           TOKEN_FLOORDIV,
  91     '*':            TOKEN_MUL,
  92     '%':            TOKEN_MOD,
  93     '**':           TOKEN_POW,
  94     '~':            TOKEN_TILDE,
  95     '[':            TOKEN_LBRACKET,
  96     ']':            TOKEN_RBRACKET,
  97     '(':            TOKEN_LPAREN,
  98     ')':            TOKEN_RPAREN,
  99     '{':            TOKEN_LBRACE,
 100     '}':            TOKEN_RBRACE,
 101     '==':           TOKEN_EQ,
 102     '!=':           TOKEN_NE,
 103     '>':            TOKEN_GT,
 104     '>=':           TOKEN_GTEQ,
 105     '<':            TOKEN_LT,
 106     '<=':           TOKEN_LTEQ,
 107     '=':            TOKEN_ASSIGN,
 108     '.':            TOKEN_DOT,
 109     ':':            TOKEN_COLON,
 110     '|':            TOKEN_PIPE,
 111     ',':            TOKEN_COMMA,
 112     ';':            TOKEN_SEMICOLON
 113 }
 114
 115 reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
 116 assert len(operators) == len(reverse_operators), 'operators dropped'
 117 operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
 118                          sorted(operators, key=lambda x: -len(x))))
 119
 120
 121 def count_newlines(value):
 122     """Count the number of newline characters in the string.  This is
 123     useful for extensions that filter a stream.
 124     """
 125     return len(newline_re.findall(value))
 126
 127
 128 class Failure(object):
 129     """Class that raises a `TemplateSyntaxError` if called.
 130     Used by the `Lexer` to specify known errors.
 131     """
 132
 133     def __init__(self, message, cls=TemplateSyntaxError):
 134         self.message = message
 135         self.error_class = cls
 136
 137     def __call__(self, lineno, filename):
 138         raise self.error_class(self.message, lineno, filename)
 139
 140
 141 class Token(tuple):
 142     """Token class."""
 143     __slots__ = ()
 144     lineno, type, value = (property(itemgetter(x)) for x in range(3))
 145
 146     def __new__(cls, lineno, type, value):
 147         return tuple.__new__(cls, (lineno, intern(str(type)), value))
 148
 149     def __str__(self):
 150         if self.type in reverse_operators:
 151             return reverse_operators[self.type]
 152         elif self.type == 'name':
 153             return self.value
 154         return self.type
 155
 156     def test(self, expr):
 157         """Test a token against a token expression.  This can either be a
 158         token type or ``'token_type:token_value'``.  This can only test
 159         against string values and types.
 160         """
 161         # here we do a regular string equality check as test_any is usually
 162         # passed an iterable of not interned strings.
 163         if self.type == expr:
 164             return True
 165         elif ':' in expr:
 166             return expr.split(':', 1) == [self.type, self.value]
 167         return False
 168
 169     def test_any(self, *iterable):
 170         """Test against multiple token expressions."""
 171         for expr in iterable:
 172             if self.test(expr):
 173                 return True
 174         return False
 175
 176     def __repr__(self):
 177         return 'Token(%r, %r, %r)' % (
 178             self.lineno,
 179             self.type,
 180             self.value
 181         )
 182
 183
 184 class TokenStreamIterator(object):
 185     """The iterator for tokenstreams.  Iterate over the stream
 186     until the eof token is reached.
 187     """
 188
 189     def __init__(self, stream):
 190         self.stream = stream
 191
 192     def __iter__(self):
 193         return self
 194
 195     def next(self):
 196         token = self.stream.current
 197         if token.type is TOKEN_EOF:
 198             self.stream.close()
 199             raise StopIteration()
 200         self.stream.next()
 201         return token
 202
 203
 204 class TokenStream(object):
 205     """A token stream is an iterable that yields :class:`Token`\s.  The
 206     parser however does not iterate over it but calls :meth:`next` to go
 207     one token ahead.  The current active token is stored as :attr:`current`.
 208     """
 209
 210     def __init__(self, generator, name, filename):
 211         self._next = iter(generator).next
 212         self._pushed = deque()
 213         self.name = name
 214         self.filename = filename
 215         self.closed = False
 216         self.current = Token(1, TOKEN_INITIAL, '')
 217         self.next()
 218
 219     def __iter__(self):
 220         return TokenStreamIterator(self)
 221
 222     def __nonzero__(self):
 223         """Are we at the end of the stream?"""
 224         return bool(self._pushed) or self.current.type is not TOKEN_EOF
 225
 226     eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)
 227
 228     def push(self, token):
 229         """Push a token back to the stream."""
 230         self._pushed.append(token)
 231
 232     def look(self):
 233         """Look at the next token."""
 234         old_token = self.next()
 235         result = self.current
 236         self.push(result)
 237         self.current = old_token
 238         return result
 239
 240     def skip(self, n=1):
 241         """Got n tokens ahead."""
 242         for x in xrange(n):
 243             self.next()
 244
 245     def next_if(self, expr):
 246         """Perform the token test and return the token if it matched.
 247         Otherwise the return value is `None`.
 248         """
 249         if self.current.test(expr):
 250             return self.next()
 251
 252     def skip_if(self, expr):
 253         """Like :meth:`next_if` but only returns `True` or `False`."""
 254         return self.next_if(expr) is not None
 255
 256     def next(self):
 257         """Go one token ahead and return the old one"""
 258         rv = self.current
 259         if self._pushed:
 260             self.current = self._pushed.popleft()
 261         elif self.current.type is not TOKEN_EOF:
 262             try:
 263                 self.current = self._next()
 264             except StopIteration:
 265                 self.close()
 266         return rv
 267
 268     def close(self):
 269         """Close the stream."""
 270         self.current = Token(self.current.lineno, TOKEN_EOF, '')
 271         self._next = None
 272         self.closed = True
 273
 274     def expect(self, expr):
 275         """Expect a given token type and return it.  This accepts the same
 276         argument as :meth:`jinja2.lexer.Token.test`.
 277         """
 278         if not self.current.test(expr):
 279             if ':' in expr:
 280                 expr = expr.split(':')[1]
 281             if self.current.type is TOKEN_EOF:
 282                 raise TemplateSyntaxError('unexpected end of template, '
 283                                           'expected %r.' % expr,
 284                                           self.current.lineno,
 285                                           self.name, self.filename)
 286             raise TemplateSyntaxError("expected token %r, got %r" %
 287                                       (expr, str(self.current)),
 288                                       self.current.lineno,
 289                                       self.name, self.filename)
 290         try:
 291             return self.current
 292         finally:
 293             self.next()
 294
 295
 296 def get_lexer(environment):
 297     """Return a lexer which is probably cached."""
 298     key = (environment.block_start_string,
 299            environment.block_end_string,
 300            environment.variable_start_string,
 301            environment.variable_end_string,
 302            environment.comment_start_string,
 303            environment.comment_end_string,
 304            environment.line_statement_prefix,
 305            environment.trim_blocks,
 306            environment.newline_sequence)
 307     lexer = _lexer_cache.get(key)
 308     if lexer is None:
 309         lexer = Lexer(environment)
 310         _lexer_cache[key] = lexer
 311     return lexer
 312
 313
 314 class Lexer(object):
 315     """Class that implements a lexer for a given environment. Automatically
 316     created by the environment class, usually you don't have to do that.
 317
 318     Note that the lexer is not automatically bound to an environment.
 319     Multiple environments can share the same lexer.
 320     """
 321
    def __init__(self, environment):
        """Build the rule tables from the environment's syntax settings.

        Reads the block, variable and comment delimiters, the line
        statement prefix, `trim_blocks` and `newline_sequence` from
        `environment`.  The result is stored in `self.rules`, a dict that
        maps a lexer state to a list of ``(regex, tokens, new_state)``
        rules, which is the shape `tokeniter` unpacks.
        """
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags; shared by the block, variable and line
        # statement states below
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule.  A regex alternation takes the
        # first alternative that matches, so we have to sort by length
        # (longest start string first) so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        root_tag_rules = [
            ('comment',     environment.comment_start_string),
            ('block',       environment.block_start_string),
            ('variable',    environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules.  This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that.  It is created only now, after the sorting and escaping of
        # the other rules, and inserted at the front of the list so that it
        # becomes the first of the generated alternatives.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled: an optional newline that is
        # consumed together with the end of a block, comment or endraw tag
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives: everything up to the next start tag is
                # template data; the named group that matched decides both
                # the token type and the state that is entered ('#bygroup').
                # The raw_begin alternative (block start, "raw", block end)
                # comes first.
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data: whatever is left when no further start tag follows
                (c('.+'), 'data', None)
            ],
            # comments: the text up to the comment end string, then back
            # to the previous state
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                # reached only if the rule above did not match, i.e. no
                # comment end follows
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks: the end tag is tried before the generic tag rules
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables: like blocks, but without the block suffix
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block: everything up to the endraw tag is plain data
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                # reached only if no endraw tag follows
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements: ended by the end of the line (or of the
            # source), optionally preceded by whitespace
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules
        }
 421
 422     def _normalize_newlines(self, value):
 423         """Called for strings and template data to normlize it to unicode."""
 424         return newline_re.sub(self.newline_sequence, value)
 425
 426     def tokenize(self, source, name=None, filename=None, state=None):
 427         """Calls tokeniter + tokenize and wraps it in a token stream.
 428         """
 429         stream = self.tokeniter(source, name, filename, state)
 430         return TokenStream(self.wrap(stream, name, filename), name, filename)
 431
 432     def wrap(self, stream, name=None, filename=None):
 433         """This is called with the stream as returned by `tokenize` and wraps
 434         every token in a :class:`Token` and converts the value.
 435         """
 436         for lineno, token, value in stream:
 437             if token in ('comment_begin', 'comment', 'comment_end',
 438                          'whitespace'):
 439                 continue
 440             elif token == 'linestatement_begin':
 441                 token = 'block_begin'
 442             elif token == 'linestatement_end':
 443                 token = 'block_end'
 444             # we are not interested in those tokens in the parser
 445             elif token in ('raw_begin', 'raw_end'):
 446                 continue
 447             elif token == 'data':
 448                 value = self._normalize_newlines(value)
 449             elif token == 'keyword':
 450                 token = value
 451             elif token == 'name':
 452                 value = str(value)
 453             elif token == 'string':
 454                 # try to unescape string
 455                 try:
 456                     value = self._normalize_newlines(value[1:-1]) \
 457                         .encode('ascii', 'backslashreplace') \
 458                         .decode('unicode-escape')
 459                 except Exception, e:
 460                     msg = str(e).split(':')[-1].strip()
 461                     raise TemplateSyntaxError(msg, lineno, name, filename)
 462                 # if we can express it as bytestring (ascii only)
 463                 # we do that for support of semi broken APIs
 464                 # as datetime.datetime.strftime
 465                 try:
 466                     value = str(value)
 467                 except UnicodeError:
 468                     pass
 469             elif token == 'integer':
 470                 value = int(value)
 471             elif token == 'float':
 472                 value = float(value)
 473             elif token == 'operator':
 474                 token = operators[value]
 475             yield Token(lineno, token, value)
 476
    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.

        Yields ``(lineno, token, value)`` tuples.  `state` may be
        ``'variable'`` or ``'block'`` to start lexing inside such a tag
        instead of in template data.  Raises `TemplateSyntaxError` for
        unbalanced brackets and for characters no rule matches.
        """
        # all line endings become "\n"; a trailing newline is dropped
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        # stack of lexer states; the topmost one selects the active rules
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        # closing brackets we still expect, innermost last
        balancing_stack = []

        while 1:
            # tokenizer loop: try the rules of the current state in order
            # and use the first one that matches at the current position
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match the end of blocks, variables and line
                # statements if braces / parentheses are balanced.  While a
                # bracket is still open the end rule is skipped and lexing
                # continues with the following rules of the state, so an
                # end tag that looks like an operator is lexed as one.
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options: one entry per regex group
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group: calling the Failure object raises
                        # the error itself
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group: empty matches are not yielded
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # a plain string as token: the whole match is yielded as is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            # a closing bracket has to match the innermost
                            # bracket that is still open
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking: the name of
                    # the group that matched is the name of the new state
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again with the first
                # rule of the (possibly changed) state
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match: either we are at the end of the file or we have a
            # problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)