jinja2/lexer.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     jinja2.lexer
   4     ~~~~~~~~~~~~
   5
   6     This module implements a Jinja / Python combination lexer. The
   7     `Lexer` class provided by this module is used to do some preprocessing
   8     for Jinja.
   9
  10     On the one hand it filters out invalid operators like the bitshift
  11     operators we don't allow in templates. On the other hand it separates
  12     template code and python code in expressions.
  13
  14     :copyright: 2007-2008 by Armin Ronacher.
  15     :license: BSD, see LICENSE for more details.
  16 """
  17 import re
  18 from operator import itemgetter
  19 from collections import deque
  20 from jinja2.exceptions import TemplateSyntaxError
  21 from jinja2.utils import LRUCache
  22
  23
  24 # cache for the lexers. Exists in order to be able to have multiple
  25 # environments with the same lexer
  26 _lexer_cache = LRUCache(50)
  27
  28 # static regular expressions
  29 whitespace_re = re.compile(r'\s+', re.U)
  30 string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
  31                        r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
  32 integer_re = re.compile(r'\d+')
  33 name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
  34 float_re = re.compile(r'(?<!\.)\d+\.\d+')
  35 newline_re = re.compile(r'(\r\n|\r|\n)')
  36
  37 # bind operators to token types
  38 operators = {
  39     '+':            'add',
  40     '-':            'sub',
  41     '/':            'div',
  42     '//':           'floordiv',
  43     '*':            'mul',
  44     '%':            'mod',
  45     '**':           'pow',
  46     '~':            'tilde',
  47     '[':            'lbracket',
  48     ']':            'rbracket',
  49     '(':            'lparen',
  50     ')':            'rparen',
  51     '{':            'lbrace',
  52     '}':            'rbrace',
  53     '==':           'eq',
  54     '!=':           'ne',
  55     '>':            'gt',
  56     '>=':           'gteq',
  57     '<':            'lt',
  58     '<=':           'lteq',
  59     '=':            'assign',
  60     '.':            'dot',
  61     ':':            'colon',
  62     '|':            'pipe',
  63     ',':            'comma',
  64     ';':            'semicolon'
  65 }
  66
  67 reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
  68 assert len(operators) == len(reverse_operators), 'operators dropped'
  69 operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
  70                          sorted(operators, key=lambda x: -len(x))))
  71
  72
  73 def count_newlines(value):
  74     """Count the number of newline characters in the string.  This is
  75     useful for extensions that filter a stream.
  76     """
  77     return len(newline_re.findall(value))
  78
  79
  80 class Failure(object):
  81     """Class that raises a `TemplateSyntaxError` if called.
  82     Used by the `Lexer` to specify known errors.
  83     """
  84
  85     def __init__(self, message, cls=TemplateSyntaxError):
  86         self.message = message
  87         self.error_class = cls
  88
  89     def __call__(self, lineno, filename):
  90         raise self.error_class(self.message, lineno, filename)
  91
  92
  93 class Token(tuple):
  94     """Token class."""
  95     __slots__ = ()
  96     lineno, type, value = (property(itemgetter(x)) for x in range(3))
  97
  98     def __new__(cls, lineno, type, value):
  99         return tuple.__new__(cls, (lineno, intern(str(type)), value))
 100
 101     def __str__(self):
 102         if self.type in reverse_operators:
 103             return reverse_operators[self.type]
 104         elif self.type is 'name':
 105             return self.value
 106         return self.type
 107
 108     def test(self, expr):
 109         """Test a token against a token expression.  This can either be a
 110         token type or ``'token_type:token_value'``.  This can only test
 111         against string values and types.
 112         """
 113         # here we do a regular string equality check as test_any is usually
 114         # passed an iterable of not interned strings.
 115         if self.type == expr:
 116             return True
 117         elif ':' in expr:
 118             return expr.split(':', 1) == [self.type, self.value]
 119         return False
 120
 121     def test_any(self, *iterable):
 122         """Test against multiple token expressions."""
 123         for expr in iterable:
 124             if self.test(expr):
 125                 return True
 126         return False
 127
 128     def __repr__(self):
 129         return 'Token(%r, %r, %r)' % (
 130             self.lineno,
 131             self.type,
 132             self.value
 133         )
 134
 135
 136 class TokenStreamIterator(object):
 137     """The iterator for tokenstreams.  Iterate over the stream
 138     until the eof token is reached.
 139     """
 140
 141     def __init__(self, stream):
 142         self.stream = stream
 143
 144     def __iter__(self):
 145         return self
 146
 147     def next(self):
 148         token = self.stream.current
 149         if token.type == 'eof':
 150             self.stream.close()
 151             raise StopIteration()
 152         self.stream.next()
 153         return token
 154
 155
 156 class TokenStream(object):
 157     """A token stream is an iterable that yields :class:`Token`\s.  The
 158     parser however does not iterate over it but calls :meth:`next` to go
 159     one token ahead.  The current active token is stored as :attr:`current`.
 160     """
 161
 162     def __init__(self, generator, name, filename):
 163         self._next = iter(generator).next
 164         self._pushed = deque()
 165         self.name = name
 166         self.filename = filename
 167         self.closed = False
 168         self.current = Token(1, 'initial', '')
 169         self.next()
 170
 171     def __iter__(self):
 172         return TokenStreamIterator(self)
 173
 174     def __nonzero__(self):
 175         """Are we at the end of the stream?"""
 176         return bool(self._pushed) or self.current.type != 'eof'
 177
 178     eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)
 179
 180     def push(self, token):
 181         """Push a token back to the stream."""
 182         self._pushed.append(token)
 183
 184     def look(self):
 185         """Look at the next token."""
 186         old_token = self.next()
 187         result = self.current
 188         self.push(result)
 189         self.current = old_token
 190         return result
 191
 192     def skip(self, n=1):
 193         """Got n tokens ahead."""
 194         for x in xrange(n):
 195             self.next()
 196
 197     def next_if(self, expr):
 198         """Perform the token test and return the token if it matched.
 199         Otherwise the return value is `None`.
 200         """
 201         if self.current.test(expr):
 202             return self.next()
 203
 204     def skip_if(self, expr):
 205         """Like :meth:`next_if` but only returns `True` or `False`."""
 206         return self.next_if(expr) is not None
 207
 208     def next(self):
 209         """Go one token ahead and return the old one"""
 210         rv = self.current
 211         if self._pushed:
 212             self.current = self._pushed.popleft()
 213         elif self.current.type is not 'eof':
 214             try:
 215                 self.current = self._next()
 216             except StopIteration:
 217                 self.close()
 218         return rv
 219
 220     def close(self):
 221         """Close the stream."""
 222         self.current = Token(self.current.lineno, 'eof', '')
 223         self._next = None
 224         self.closed = True
 225
 226     def expect(self, expr):
 227         """Expect a given token type and return it.  This accepts the same
 228         argument as :meth:`jinja2.lexer.Token.test`.
 229         """
 230         if not self.current.test(expr):
 231             if ':' in expr:
 232                 expr = expr.split(':')[1]
 233             if self.current.type is 'eof':
 234                 raise TemplateSyntaxError('unexpected end of template, '
 235                                           'expected %r.' % expr,
 236                                           self.current.lineno,
 237                                           self.name, self.filename)
 238             raise TemplateSyntaxError("expected token %r, got %r" %
 239                                       (expr, str(self.current)),
 240                                       self.current.lineno,
 241                                       self.name, self.filename)
 242         try:
 243             return self.current
 244         finally:
 245             self.next()
 246
 247
 248 def get_lexer(environment):
 249     """Return a lexer which is probably cached."""
 250     key = (environment.block_start_string,
 251            environment.block_end_string,
 252            environment.variable_start_string,
 253            environment.variable_end_string,
 254            environment.comment_start_string,
 255            environment.comment_end_string,
 256            environment.line_statement_prefix,
 257            environment.trim_blocks,
 258            environment.newline_sequence)
 259     lexer = _lexer_cache.get(key)
 260     if lexer is None:
 261         lexer = Lexer(environment)
 262         _lexer_cache[key] = lexer
 263     return lexer
 264
 265
 266 class Lexer(object):
 267     """Class that implements a lexer for a given environment. Automatically
 268     created by the environment class, usually you don't have to do that.
 269
 270     Note that the lexer is not automatically bound to an environment.
 271     Multiple environments can share the same lexer.
 272     """
 273
 274     def __init__(self, environment):
 275         # shortcuts
 276         c = lambda x: re.compile(x, re.M | re.S)
 277         e = re.escape
 278
 279         # lexing rules for tags
 280         tag_rules = [
 281             (whitespace_re, 'whitespace', None),
 282             (float_re, 'float', None),
 283             (integer_re, 'integer', None),
 284             (name_re, 'name', None),
 285             (string_re, 'string', None),
 286             (operator_re, 'operator', None)
 287         ]
 288
 289         # assamble the root lexing rule. because "|" is ungreedy
 290         # we have to sort by length so that the lexer continues working
 291         # as expected when we have parsing rules like <% for block and
 292         # <%= for variables. (if someone wants asp like syntax)
 293         # variables are just part of the rules if variable processing
 294         # is required.
 295         root_tag_rules = [
 296             ('comment',     environment.comment_start_string),
 297             ('block',       environment.block_start_string),
 298             ('variable',    environment.variable_start_string)
 299         ]
 300         root_tag_rules.sort(key=lambda x: -len(x[1]))
 301
 302         # now escape the rules.  This is done here so that the escape
 303         # signs don't count for the lengths of the tags.
 304         root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]
 305
 306         # if we have a line statement prefix we need an extra rule for
 307         # that.  We add this rule *after* all the others.
 308         if environment.line_statement_prefix is not None:
 309             prefix = e(environment.line_statement_prefix)
 310             root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))
 311
 312         # block suffix if trimming is enabled
 313         block_suffix_re = environment.trim_blocks and '\\n?' or ''
 314
 315         self.newline_sequence = environment.newline_sequence
 316
 317         # global lexing rules
 318         self.rules = {
 319             'root': [
 320                 # directives
 321                 (c('(.*?)(?:%s)' % '|'.join(
 322                     ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
 323                         e(environment.block_start_string),
 324                         e(environment.block_start_string),
 325                         e(environment.block_end_string)
 326                     )] + [
 327                         '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
 328                         for n, r in root_tag_rules
 329                     ])), ('data', '#bygroup'), '#bygroup'),
 330                 # data
 331                 (c('.+'), 'data', None)
 332             ],
 333             # comments
 334             'comment_begin': [
 335                 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
 336                     e(environment.comment_end_string),
 337                     e(environment.comment_end_string),
 338                     block_suffix_re
 339                 )), ('comment', 'comment_end'), '#pop'),
 340                 (c('(.)'), (Failure('Missing end of comment tag'),), None)
 341             ],
 342             # blocks
 343             'block_begin': [
 344                 (c('(?:\-%s\s*|%s)%s' % (
 345                     e(environment.block_end_string),
 346                     e(environment.block_end_string),
 347                     block_suffix_re
 348                 )), 'block_end', '#pop'),
 349             ] + tag_rules,
 350             # variables
 351             'variable_begin': [
 352                 (c('\-%s\s*|%s' % (
 353                     e(environment.variable_end_string),
 354                     e(environment.variable_end_string)
 355                 )), 'variable_end', '#pop')
 356             ] + tag_rules,
 357             # raw block
 358             'raw_begin': [
 359                 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
 360                     e(environment.block_start_string),
 361                     e(environment.block_start_string),
 362                     e(environment.block_end_string),
 363                     e(environment.block_end_string),
 364                     block_suffix_re
 365                 )), ('data', 'raw_end'), '#pop'),
 366                 (c('(.)'), (Failure('Missing end of raw directive'),), None)
 367             ],
 368             # line statements
 369             'linestatement_begin': [
 370                 (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
 371             ] + tag_rules
 372         }
 373
 374     def _normalize_newlines(self, value):
 375         """Called for strings and template data to normlize it to unicode."""
 376         return newline_re.sub(self.newline_sequence, value)
 377
 378     def tokenize(self, source, name=None, filename=None, state=None):
 379         """Calls tokeniter + tokenize and wraps it in a token stream.
 380         """
 381         stream = self.tokeniter(source, name, filename, state)
 382         return TokenStream(self.wrap(stream, name, filename), name, filename)
 383
 384     def wrap(self, stream, name=None, filename=None):
 385         """This is called with the stream as returned by `tokenize` and wraps
 386         every token in a :class:`Token` and converts the value.
 387         """
 388         for lineno, token, value in stream:
 389             if token in ('comment_begin', 'comment', 'comment_end',
 390                          'whitespace'):
 391                 continue
 392             elif token == 'linestatement_begin':
 393                 token = 'block_begin'
 394             elif token == 'linestatement_end':
 395                 token = 'block_end'
 396             # we are not interested in those tokens in the parser
 397             elif token in ('raw_begin', 'raw_end'):
 398                 continue
 399             elif token == 'data':
 400                 value = self._normalize_newlines(value)
 401             elif token == 'keyword':
 402                 token = value
 403             elif token == 'name':
 404                 value = str(value)
 405             elif token == 'string':
 406                 # try to unescape string
 407                 try:
 408                     value = self._normalize_newlines(value[1:-1]) \
 409                         .encode('ascii', 'backslashreplace') \
 410                         .decode('unicode-escape')
 411                 except Exception, e:
 412                     msg = str(e).split(':')[-1].strip()
 413                     raise TemplateSyntaxError(msg, lineno, name, filename)
 414                 # if we can express it as bytestring (ascii only)
 415                 # we do that for support of semi broken APIs
 416                 # as datetime.datetime.strftime
 417                 try:
 418                     value = str(value)
 419                 except UnicodeError:
 420                     pass
 421             elif token == 'integer':
 422                 value = int(value)
 423             elif token == 'float':
 424                 value = float(value)
 425             elif token == 'operator':
 426                 token = operators[value]
 427             yield Token(lineno, token, value)
 428
    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator.  Use this method if you just want to tokenize a template.

        `source` is converted to unicode and its lines are re-joined with
        ``'\\n'``, which also drops a trailing line break.  `name` and
        `filename` are only used for error reporting.  `state` may be
        `None` or ``'root'`` to start lexing template data, or
        ``'variable'`` / ``'block'`` to start inside such a tag.

        Yields ``(lineno, token, value)`` tuples with unconverted values.
        Raises `TemplateSyntaxError` for unbalanced brackets and for input
        no rule matches, and `RuntimeError` if a rule is inconsistent.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        # stack of lexer states; the topmost entry selects the active rules
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        # closing brackets that are still expected, innermost last
        balancing_stack = []

        while 1:
            # tokenizer loop: try the rules of the current state in order
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options: one entry per regex group
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        # NOTE(review): `name` is not passed on here, only
                        # `lineno` and `filename` are.
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group: empty matches are not yielded
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token are just yielded as they are.
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)
                    # yield items
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again with the first
                # rule of the (possibly changed) state
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match: either we are at the end of the file or we have a
            # problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
 560                                           (source[pos], pos), lineno,
 561                                           name, filename)