jinja2/lexer.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     jinja2.lexer
   4     ~~~~~~~~~~~~
   5
   6     This module implements a Jinja / Python combination lexer. The
   7     `Lexer` class provided by this module is used to do some preprocessing
   8     for Jinja.
   9
  10     On the one hand it filters out invalid operators like the bitshift
  11     operators we don't allow in templates. On the other hand it separates
  12     template code and python code in expressions.
  13
  14     :copyright: 2007-2008 by Armin Ronacher.
  15     :license: BSD, see LICENSE for more details.
  16 """
  17 import re
  18 import unicodedata
  19 from jinja.datastructure import TokenStream, Token
  20 from jinja.exceptions import TemplateSyntaxError
  21 from weakref import WeakValueDictionary
  22
  23
  24 __all__ = ['Lexer', 'Failure', 'keywords']
  25
  26
  27 # cache for the lexers. Exists in order to be able to have multiple
  28 # environments with the same lexer
  29 _lexer_cache = WeakValueDictionary()
  30
  31
  32 # static regular expressions
  33 whitespace_re = re.compile(r'\s+(?um)')
  34 string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
  35                        r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
  36 integer_re = re.compile(r'\d+')
  37 name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')
  38 float_re = re.compile(r'\d+\.\d+')
  39
  40
  41 # set of used keywords
  42 keywords = set(['and', 'block', 'elif', 'else', 'endblock',
  43                 'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
  44                 'endtrans', 'extends', 'filter', 'for', 'if', 'in',
  45                 'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
  46                 'recursive', 'set', 'trans', 'call', 'endcall',
  47                 'true', 'false', 'none'])
  48
  49 # bind operators to token types
  50 operators = {
  51     '+':            'add',
  52     '-':            'sub',
  53     '/':            'div',
  54     '//':           'floordiv',
  55     '*':            'mul',
  56     '%':            'mod',
  57     '**':           'pow',
  58     '~':            'tilde',
  59     '[':            'lbracket',
  60     ']':            'rbracket',
  61     '(':            'lparen',
  62     ')':            'rparen',
  63     '{':            'lbrace',
  64     '}':            'rbrace',
  65     '==':           'eq',
  66     '!=':           'ne',
  67     '>':            'gt',
  68     '>=':           'gteq',
  69     '<':            'lt',
  70     '<=':           'lteq',
  71     '=':            'assign',
  72     '.':            'dot',
  73     ':':            'colon',
  74     '|':            'pipe',
  75     ',':            'comma',
  76     ';':            'semicolon'
  77 }
  78
  79 reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
  80 assert len(operators) == len(reverse_operators), 'operators dropped'
  81 operator_re = re.compile('(%s)' % '|'.join([re.escape(x) for x in
  82                          sorted(operators, key=lambda x: -len(x))]))
  83
  84 simple_escapes = {
  85     'a':    '\a',
  86     'n':    '\n',
  87     'r':    '\r',
  88     'f':    '\f',
  89     't':    '\t',
  90     'v':    '\v',
  91     '\\':   '\\',
  92     '"':    '"',
  93     "'":    "'",
  94     '0':    '\x00'
  95 }
  96 unicode_escapes = {
  97     'x':    2,
  98     'u':    4,
  99     'U':    8
 100 }
 101
 102
 103 def unescape_string(lineno, filename, s):
 104     r"""
 105     Unescape a string. Supported escapes:
 106         \a, \n, \r\, \f, \v, \\, \", \', \0
 107
 108         \x00, \u0000, \U00000000, \N{...}
 109
 110     Not supported are \101 because imho redundant.
 111     """
 112     result = []
 113     write = result.append
 114     chariter = iter(s)
 115     next_char = chariter.next
 116
 117     # faster lookup
 118     sescapes = simple_escapes
 119     uescapes = unicode_escapes
 120
 121     try:
 122         for char in chariter:
 123             if char == '\\':
 124                 char = next_char()
 125                 if char in sescapes:
 126                     write(sescapes[char])
 127                 elif char in uescapes:
 128                     seq = [next_char() for x in xrange(uescapes[char])]
 129                     try:
 130                         write(unichr(int(''.join(seq), 16)))
 131                     except ValueError:
 132                         raise TemplateSyntaxError('invalid unicode codepoint',
 133                                                   lineno, filename)
 134                 elif char == 'N':
 135                     if next_char() != '{':
 136                         raise TemplateSyntaxError('no name for codepoint',
 137                                                   lineno, filename)
 138                     seq = []
 139                     while 1:
 140                         char = next_char()
 141                         if char == '}':
 142                             break
 143                         seq.append(char)
 144                     try:
 145                         write(unicodedata.lookup(u''.join(seq)))
 146                     except KeyError:
 147                         raise TemplateSyntaxError('unknown character name',
 148                                                   lineno, filename)
 149                 else:
 150                     write('\\' + char)
 151             else:
 152                 write(char)
 153     except StopIteration:
 154         raise TemplateSyntaxError('invalid string escape', lineno, filename)
 155     return u''.join(result)
 156
 157
 158 def unescape_regex(s):
 159     """
 160     Unescape rules for regular expressions.
 161     """
 162     buffer = []
 163     write = buffer.append
 164     in_escape = False
 165     for char in s:
 166         if in_escape:
 167             in_escape = False
 168             if char not in safe_chars:
 169                 write('\\' + char)
 170                 continue
 171         write(char)
 172     return u''.join(buffer)
 173
 174
 175 class Failure(object):
 176     """
 177     Class that raises a `TemplateSyntaxError` if called.
 178     Used by the `Lexer` to specify known errors.
 179     """
 180
 181     def __init__(self, message, cls=TemplateSyntaxError):
 182         self.message = message
 183         self.error_class = cls
 184
 185     def __call__(self, lineno, filename):
 186         raise self.error_class(self.message, lineno, filename)
 187
 188
 189 class LexerMeta(type):
 190     """
 191     Metaclass for the lexer that caches instances for
 192     the same configuration in a weak value dictionary.
 193     """
 194
 195     def __call__(cls, environment):
 196         key = hash((environment.block_start_string,
 197                     environment.block_end_string,
 198                     environment.variable_start_string,
 199                     environment.variable_end_string,
 200                     environment.comment_start_string,
 201                     environment.comment_end_string,
 202                     environment.trim_blocks))
 203
 204         # use the cached lexer if possible
 205         if key in _lexer_cache:
 206             return _lexer_cache[key]
 207
 208         # create a new lexer and cache it
 209         lexer = type.__call__(cls, environment)
 210         _lexer_cache[key] = lexer
 211         return lexer
 212
 213
 214 class Lexer(object):
 215     """
 216     Class that implements a lexer for a given environment. Automatically
 217     created by the environment class, usually you don't have to do that.
 218
 219     Note that the lexer is not automatically bound to an environment.
 220     Multiple environments can share the same lexer.
 221     """
 222
 223     __metaclass__ = LexerMeta
 224
 225     def __init__(self, environment):
 226         # shortcuts
 227         c = lambda x: re.compile(x, re.M | re.S)
 228         e = re.escape
 229
 230         # lexing rules for tags
 231         tag_rules = [
 232             (whitespace_re, None, None),
 233             (float_re, 'float', None),
 234             (integer_re, 'integer', None),
 235             ('%s' % '|'.join(sorted(keywords, key=lambda x: -len(x))),
 236              'keyword', None),
 237             (name_re, 'name', None),
 238             (string_re, 'string', None),
 239             (operator_re, 'operator', None)
 240         ]
 241
 242         #: if variables and blocks have the same delimiters we won't
 243         #: receive any variable blocks in the parser. This variable is `True`
 244         #: if we need that.
 245         self.no_variable_block = (
 246             (environment.variable_start_string is
 247              environment.variable_end_string is None) or
 248             (environment.variable_start_string ==
 249              environment.block_start_string and
 250              environment.variable_end_string ==
 251              environment.block_end_string)
 252         )
 253
 254         # assamble the root lexing rule. because "|" is ungreedy
 255         # we have to sort by length so that the lexer continues working
 256         # as expected when we have parsing rules like <% for block and
 257         # <%= for variables. (if someone wants asp like syntax)
 258         # variables are just part of the rules if variable processing
 259         # is required.
 260         root_tag_rules = [
 261             ('comment',     environment.comment_start_string),
 262             ('block',       environment.block_start_string)
 263         ]
 264         if not self.no_variable_block:
 265             root_tag_rules.append(('variable',
 266                                    environment.variable_start_string))
 267         root_tag_rules.sort(lambda a, b: cmp(len(b[1]), len(a[1])))
 268
 269         # block suffix if trimming is enabled
 270         block_suffix_re = environment.trim_blocks and '\\n?' or ''
 271
 272         # global lexing rules
 273         self.rules = {
 274             'root': [
 275                 # directives
 276                 (c('(.*?)(?:%s)' % '|'.join(
 277                     ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
 278                         e(environment.block_start_string),
 279                         e(environment.block_start_string),
 280                         e(environment.block_end_string)
 281                     )] + [
 282                         '(?P<%s_begin>\s*%s\-|%s)' % (n, e(r), e(r))
 283                         for n, r in root_tag_rules
 284                     ])), ('data', '#bygroup'), '#bygroup'),
 285                 # data
 286                 (c('.+'), 'data', None)
 287             ],
 288             # comments
 289             'comment_begin': [
 290                 (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
 291                     e(environment.comment_end_string),
 292                     e(environment.comment_end_string),
 293                     block_suffix_re
 294                 )), ('comment', 'comment_end'), '#pop'),
 295                 (c('(.)'), (Failure('Missing end of comment tag'),), None)
 296             ],
 297             # blocks
 298             'block_begin': [
 299                 (c('(?:\-%s\s*|%s)%s' % (
 300                     e(environment.block_end_string),
 301                     e(environment.block_end_string),
 302                     block_suffix_re
 303                 )), 'block_end', '#pop'),
 304             ] + tag_rules,
 305             # raw block
 306             'raw_begin': [
 307                 (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
 308                     e(environment.block_start_string),
 309                     e(environment.block_start_string),
 310                     e(environment.block_end_string),
 311                     e(environment.block_end_string),
 312                     block_suffix_re
 313                 )), ('data', 'raw_end'), '#pop'),
 314                 (c('(.)'), (Failure('Missing end of raw directive'),), None)
 315             ]
 316         }
 317
 318         # only add the variable rules to the list if we process variables
 319         # the variable_end_string variable could be None and break things.
 320         if not self.no_variable_block:
 321             self.rules['variable_begin'] = [
 322                 (c('\-%s\s*|%s' % (
 323                     e(environment.variable_end_string),
 324                     e(environment.variable_end_string)
 325                 )), 'variable_end', '#pop')
 326             ] + tag_rules
 327
 328     def tokenize(self, source, filename=None):
 329         """
 330         Works like `tokeniter` but returns a tokenstream of tokens and not a
 331         generator or token tuples. Additionally all token values are already
 332         converted into types and postprocessed. For example keywords are
 333         already keyword tokens, not named tokens, comments are removed,
 334         integers and floats converted, strings unescaped etc.
 335         """
 336         def generate():
 337             for lineno, token, value in self.tokeniter(source, filename):
 338                 if token in ('comment_begin', 'comment', 'comment_end'):
 339                     continue
 340                 elif token == 'data':
 341                     try:
 342                         value = str(value)
 343                     except UnicodeError:
 344                         pass
 345                 elif token == 'keyword':
 346                     token = str(value)
 347                 elif token == 'name':
 348                     value = str(value)
 349                 elif token == 'string':
 350                     value = unescape_string(lineno, filename, value[1:-1])
 351                     try:
 352                         value = str(value)
 353                     except UnicodeError:
 354                         pass
 355                 elif token == 'integer':
 356                     value = int(value)
 357                 elif token == 'float':
 358                     value = float(value)
 359                 elif token == 'operator':
 360                     token = operators[value]
 361                     value = ''
 362                 yield Token(lineno, token, value)
 363         return TokenStream(generate(), filename)
 364
 365     def tokeniter(self, source, filename=None):
 366         """
 367         This method tokenizes the text and returns the tokens in a generator.
 368         Use this method if you just want to tokenize a template. The output
 369         you get is not compatible with the input the jinja parser wants. The
 370         parser uses the `tokenize` function with returns a `TokenStream` and
 371         keywords instead of just names.
 372         """
 373         source = '\n'.join(source.splitlines())
 374         pos = 0
 375         lineno = 1
 376         stack = ['root']
 377         statetokens = self.rules['root']
 378         source_length = len(source)
 379
 380         balancing_stack = []
 381
 382         while True:
 383             # tokenizer loop
 384             for regex, tokens, new_state in statetokens:
 385                 m = regex.match(source, pos)
 386                 # if no match we try again with the next rule
 387                 if not m:
 388                     continue
 389
 390                 # we only match blocks and variables if brances / parentheses
 391                 # are balanced. continue parsing with the lower rule which
 392                 # is the operator rule. do this only if the end tags look
 393                 # like operators
 394                 if balancing_stack and \
 395                    tokens in ('variable_end', 'block_end'):
 396                     continue
 397
 398                 # tuples support more options
 399                 if isinstance(tokens, tuple):
 400                     for idx, token in enumerate(tokens):
 401                         # hidden group
 402                         if token is None:
 403                             g = m.group(idx)
 404                             if g:
 405                                 lineno += g.count('\n')
 406                             continue
 407                         # failure group
 408                         elif token.__class__ is Failure:
 409                             raise token(lineno, filename)
 410                         # bygroup is a bit more complex, in that case we
 411                         # yield for the current token the first named
 412                         # group that matched
 413                         elif token == '#bygroup':
 414                             for key, value in m.groupdict().iteritems():
 415                                 if value is not None:
 416                                     yield lineno, key, value
 417                                     lineno += value.count('\n')
 418                                     break
 419                             else:
 420                                 raise RuntimeError('%r wanted to resolve '
 421                                                    'the token dynamically'
 422                                                    ' but no group matched'
 423                                                    % regex)
 424                         # normal group
 425                         else:
 426                             data = m.group(idx + 1)
 427                             if data:
 428                                 yield lineno, token, data
 429                             lineno += data.count('\n')
 430
 431                 # strings as token just are yielded as it, but just
 432                 # if the data is not empty
 433                 else:
 434                     data = m.group()
 435                     # update brace/parentheses balance
 436                     if tokens == 'operator':
 437                         if data == '{':
 438                             balancing_stack.append('}')
 439                         elif data == '(':
 440                             balancing_stack.append(')')
 441                         elif data == '[':
 442                             balancing_stack.append(']')
 443                         elif data in ('}', ')', ']'):
 444                             if not balancing_stack:
 445                                 raise TemplateSyntaxError('unexpected "%s"' %
 446                                                           data, lineno,
 447                                                           filename)
 448                             expected_op = balancing_stack.pop()
 449                             if expected_op != data:
 450                                 raise TemplateSyntaxError('unexpected "%s", '
 451                                                           'expected "%s"' %
 452                                                           (data, expected_op),
 453                                                           lineno, filename)
 454                     # yield items
 455                     if tokens is not None:
 456                         if data:
 457                             yield lineno, tokens, data
 458                     lineno += data.count('\n')
 459
 460                 # fetch new position into new variable so that we can check
 461                 # if there is a internal parsing error which would result
 462                 # in an infinite loop
 463                 pos2 = m.end()
 464
 465                 # handle state changes
 466                 if new_state is not None:
 467                     # remove the uppermost state
 468                     if new_state == '#pop':
 469                         stack.pop()
 470                     # resolve the new state by group checking
 471                     elif new_state == '#bygroup':
 472                         for key, value in m.groupdict().iteritems():
 473                             if value is not None:
 474                                 stack.append(key)
 475                                 break
 476                         else:
 477                             raise RuntimeError('%r wanted to resolve the '
 478                                                'new state dynamically but'
 479                                                ' no group matched' %
 480                                                regex)
 481                     # direct state name given
 482                     else:
 483                         stack.append(new_state)
 484                     statetokens = self.rules[stack[-1]]
 485                 # we are still at the same position and no stack change.
 486                 # this means a loop without break condition, avoid that and
 487                 # raise error
 488                 elif pos2 == pos:
 489                     raise RuntimeError('%r yielded empty string without '
 490                                        'stack change' % regex)
 491                 # publish new function and start again
 492                 pos = pos2
 493                 break
 494             # if loop terminated without break we havn't found a single match
 495             # either we are at the end of the file or we have a problem
 496             else:
 497                 # end of text
 498                 if pos >= source_length:
 499                     return
 500                 # something went wrong
 501                 raise TemplateSyntaxError('unexpected char %r at %d' %
 502                                           (source[pos], pos), lineno,
 503                                           filename)