# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and Python code in expressions.

    :copyright: (c) 2009 by the Jinja Team.
    :license: BSD, see LICENSE for more details.
"""
import re

from operator import itemgetter
from collections import deque
from jinja2.exceptions import TemplateSyntaxError
from jinja2.utils import LRUCache


# cache for the lexers; exists so that multiple environments with the
# same configuration can share the same lexer
_lexer_cache = LRUCache(50)

# static regular expressions
whitespace_re = re.compile(r'\s+', re.U)
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S)
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'(?<!\.)\d+\.\d+')
newline_re = re.compile(r'(\r\n|\r|\n)')
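
# Illustrative doctest-style sketch (not from the original source): the
# negative lookbehind in `float_re` refuses a float literal that directly
# follows a dot, presumably so that nested integer attribute access such
# as `foo.0.0` keeps lexing as attributes rather than as a float:
#
#     >>> float_re.match(u'42.23').group()
#     u'42.23'
#     >>> float_re.search(u'foo.0.0') is None
#     True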

# intern the tokens and keep references to them
TOKEN_ADD = intern('add')
TOKEN_ASSIGN = intern('assign')
TOKEN_COLON = intern('colon')
TOKEN_COMMA = intern('comma')
TOKEN_DIV = intern('div')
TOKEN_DOT = intern('dot')
TOKEN_EQ = intern('eq')
TOKEN_FLOORDIV = intern('floordiv')
TOKEN_GT = intern('gt')
TOKEN_GTEQ = intern('gteq')
TOKEN_LBRACE = intern('lbrace')
TOKEN_LBRACKET = intern('lbracket')
TOKEN_LPAREN = intern('lparen')
TOKEN_LT = intern('lt')
TOKEN_LTEQ = intern('lteq')
TOKEN_MOD = intern('mod')
TOKEN_MUL = intern('mul')
TOKEN_NE = intern('ne')
TOKEN_PIPE = intern('pipe')
TOKEN_POW = intern('pow')
TOKEN_RBRACE = intern('rbrace')
TOKEN_RBRACKET = intern('rbracket')
TOKEN_RPAREN = intern('rparen')
TOKEN_SEMICOLON = intern('semicolon')
TOKEN_SUB = intern('sub')
TOKEN_TILDE = intern('tilde')
TOKEN_WHITESPACE = intern('whitespace')
TOKEN_FLOAT = intern('float')
TOKEN_INTEGER = intern('integer')
TOKEN_NAME = intern('name')
TOKEN_STRING = intern('string')
TOKEN_OPERATOR = intern('operator')
TOKEN_BLOCK_BEGIN = intern('block_begin')
TOKEN_BLOCK_END = intern('block_end')
TOKEN_VARIABLE_BEGIN = intern('variable_begin')
TOKEN_VARIABLE_END = intern('variable_end')
TOKEN_RAW_BEGIN = intern('raw_begin')
TOKEN_RAW_END = intern('raw_end')
TOKEN_COMMENT_BEGIN = intern('comment_begin')
TOKEN_COMMENT_END = intern('comment_end')
TOKEN_COMMENT = intern('comment')
TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin')
TOKEN_LINESTATEMENT_END = intern('linestatement_end')
TOKEN_DATA = intern('data')
TOKEN_INITIAL = intern('initial')
TOKEN_EOF = intern('eof')

# bind operators to token types
operators = {
    '+':            TOKEN_ADD,
    '-':            TOKEN_SUB,
    '/':            TOKEN_DIV,
    '//':           TOKEN_FLOORDIV,
    '*':            TOKEN_MUL,
    '%':            TOKEN_MOD,
    '**':           TOKEN_POW,
    '~':            TOKEN_TILDE,
    '[':            TOKEN_LBRACKET,
    ']':            TOKEN_RBRACKET,
    '(':            TOKEN_LPAREN,
    ')':            TOKEN_RPAREN,
    '{':            TOKEN_LBRACE,
    '}':            TOKEN_RBRACE,
    '==':           TOKEN_EQ,
    '!=':           TOKEN_NE,
    '>':            TOKEN_GT,
    '>=':           TOKEN_GTEQ,
    '<':            TOKEN_LT,
    '<=':           TOKEN_LTEQ,
    '=':            TOKEN_ASSIGN,
    '.':            TOKEN_DOT,
    ':':            TOKEN_COLON,
    '|':            TOKEN_PIPE,
    ',':            TOKEN_COMMA,
    ';':            TOKEN_SEMICOLON
}
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
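
# A small illustrative check (doctest-style sketch, not part of the
# original module): because the alternatives are sorted longest-first
# before being joined with '|', multi-character operators win over their
# one-character prefixes:
#
#     >>> operator_re.match(u'**').group()
#     u'**'
#     >>> operator_re.match(u'//').group()
#     u'//'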


def count_newlines(value):
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))
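

# Doctest-style sketch of `count_newlines` (illustrative only): every
# newline flavour matched by `newline_re` is counted once:
#
#     >>> count_newlines(u'foo\nbar\r\nbaz\r')
#     3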


class Failure(object):
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
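

# Illustrative sketch of how a `Failure` is used (hypothetical values):
# inside a rule table a `Failure` instance takes the place of a token
# type, and calling it raises the configured error at the given position:
#
#     fail = Failure('Missing end of comment tag')
#     fail(42, 'layout.html')   # raises TemplateSyntaxError at line 42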


class Token(tuple):
    """Token class."""
    __slots__ = ()
    lineno, type, value = (property(itemgetter(x)) for x in range(3))

    def __new__(cls, lineno, type, value):
        return tuple.__new__(cls, (lineno, intern(str(type)), value))

    def __str__(self):
        if self.type in reverse_operators:
            return reverse_operators[self.type]
        elif self.type == 'name':
            return self.value
        return self.type

    def test(self, expr):
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of non-interned strings.
        if self.type == expr:
            return True
        elif ':' in expr:
            return expr.split(':', 1) == [self.type, self.value]
        return False

    def test_any(self, *iterable):
        """Test against multiple token expressions."""
        for expr in iterable:
            if self.test(expr):
                return True
        return False

    def __repr__(self):
        return 'Token(%r, %r, %r)' % (
            self.lineno,
            self.type,
            self.value
        )
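

# Illustrative sketch of the `Token` API (the token shown is an example,
# not part of the original module):
#
#     >>> tok = Token(1, 'name', 'foo')
#     >>> tok.test('name'), tok.test('name:foo'), tok.test('name:bar')
#     (True, True, False)
#     >>> tok.test_any('integer', 'name:foo')
#     True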


class TokenStreamIterator(object):
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream):
        self.stream = stream

    def __iter__(self):
        return self

    def next(self):
        token = self.stream.current
        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration()
        self.stream.next()
        return token


class TokenStream(object):
    """A token stream is an iterable that yields :class:`Token`\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(self, generator, name, filename):
        self._next = iter(generator).next
        self._pushed = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, '')
        self.next()

    def __iter__(self):
        return TokenStreamIterator(self)

    def __nonzero__(self):
        """Are we at the end of the stream?"""
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    eos = property(lambda x: not x.__nonzero__(), doc=__nonzero__.__doc__)

    def push(self, token):
        """Push a token back to the stream."""
        self._pushed.append(token)
233 """Look at the next token."""
234 old_token = self.next()
235 result = self.current
237 self.current = old_token
241 """Got n tokens ahead."""

    def next_if(self, expr):
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return self.next()

    def skip_if(self, expr):
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None
257 """Go one token ahead and return the old one"""
260 self.current = self._pushed.popleft()
261 elif self.current.type is not TOKEN_EOF:
263 self.current = self._next()
264 except StopIteration:
269 """Close the stream."""
270 self.current = Token(self.current.lineno, TOKEN_EOF, '')

    def expect(self, expr):
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            if ':' in expr:
                expr = expr.split(':')[1]
            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError('unexpected end of template, '
                                          'expected %r.' % expr,
                                          self.current.lineno,
                                          self.name, self.filename)
            raise TemplateSyntaxError("expected token %r, got %r" %
                                      (expr, str(self.current)),
                                      self.current.lineno,
                                      self.name, self.filename)
        try:
            return self.current
        finally:
            self.next()
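

# Illustrative sketch of driving a stream by hand; `env` is an assumed
# `jinja2.Environment()` with the default delimiters (not part of the
# original module):
#
#     >>> stream = env.lexer.tokenize(u'{{ foo }}')
#     >>> stream.expect('variable_begin').type
#     'variable_begin'
#     >>> stream.current.test('name:foo')
#     True
#     >>> stream.skip_if('name')
#     True
#     >>> stream.current.type
#     'variable_end'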


def get_lexer(environment):
    """Return a lexer which is probably cached."""
    key = (environment.block_start_string,
           environment.block_end_string,
           environment.variable_start_string,
           environment.variable_end_string,
           environment.comment_start_string,
           environment.comment_end_string,
           environment.line_statement_prefix,
           environment.trim_blocks,
           environment.newline_sequence)
    lexer = _lexer_cache.get(key)
    if lexer is None:
        lexer = Lexer(environment)
        _lexer_cache[key] = lexer
    return lexer
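

# Because the cache key includes only the settings the lexer depends on,
# environments that differ in unrelated options share a lexer object.
# Illustrative sketch (assumes a stock jinja2.Environment):
#
#     >>> from jinja2 import Environment
#     >>> get_lexer(Environment()) is get_lexer(Environment(autoescape=True))
#     True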
315 """Class that implements a lexer for a given environment. Automatically
316 created by the environment class, usually you don't have to do that.
318 Note that the lexer is not automatically bound to an environment.
319 Multiple environments can share the same lexer.
322 def __init__(self, environment):
324 c = lambda x: re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, TOKEN_WHITESPACE, None),
            (float_re, TOKEN_FLOAT, None),
            (integer_re, TOKEN_INTEGER, None),
            (name_re, TOKEN_NAME, None),
            (string_re, TOKEN_STRING, None),
            (operator_re, TOKEN_OPERATOR, None)
        ]

        # assemble the root lexing rule. because "|" takes the first
        # matching alternative we have to sort by length so that the lexer
        # continues working as expected when we have parsing rules like
        # <% for blocks and <%= for variables. (if someone wants asp-like
        # syntax) variables are just part of the rules if variable
        # processing is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules. This is done here so that the escape
        # signs don't count towards the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that. We add this rule after the others have been sorted, so
        # that it always ends up first.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', r'^\s*' + prefix))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        self.newline_sequence = environment.newline_sequence

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), (TOKEN_DATA, '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), TOKEN_BLOCK_END, '#pop'),
            ] + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), TOKEN_VARIABLE_END, '#pop')
            ] + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop')
            ] + tag_rules
        }

    def _normalize_newlines(self, value):
        """Called for strings and template data to normalize it to unicode."""
        return newline_re.sub(self.newline_sequence, value)
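
    # Illustrative example: with the default newline_sequence of '\n',
    # self._normalize_newlines(u'a\r\nb\rc') returns u'a\nb\nc'.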

    def tokenize(self, source, name=None, filename=None, state=None):
        """Calls tokeniter + wrap and wraps it in a token stream.
        """
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)

    def wrap(self, stream, name=None, filename=None):
        """This is called with the stream as returned by `tokeniter` and
        wraps every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value in stream:
            if token in ('comment_begin', 'comment', 'comment_end',
                         'whitespace'):
                continue
            elif token == 'linestatement_begin':
                token = 'block_begin'
            elif token == 'linestatement_end':
                token = 'block_end'
            # we are not interested in those tokens in the parser
            elif token in ('raw_begin', 'raw_end'):
                continue
            elif token == 'data':
                value = self._normalize_newlines(value)
            elif token == 'keyword':
                token = value
            elif token == 'name':
                value = str(value)
            elif token == 'string':
                # try to unescape the string
                try:
                    value = self._normalize_newlines(value[1:-1]) \
                        .encode('ascii', 'backslashreplace') \
                        .decode('unicode-escape')
                except Exception, e:
                    msg = str(e).split(':')[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename)
                # if we can express it as a bytestring (ascii only)
                # we do that for support of semi-broken APIs
                # such as datetime.datetime.strftime
                try:
                    value = str(value)
                except UnicodeError:
                    pass
            elif token == 'integer':
                value = int(value)
            elif token == 'float':
                value = float(value)
            elif token == 'operator':
                token = operators[value]
            yield Token(lineno, token, value)

    def tokeniter(self, source, name, filename=None, state=None):
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        """
        source = '\n'.join(unicode(source).splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        if state is not None and state != 'root':
            assert state in ('variable', 'block'), 'invalid state'
            stack.append(state + '_begin')
        else:
            state = 'root'
        statetokens = self.rules[stack[-1]]
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenmaps are lists of (regex, tokens, new_state) tuples
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # plain string tokens are yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno, name,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, name,
                                                          filename)

                    # yield the token with its data
                    yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key + '_begin')
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise an error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # advance to the new position and start over
                pos = pos2
                break
            # if the loop terminated without break we haven't found a single
            # match; either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          name, filename)
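
# Illustrative end-to-end sketch (assumes the default delimiters; the
# exact values shown are indicative only, not an API guarantee):
# `tokeniter` yields raw (lineno, token, value) tuples including
# whitespace, while `tokenize` wraps and filters them into Tokens:
#
#     >>> from jinja2 import Environment
#     >>> lexer = get_lexer(Environment())
#     >>> for tok in lexer.tokenize(u'a {{ b }}'):
#     ...     print repr(tok)
#     Token(1, 'data', u'a ')
#     Token(1, 'variable_begin', u'{{')
#     Token(1, 'name', 'b')
#     Token(1, 'variable_end', u'}}')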