# -*- coding: utf-8 -*-
"""
    jinja2.lexer
    ~~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for Jinja.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re
import unicodedata
from jinja2.datastructure import TokenStream, Token
from jinja2.exceptions import TemplateSyntaxError
from weakref import WeakValueDictionary


# cache for the lexers. Exists so that multiple environments
# can share the same lexer
_lexer_cache = WeakValueDictionary()


# static regular expressions
whitespace_re = re.compile(r'\s+(?um)')
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
name_re = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
float_re = re.compile(r'\d+\.\d+')


# set of used keywords
keywords = set(['and', 'block', 'elif', 'else', 'endblock', 'print',
                'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
                'endtrans', 'extends', 'filter', 'for', 'if', 'in',
                'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
                'recursive', 'set', 'trans', 'call', 'endcall'])

# bind operators to token types
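# NOTE: the original operator table was elided in this excerpt; the mapping
# below is a reconstructed sketch. The token type names are assumptions
# derived from the operators Jinja expressions accept; only the shape of
# the table (operator literal -> token type) is certain from the code that
# follows.
operators = {
    '+':            'add',
    '-':            'sub',
    '/':            'div',
    '//':           'floordiv',
    '*':            'mul',
    '%':            'mod',
    '**':           'pow',
    '~':            'tilde',
    '[':            'lbracket',
    ']':            'rbracket',
    '(':            'lparen',
    ')':            'rparen',
    '{':            'lbrace',
    '}':            'rbrace',
    '==':           'eq',
    '!=':           'ne',
    '>':            'gt',
    '>=':           'gteq',
    '<':            'lt',
    '<=':           'lteq',
    '=':            'assign',
    '.':            'dot',
    ':':            'colon',
    '|':            'pipe',
    ',':            'comma',
    ';':            'semicolon'
}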
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))))
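

# NOTE: the escape tables aliased as `sescapes` / `uescapes` in
# `unescape_string` below were elided in this excerpt. These definitions
# are a sketch reconstructed from the escapes the docstring promises:
# simple escapes map the escape character to its value, unicode escapes
# map it to the number of hex digits that follow it.
simple_escapes = {
    'a': '\a', 'n': '\n', 'r': '\r', 'f': '\f',
    'v': '\v', '\\': '\\', '"': '"', "'": "'", '0': '\x00'
}
unicode_escapes = {
    'x': 2, 'u': 4, 'U': 8
}
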

def unescape_string(lineno, filename, s):
    r"""
    Unescape a string. Supported escapes:
        \a, \n, \r, \f, \v, \\, \", \', \0

        \x00, \u0000, \U00000000, \N{...}

    Octal escapes such as \101 are not supported because they are redundant.
    """
    result = []
    write = result.append
    chariter = iter(s)
    next_char = chariter.next
    sescapes = simple_escapes
    uescapes = unicode_escapes
    try:
        for char in chariter:
            if char != '\\':
                write(char)
                continue
            char = next_char()
            if char in sescapes:
                write(sescapes[char])
            elif char in uescapes:
                seq = [next_char() for x in xrange(uescapes[char])]
                try:
                    write(unichr(int(''.join(seq), 16)))
                except ValueError:
                    raise TemplateSyntaxError('invalid unicode codepoint',
                                              lineno, filename)
            elif char == 'N':
                if next_char() != '{':
                    raise TemplateSyntaxError('no name for codepoint',
                                              lineno, filename)
                seq = []
                char = next_char()
                while char != '}':
                    seq.append(char)
                    char = next_char()
                try:
                    write(unicodedata.lookup(u''.join(seq)))
                except KeyError:
                    raise TemplateSyntaxError('unknown character name',
                                              lineno, filename)
            else:
                # unknown escapes are passed through unchanged
                write('\\' + char)
    except StopIteration:
        raise TemplateSyntaxError('invalid string escape', lineno, filename)
    return u''.join(result)
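

# A quick illustration of `unescape_string` (hedged: it relies on the
# escape tables sketched above; the caller strips the surrounding quotes
# before passing the value in):
#
#   >>> unescape_string(1, None, u'foo\\nbar')
#   u'foo\nbar'
#   >>> unescape_string(1, None, u'\\x41\\u00e9')
#   u'A\xe9'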


def unescape_regex(s):
    """
    Unescape rules for regular expressions.
    """
    # NOTE: most of this function's body was elided in this excerpt; the
    # loop below is a reconstructed sketch that follows the surviving
    # skeleton. `safe_chars` is assumed to be a module level set of
    # characters that need no escaping when embedded in a pattern.
    buffer = []
    write = buffer.append
    for char in s:
        if char not in safe_chars:
            write('\\')
        write(char)
    return u''.join(buffer)


class Failure(object):
    """
    Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)


class LexerMeta(type):
    """
    Metaclass for the lexer that caches instances for
    the same configuration in a weak value dictionary.
    """

    def __call__(cls, environment):
        key = hash((environment.block_start_string,
                    environment.block_end_string,
                    environment.variable_start_string,
                    environment.variable_end_string,
                    environment.comment_start_string,
                    environment.comment_end_string,
                    environment.line_statement_prefix,
                    environment.trim_blocks))

        # use the cached lexer if possible
        if key in _lexer_cache:
            return _lexer_cache[key]

        # create a new lexer and cache it
        lexer = type.__call__(cls, environment)
        _lexer_cache[key] = lexer
        return lexer
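

# For illustration (a hedged sketch, not part of the original module):
# because the metaclass keys the cache on the delimiter configuration,
# two environments with identical settings share the *same* lexer object:
#
#   >>> from jinja2 import Environment   # assumed import location
#   >>> Lexer(Environment()) is Lexer(Environment())
#   True
#
# The WeakValueDictionary drops the entry again once no environment
# keeps the lexer alive.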


class Lexer(object):
    """
    Class that implements a lexer for a given environment. Automatically
    created by the environment class, so usually you don't have to create
    one yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta

    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords, key=lambda x: -len(x)))),
             'keyword', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        # longest tags first so that e.g. "<%=" wins over "<%"
        root_tag_rules.sort(key=lambda x: -len(x[1]))

        # now escape the rules. This is done here so that the escape
        # signs don't count for the lengths of the tags.
        root_tag_rules = [(a, e(b)) for a, b in root_tag_rules]

        # if we have a line statement prefix we need an extra rule for
        # that. The rule is inserted at the front so it's tried before
        # all the others.
        if environment.line_statement_prefix is not None:
            prefix = e(environment.line_statement_prefix)
            root_tag_rules.insert(0, ('linestatement', '^\s*' + prefix))

        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''

        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                        '(?P<%s_begin>\s*%s\-|%s)' % (n, r, r)
                        for n, r in root_tag_rules
                    ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # variables
            'variable_begin': [
                (c('\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ],
            # line statements
            'linestatement_begin': [
                (c(r'\s*(\n|$)'), 'linestatement_end', '#pop')
            ] + tag_rules
        }
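
    # How the state machine above plays out (a hedged sketch, using the
    # default "{# ... #}", "{% ... %}" and "{{ ... }}" delimiters): lexing
    #
    #     Hello {# greet #}{{ name }}!
    #
    # starts in 'root', where the combined regex yields "Hello " as data
    # and pushes 'comment_begin' via #bygroup; the comment rules consume
    # "greet" and pop back to 'root'; "{{" then pushes 'variable_begin',
    # whose tag_rules lex "name" as a name token before "}}" pops again.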

    def tokenize(self, source, filename=None):
        """Works like `tokeniter` but returns a tokenstream of tokens and not
        a generator of token tuples. Additionally all token values are already
        converted into types and postprocessed. For example keywords are
        already keyword tokens, not name tokens, comments are removed,
        integers and floats converted, strings unescaped etc.
        """
        source = unicode(source)
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'linestatement_begin':
                    token = 'block_begin'
                elif token == 'linestatement_end':
                    token = 'block_end'
                elif token == 'data':
                    # convert data to a native string if possible
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    token = operators[value]
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)
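
    # For illustration (a hedged sketch; `Environment` as in the import shown
    # earlier, operator token names as in the reconstructed table above):
    #
    #   >>> stream = Lexer(Environment()).tokenize(u'{{ 1 + 2 }}')
    #
    # yields a `TokenStream` whose tokens already carry converted values:
    # an 'integer' token with value 1, an operator token for '+', and
    # another 'integer' token with value 2, between the variable_begin
    # and variable_end delimiters.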

    def tokeniter(self, source, filename=None):
        """
        This method tokenizes the text and returns the tokens in a generator.
        Use this method if you just want to tokenize a template. The output
        you get is not compatible with the input the jinja parser wants. The
        parser uses the `tokenize` method, which returns a `TokenStream` and
        keyword tokens instead of plain name tokens.
        """
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        balancing_stack = []

        while 1:
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end',
                              'linestatement_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # a `None` token drops the group but keeps
                        # the line count accurate
                        if token is None:
                            g = m.group(idx + 1)
                            if g is not None:
                                lineno += g.count('\n')
                        # failure group
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # strings as token types are just yielded as-is
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)
                    # yield the token if it has a type
                    if tokens is not None:
                        yield lineno, tokens, data
                    lineno += data.count('\n')

                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise an error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start over
                pos = pos2
                break
            # if the for loop terminated without break we haven't found a
            # single match: either we are at the end of the file or we
            # have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)
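
    # A usage illustration for `tokeniter` (a hedged sketch; assumes the
    # default "{{ ... }}" delimiters):
    #
    #   >>> lexer = Lexer(Environment())
    #   >>> list(lexer.tokeniter(u'Hello {{ name }}!'))
    #
    # produces plain (lineno, token, value) tuples roughly like
    # (1, 'data', u'Hello '), (1, 'variable_begin', u'{{'),
    # (1, 'name', u'name'), (1, 'variable_end', u'}}'), (1, 'data', u'!'),
    # with the whitespace inside the tag silently dropped by `whitespace_re`.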