# -*- coding: utf-8 -*-
"""
    jinja.lexer
    ~~~~~~~~~~~

    This module implements a Jinja / Python combination lexer. The
    `Lexer` class provided by this module is used to do some preprocessing
    for the parser.

    On the one hand it filters out invalid operators like the bitshift
    operators we don't allow in templates. On the other hand it separates
    template code and Python code in expressions.

    :copyright: 2007-2008 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re
import unicodedata
from jinja.datastructure import TokenStream, Token
from jinja.exceptions import TemplateSyntaxError
from weakref import WeakValueDictionary
__all__ = ['Lexer', 'Failure', 'keywords']
# cache for the lexers so that multiple environments with the same
# configuration can share one lexer
_lexer_cache = WeakValueDictionary()
# static regular expressions
whitespace_re = re.compile(r'\s+(?um)')
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
integer_re = re.compile(r'\d+')
name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')
float_re = re.compile(r'\d+\.\d+')
# set of used keywords
keywords = set(['and', 'block', 'elif', 'else', 'endblock',
                'endfilter', 'endfor', 'endif', 'endmacro', 'endraw',
                'endtrans', 'extends', 'filter', 'for', 'if', 'in',
                'include', 'is', 'macro', 'not', 'or', 'pluralize', 'raw',
                'recursive', 'set', 'trans', 'call', 'endcall',
                'true', 'false', 'none'])
# bind operators to token types
reverse_operators = dict([(v, k) for k, v in operators.iteritems()])
assert len(operators) == len(reverse_operators), 'operators dropped'
operator_re = re.compile('(%s)' % '|'.join([re.escape(x) for x in
                         sorted(operators, key=lambda x: -len(x))]))
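
# Illustration (not part of the module): `re` alternation takes the first
# branch that matches, so the longest-first sort above is what keeps
# multi-character operators intact:
#
#   >>> re.compile('(%s)' % '|'.join(['\*', '\*\*'])).match('**').group()
#   '*'
#   >>> re.compile('(%s)' % '|'.join(['\*\*', '\*'])).match('**').group()
#   '**'
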
# simple one-character escapes, derived from the escape list documented
# in the docstring of `unescape_string` below
simple_escapes = {
    'a': '\a',
    'n': '\n',
    'r': '\r',
    'f': '\f',
    'v': '\v',
    '\\': '\\',
    '"': '"',
    "'": "'",
    '0': '\x00'
}

#: how many hex digits each unicode escape prefix consumes
unicode_escapes = {
    'x': 2,
    'u': 4,
    'U': 8
}


def unescape_string(lineno, filename, s):
    r"""
    Unescape a string. Supported escapes:
        \a, \n, \r, \f, \v, \\, \", \', \0

        \x00, \u0000, \U00000000, \N{...}

    Octal escapes like \101 are not supported because they are redundant.
    """
    result = []
    write = result.append
    chariter = iter(s)
    next_char = chariter.next

    # local aliases for faster lookups
    sescapes = simple_escapes
    uescapes = unicode_escapes

    try:
        for char in chariter:
            if char == '\\':
                char = next_char()
                if char in sescapes:
                    write(sescapes[char])
                elif char in uescapes:
                    seq = [next_char() for x in xrange(uescapes[char])]
                    try:
                        write(unichr(int(''.join(seq), 16)))
                    except ValueError:
                        raise TemplateSyntaxError('invalid unicode codepoint',
                                                  lineno, filename)
                elif char == 'N':
                    if next_char() != '{':
                        raise TemplateSyntaxError('no name for codepoint',
                                                  lineno, filename)
                    seq = []
                    while 1:
                        char = next_char()
                        if char == '}':
                            break
                        seq.append(char)
                    try:
                        write(unicodedata.lookup(u''.join(seq)))
                    except KeyError:
                        raise TemplateSyntaxError('unknown character name',
                                                  lineno, filename)
                else:
                    write('\\' + char)
            else:
                write(char)
    except StopIteration:
        raise TemplateSyntaxError('invalid string escape', lineno, filename)
    return u''.join(result)
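
# A quick sketch of `unescape_string` in action (values illustrative):
#
#   >>> unescape_string(1, None, 'foo\\nbar')
#   u'foo\nbar'
#   >>> unescape_string(1, None, '\\N{LATIN SMALL LETTER A}')
#   u'a'
#
# a trailing backslash runs the character iterator dry and is reported
# as an invalid string escape.
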
def unescape_regex(s):
    """
    Unescape rules for regular expressions.
    """
    buffer = []
    write = buffer.append
    for char in s:
        # re-escape anything outside the safe character set
        if char not in safe_chars:
            write('\\')
        write(char)
    return u''.join(buffer)
class Failure(object):
    """
    Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno, filename):
        raise self.error_class(self.message, lineno, filename)
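
# `Failure` instances sit in rule tuples and only fire once the lexer
# actually matches them, e.g. (hypothetical call):
#
#   fail = Failure('Missing end of comment tag')
#   fail(42, 'layout.html')    # raises TemplateSyntaxError at line 42
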
class LexerMeta(type):
    """
    Metaclass for the lexer that caches instances for
    the same configuration in a weak value dictionary.
    """

    def __call__(cls, environment):
        key = hash((environment.block_start_string,
                    environment.block_end_string,
                    environment.variable_start_string,
                    environment.variable_end_string,
                    environment.comment_start_string,
                    environment.comment_end_string,
                    environment.trim_blocks))

        # use the cached lexer if possible
        if key in _lexer_cache:
            return _lexer_cache[key]

        # create a new lexer and cache it
        lexer = type.__call__(cls, environment)
        _lexer_cache[key] = lexer
        return lexer
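
# Because of the metaclass, constructing lexers for environments with an
# identical delimiter configuration yields the same cached instance; a
# sketch, assuming two environments with default configuration:
#
#   >>> Lexer(env_a) is Lexer(env_b)
#   True
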
class Lexer(object):
    """
    Class that implements a lexer for a given environment. Automatically
    created by the environment class, so you usually don't have to do
    that yourself.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    __metaclass__ = LexerMeta
    def __init__(self, environment):
        # shortcuts
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # lexing rules for tags
        tag_rules = [
            (whitespace_re, None, None),
            (float_re, 'float', None),
            (integer_re, 'integer', None),
            (c(r'\b(?:%s)\b' % '|'.join(sorted(keywords,
                                               key=lambda x: -len(x)))),
             'keyword', None),
            (name_re, 'name', None),
            (string_re, 'string', None),
            (operator_re, 'operator', None)
        ]
        #: if variables and blocks have the same delimiters we won't
        #: receive any variable blocks in the parser. This variable is `True`
        #: if the lexer was configured that way.
        self.no_variable_block = (
            (environment.variable_start_string is
             environment.variable_end_string is None) or
            (environment.variable_start_string ==
             environment.block_start_string and
             environment.variable_end_string ==
             environment.block_end_string)
        )
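
        # For illustration: a hypothetical environment configured with the
        # same delimiters for blocks and variables sets this flag:
        #
        #   env = Environment(variable_start_string='<%',
        #                     variable_end_string='%>',
        #                     block_start_string='<%',
        #                     block_end_string='%>')
        #   Lexer(env).no_variable_block   -> True
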
        # assemble the root lexing rule. because "|" tries alternatives
        # from left to right we have to sort by length so that the lexer
        # continues working as expected when we have parsing rules like
        # <% for blocks and <%= for variables. (if someone wants asp-like
        # syntax) variables are just part of the rules if variable
        # processing is required.
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string)
        ]
        if not self.no_variable_block:
            root_tag_rules.append(('variable',
                                   environment.variable_start_string))
        root_tag_rules.sort(lambda a, b: cmp(len(b[1]), len(a[1])))
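
        # e.g. with asp-like delimiters (assuming '<%#' for comments) the
        # sort guarantees that '<%=' is tried before '<%', roughly:
        #
        #   [('variable', '<%='), ('comment', '<%#'), ('block', '<%')]
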
        # block suffix if trimming is enabled
        block_suffix_re = environment.trim_blocks and '\\n?' or ''
        # global lexing rules
        self.rules = {
            'root': [
                # directives
                (c('(.*?)(?:%s)' % '|'.join(
                    ['(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*%s)' % (
                        e(environment.block_start_string),
                        e(environment.block_start_string),
                        e(environment.block_end_string)
                    )] + [
                    '(?P<%s_begin>\s*%s\-|%s)' % (n, e(r), e(r))
                    for n, r in root_tag_rules
                ])), ('data', '#bygroup'), '#bygroup'),
                # data
                (c('.+'), 'data', None)
            ],
            # comments
            'comment_begin': [
                (c(r'(.*?)((?:\-%s\s*|%s)%s)' % (
                    e(environment.comment_end_string),
                    e(environment.comment_end_string),
                    block_suffix_re
                )), ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            # blocks
            'block_begin': [
                (c('(?:\-%s\s*|%s)%s' % (
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), 'block_end', '#pop'),
            ] + tag_rules,
            # raw block
            'raw_begin': [
                (c('(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % (
                    e(environment.block_start_string),
                    e(environment.block_start_string),
                    e(environment.block_end_string),
                    e(environment.block_end_string),
                    block_suffix_re
                )), ('data', 'raw_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of raw directive'),), None)
            ]
        }

        # only add the variable rules to the list if we process variables;
        # the variable_end_string could be None and would break things.
        if not self.no_variable_block:
            self.rules['variable_begin'] = [
                (c(r'\-%s\s*|%s' % (
                    e(environment.variable_end_string),
                    e(environment.variable_end_string)
                )), 'variable_end', '#pop')
            ] + tag_rules
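
    # `self.rules` now maps each state name ('root', 'comment_begin',
    # 'block_begin', 'raw_begin' and possibly 'variable_begin') to a list
    # of (compiled regex, token spec, new state) triples that `tokeniter`
    # below walks through.
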
    def tokenize(self, source, filename=None):
        """
        Works like `tokeniter` but returns a `TokenStream` of tokens and not
        a generator of token tuples. Additionally all token values are
        already converted into types and postprocessed. For example keywords
        are already keyword tokens, not named tokens, comments are removed,
        integers and floats converted, strings unescaped etc.
        """
        def generate():
            for lineno, token, value in self.tokeniter(source, filename):
                if token in ('comment_begin', 'comment', 'comment_end'):
                    continue
                elif token == 'data':
                    # degrade to a native string if possible
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'keyword':
                    # keywords become their own token type
                    token = value
                elif token == 'name':
                    value = str(value)
                elif token == 'string':
                    value = unescape_string(lineno, filename, value[1:-1])
                    try:
                        value = str(value)
                    except UnicodeError:
                        pass
                elif token == 'integer':
                    value = int(value)
                elif token == 'float':
                    value = float(value)
                elif token == 'operator':
                    token = operators[value]
                    value = ''
                yield Token(lineno, token, value)
        return TokenStream(generate(), filename)
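
    # Rough sketch of the postprocessed output for '{{ 1.5 }}' with the
    # default delimiters (exact token values depend on the environment):
    #
    #   Token(1, 'variable_begin', ...)
    #   Token(1, 'float', 1.5)
    #   Token(1, 'variable_end', ...)
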
    def tokeniter(self, source, filename=None):
        """
        This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.
        The output you get is not compatible with the input the jinja
        parser wants. The parser uses the `tokenize` method which returns
        a `TokenStream` with keywords instead of just names.
        """
        source = '\n'.join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        balancing_stack = []

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and \
                   tokens in ('variable_end', 'block_end'):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group: track newlines but emit nothing
                        if token is None:
                            g = m.group(idx + 1)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group raises a syntax error directly
                        elif token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')

                # plain string tokens are yielded as-is, but only
                # if the data is not empty
                else:
                    data = m.group()
                    # update brace/parentheses balance
                    if tokens == 'operator':
                        if data == '{':
                            balancing_stack.append('}')
                        elif data == '(':
                            balancing_stack.append(')')
                        elif data == '[':
                            balancing_stack.append(']')
                        elif data in ('}', ')', ']'):
                            if not balancing_stack:
                                raise TemplateSyntaxError('unexpected "%s"' %
                                                          data, lineno,
                                                          filename)
                            expected_op = balancing_stack.pop()
                            if expected_op != data:
                                raise TemplateSyntaxError('unexpected "%s", '
                                                          'expected "%s"' %
                                                          (data, expected_op),
                                                          lineno, filename)

                    if tokens is not None:
                        if data:
                            yield lineno, tokens, data
                    lineno += data.count('\n')
                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise an error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the loop terminated without break we haven't found a
            # single match: either we are at the end of the file or we
            # have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno,
                                          filename)
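
# A minimal end-to-end illustration (hypothetical session, default
# delimiters, assuming `Environment` creates and exposes its lexer):
#
#   >>> from jinja import Environment
#   >>> lexer = Environment().lexer
#   >>> for item in lexer.tokeniter("{{ {'a': 1} }}"):
#   ...     print item
#
# yields plain (lineno, token, value) tuples; thanks to the balancing
# stack the inner '}' of the dict literal is lexed as an operator instead
# of being mistaken for part of the variable end delimiter.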