# -*- coding: utf-8 -*-
"""
    jinja.lexer
    ~~~~~~~~~~~

    :copyright: 2006 by Armin Ronacher.
    :license: BSD, see LICENSE for more details.
"""
import re

from jinja.datastructure import TokenStream
from jinja.exceptions import TemplateSyntaxError


# static regular expressions
whitespace_re = re.compile(r'\s+', re.M)
name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*[!?]?')
string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                       r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.M | re.S)
number_re = re.compile(r'\d+(\.\d+)?')

operator_re = re.compile('(%s)' % '|'.join(
    # unicode fragments are regex syntax and used verbatim, plain
    # strings are escaped so they match literally
    isinstance(x, unicode) and str(x) or re.escape(x) for x in [
    # math operators
    '+', '-', '*', '/', '%',
    # braces and parenthesis
    '[', ']', '(', ')', '{', '}',
    # attribute access and comparison / logical operators. multi-char
    # operators come first so '<=' is not lexed as '<' followed by '='
    '.', ':', ',', '|', '==', '<=', '>=', '!=', '<', '>', '=',
    # keyword operators need \b so they don't match inside names
    ur'or\b', ur'and\b', ur'not\b', ur'in\b', ur'is\b'
]))
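
# an illustrative sketch of how the assembled pattern behaves:
#
#     >>> operator_re.match(u'<= 1').group()
#     u'<='
#     >>> operator_re.match(u'order')        # \b: no keyword inside a name
#     >>> operator_re.match(u'or ').group()
#     u'or'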


class Failure(object):
    """
    Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(self, message, cls=TemplateSyntaxError):
        self.message = message
        self.error_class = cls

    def __call__(self, lineno):
        raise self.error_class(self.message, lineno)
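
# `Failure` instances are embedded in lexer rule tuples; `tokeniter` calls
# the instance with the current line number when its pattern matches. the
# comment state below uses one to report an unclosed comment:
#
#     (c('(.)'), (Failure('Missing end of comment tag'),), None)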

class Lexer(object):
    """
    Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.
    """

    def __init__(self, environment):
        # shortcuts for compiling delimiter-dependent patterns
        c = lambda x: re.compile(x, re.M | re.S)
        e = re.escape

        # parsing rules for tags
        tag_rules = [
            (whitespace_re, None, None),
            (number_re, 'number', None),
            (operator_re, 'operator', None),
            (name_re, 'name', None),
            (string_re, 'string', None)
        ]

        # assemble the root lexing rule. because "|" alternation takes the
        # first alternative that matches rather than the longest one we
        # have to sort by length so that the lexer continues working as
        # expected when we have parsing rules like <% for blocks and <%=
        # for variables. (if someone wants asp like syntax)
        root_tag_rules = [
            ('comment', environment.comment_start_string),
            ('block', environment.block_start_string),
            ('variable', environment.variable_start_string)
        ]
        root_tag_rules.sort(lambda a, b: cmp(len(b[1]), len(a[1])))
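        # e.g. with asp-like delimiters '<%#', '<%' and '<%=' the sort
        # yields ['<%#', '<%=', '<%'], so the root regex tries the longer
        # delimiters before '<%' can shadow '<%='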

        # global parsing rules
        self.rules = {
            'root': [
                (c('(.*?)(?:%s)' % '|'.join([
                    '(?P<%s_begin>%s)' % (n, e(r)) for n, r in root_tag_rules
                ])), ('data', '#bygroup'), '#bygroup'),
                (c('.+'), 'data', None)
            ],
            'comment_begin': [
                (c(r'(.*?)(%s)' % e(environment.comment_end_string)),
                 ('comment', 'comment_end'), '#pop'),
                (c('(.)'), (Failure('Missing end of comment tag'),), None)
            ],
            'block_begin': [
                (c(e(environment.block_end_string) +
                   (environment.trim_blocks and '\\n?' or '')),
                 'block_end', '#pop')
            ] + tag_rules,
            'variable_begin': [
                (c(e(environment.variable_end_string)), 'variable_end',
                 '#pop')
            ] + tag_rules
        }
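
        # how these rules cooperate: `tokeniter` starts in the 'root'
        # state, whose first rule consumes raw data up to the next start
        # delimiter and, via '#bygroup', pushes the state named after the
        # matched group ('comment_begin', 'block_begin' or
        # 'variable_begin'). inside a block or variable the appended
        # `tag_rules` tokenize the expression until the end-delimiter
        # rule yields the '*_end' token and pops ('#pop') back to 'root'.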

    def tokenize(self, source):
        """
        Tokenize ``source`` and wrap the generator returned by `tokeniter`
        in a `TokenStream` to get real token instances and be able to push
        tokens back to the stream. That's for example needed by the parser.
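
        A sketch of typical usage (assuming the environment created this
        lexer and exposes it as ``environment.lexer``)::

            stream = environment.lexer.tokenize(u'{{ foo }}')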
        """
        return TokenStream(self.tokeniter(source))

    def tokeniter(self, source):
        """
        This method tokenizes the text and returns the tokens in a
        generator of ``(lineno, type, value)`` tuples. Normally it's a
        better idea to use `tokenize`, which returns a `TokenStream`, but
        in some situations it can be useful to call this method directly
        since it is marginally faster.
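
        For ``tokeniter(u'Hello {{ name }}!')`` with the default
        delimiters the yielded tuples look roughly like::

            (1, 'data', u'Hello ')
            (1, 'variable_begin', u'{{')
            (1, 'name', u'name')
            (1, 'variable_end', u'}}')
            (1, 'data', u'!')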
        """
        # normalize the newlines; afterwards only '\n' occurs in `source`
        source = type(source)('\n').join(source.splitlines())
        pos = 0
        lineno = 1
        stack = ['root']
        statetokens = self.rules['root']
        source_length = len(source)

        while 1:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)
                # if this rule doesn't match we try the next one
                if m is None:
                    continue
                # tuples support more options
                if isinstance(tokens, tuple):
                    for idx, token in enumerate(tokens):
                        # hidden group, only the line count is updated
                        if token is None:
                            g = m.group(idx + 1)
                            if g:
                                lineno += g.count('\n')
                            continue
                        # failure group, abort with the stored error
                        elif isinstance(token, Failure):
                            raise token(lineno)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == '#bygroup':
                            for key, value in m.groupdict().iteritems():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count('\n')
                                    break
                            else:
                                raise RuntimeError('%r wanted to resolve '
                                                   'the token dynamically'
                                                   ' but no group matched'
                                                   % regex)
                        # normal group
                        else:
                            data = m.group(idx + 1)
                            if data:
                                yield lineno, token, data
                            lineno += data.count('\n')
                # strings as token are just yielded as-is, but only
                # if the data is not empty
                else:
                    data = m.group()
                    if tokens is not None and data:
                        yield lineno, tokens, data
                    lineno += data.count('\n')
                # fetch the new position into a new variable so that we
                # can check for an internal parsing error which would
                # result in an infinite loop
                pos2 = m.end()
                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == '#pop':
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == '#bygroup':
                        for key, value in m.groupdict().iteritems():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError('%r wanted to resolve the '
                                               'new state dynamically but'
                                               ' no group matched' %
                                               regex)
                    # direct state name given
                    else:
                        stack.append(new_state)
                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and there was no stack
                # change. this means a loop without a break condition,
                # avoid that and raise an error
                elif pos2 == pos:
                    raise RuntimeError('%r yielded empty string without '
                                       'stack change' % regex)
                # publish the new position and start again
                pos = pos2
                break
            # if the for loop terminated without a break we haven't found
            # a single match; either we are at the end of the file or we
            # have a problem
            else:
                # end of text
                if pos >= source_length:
                    return
                # something went wrong
                raise TemplateSyntaxError('unexpected char %r at %d' %
                                          (source[pos], pos), lineno)