jinja/lexer.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     jinja.lexer
   4     ~~~~~~~~~~~
   5
   6     :copyright: 2006 by Armin Ronacher.
   7     :license: BSD, see LICENSE for more details.
   8 """
   9 import re
  10 from jinja.datastructure import TokenStream
  11 from jinja.exceptions import TemplateSyntaxError
  12
  13
  14 # static regular expressions
  15 whitespace_re = re.compile(r'\s+(?m)')
  16 name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*[!?]?')
  17 string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'"
  18                        r'|"([^"\\]*(?:\\.[^"\\]*)*)")(?ms)')
  19 number_re = re.compile(r'\d+(\.\d+)*')
  20
  21 operator_re = re.compile('(%s)' % '|'.join(
  22     isinstance(x, unicode) and str(x) or re.escape(x) for x in [
  23     # math operators
  24     '+', '-', '*', '/', '%',
  25     # braces and parenthesis
  26     '[', ']', '(', ')', '{', '}',
  27     # attribute access and comparison / logical operators
  28     '.', ':', ',', '|', '==', '<', '>', '<=', '>=', '!=', '=',
  29     ur'or\b', ur'and\b', ur'not\b', ur'in\b', ur'is'
  30 ]))
  31
  32
  33 class Failure(object):
  34     """
  35     Class that raises a `TemplateSyntaxError` if called.
  36     Used by the `Lexer` to specify known errors.
  37     """
  38
  39     def __init__(self, message, cls=TemplateSyntaxError):
  40         self.message = message
  41         self.error_class = cls
  42
  43     def __call__(self, lineno):
  44         raise self.error_class(self.message, lineno)
  45
  46
  47 class Lexer(object):
  48     """
  49     Class that implements a lexer for a given environment. Automatically
  50     created by the environment class, usually you don't have to do that.
  51     """
  52
  53     def __init__(self, environment):
  54         # shortcuts
  55         c = lambda x: re.compile(x, re.M | re.S)
  56         e = re.escape
  57
  58         # parsing rules for tags
  59         tag_rules = [
  60             (whitespace_re, None, None),
  61             (number_re, 'number', None),
  62             (operator_re, 'operator', None),
  63             (name_re, 'name', None),
  64             (string_re, 'string', None)
  65         ]
  66
  67         # assamble the root lexing rule. because "|" is ungreedy
  68         # we have to sort by length so that the lexer continues working
  69         # as expected when we have parsing rules like <% for block and
  70         # <%= for variables. (if someone wants asp like syntax)
  71         root_tag_rules = [
  72             ('comment',     environment.comment_start_string),
  73             ('block',       environment.block_start_string),
  74             ('variable',    environment.variable_start_string)
  75         ]
  76         root_tag_rules.sort(lambda a, b: cmp(len(b[1]), len(a[1])))
  77
  78         # global parsing rules
  79         self.rules = {
  80             'root': [
  81                 (c('(.*?)(?:%s)' % '|'.join([
  82                     '(?P<%s_begin>%s)' % (n, e(r)) for n, r in root_tag_rules
  83                 ])), ('data', '#bygroup'), '#bygroup'),
  84                 (c('.+'), 'data', None)
  85             ],
  86             'comment_begin': [
  87                 (c(r'(.*?)(%s)' % e(environment.comment_end_string)),
  88                  ('comment', 'comment_end'), '#pop'),
  89                 (c('(.)'), (Failure('Missing end of comment tag'),), None)
  90             ],
  91             'block_begin': [
  92                 (c(e(environment.block_end_string) +
  93                   (environment.trim_blocks and '\\n?' or '')), 'block_end', '#pop')
  94             ] + tag_rules,
  95             'variable_begin': [
  96                 (c(e(environment.variable_end_string)), 'variable_end',
  97                  '#pop')
  98             ] + tag_rules
  99         }
 100
 101     def tokenize(self, source):
 102         """
 103         Simple tokenize function that yields ``(position, type, contents)``
 104         tuples. Wrap the generator returned by this function in a
 105         `TokenStream` to get real token instances and be able to push tokens
 106         back to the stream. That's for example done by the parser.
 107         """
 108         return TokenStream(self.tokeniter(source))
 109
 110     def tokeniter(self, source):
 111         """
 112         This method tokenizes the text and returns the tokens in a generator.
 113         Normally it's a better idea to use the `tokenize` function which
 114         returns a `TokenStream` but in some situations it can be useful
 115         to use this function since it can be marginally faster.
 116         """
 117         source = type(source)('\n').join(source.splitlines())
 118         pos = lineno = 0
 119         stack = ['root']
 120         statetokens = self.rules['root']
 121         source_length = len(source)
 122
 123         while True:
 124             # tokenizer loop
 125             for regex, tokens, new_state in statetokens:
 126                 m = regex.match(source, pos)
 127                 if m:
 128                     # tuples support more options
 129                     if isinstance(tokens, tuple):
 130                         for idx, token in enumerate(tokens):
 131                             # hidden group
 132                             if token is None:
 133                                 g += m.group(idx)
 134                                 if g:
 135                                     lineno += g.count('\n')
 136                                 continue
 137                             # failure group
 138                             elif isinstance(token, Failure):
 139                                 raise token(m.start(idx + 1))
 140                             # bygroup is a bit more complex, in that case we
 141                             # yield for the current token the first named
 142                             # group that matched
 143                             elif token == '#bygroup':
 144                                 for key, value in m.groupdict().iteritems():
 145                                     if value is not None:
 146                                         yield lineno, key, value
 147                                         lineno += value.count('\n')
 148                                         break
 149                                 else:
 150                                     raise RuntimeError('%r wanted to resolve '
 151                                                        'the token dynamically'
 152                                                        ' but no group matched'
 153                                                        % regex)
 154                             # normal group
 155                             else:
 156                                 data = m.group(idx + 1)
 157                                 if data:
 158                                     yield lineno, token, data
 159                                 lineno += data.count('\n')
 160                     # strings as token just are yielded as it, but just
 161                     # if the data is not empty
 162                     else:
 163                         data = m.group()
 164                         if tokens is not None:
 165                             if data:
 166                                 yield lineno, tokens, data
 167                         lineno += data.count('\n')
 168                     # fetch new position into new variable so that we can check
 169                     # if there is a internal parsing error which would result
 170                     # in an infinite loop
 171                     pos2 = m.end()
 172                     # handle state changes
 173                     if new_state is not None:
 174                         # remove the uppermost state
 175                         if new_state == '#pop':
 176                             stack.pop()
 177                         # resolve the new state by group checking
 178                         elif new_state == '#bygroup':
 179                             for key, value in m.groupdict().iteritems():
 180                                 if value is not None:
 181                                     stack.append(key)
 182                                     break
 183                             else:
 184                                 raise RuntimeError('%r wanted to resolve the '
 185                                                    'new state dynamically but'
 186                                                    ' no group matched' %
 187                                                    regex)
 188                         # direct state name given
 189                         else:
 190                             stack.append(new_state)
 191                         statetokens = self.rules[stack[-1]]
 192                     # we are still at the same position and no stack change.
 193                     # this means a loop without break condition, avoid that and
 194                     # raise error
 195                     elif pos2 == pos:
 196                         raise RuntimeError('%r yielded empty string without '
 197                                            'stack change' % regex)
 198                     # publish new function and start again
 199                     pos = pos2
 200                     break
 201             # if loop terminated without break we havn't found a single match
 202             # either we are at the end of the file or we have a problem
 203             else:
 204                 # end of text
 205                 if pos >= source_length:
 206                     return
 207                 # something went wrong
 208                 raise TemplateSyntaxError('unexpected char %r at %d' %
 209                                           (source[pos], pos), lineno)