From 9ad96e7af659846abcf45e0c64bf07dacf7881c4 Mon Sep 17 00:00:00 2001
From: Armin Ronacher
Date: Fri, 13 Jun 2008 22:44:01 +0200
Subject: [PATCH] added support for token stream filtering and preprocessing.

--HG--
branch : trunk
---
 CHANGES               |    4 ++
 docs/api.rst          |    2 +
 docs/extensions.rst   |    2 +-
 jinja2/environment.py |   31 +++++++++++-
 jinja2/ext.py         |   16 ++++++
 jinja2/lexer.py       |  110 +++++++++++++++++++++---------------------
 jinja2/parser.py      |    3 +-
 7 files changed, 109 insertions(+), 59 deletions(-)

diff --git a/CHANGES b/CHANGES
index 393ba0c..4277283 100644
--- a/CHANGES
+++ b/CHANGES
@@ -13,6 +13,10 @@ Version 2.0
   For more information see :ref:`the implementation details `.
 
+- added support for preprocessing and token stream filtering for extensions.
+  This allows extensions to implement features such as simplified gettext
+  calls in template data.
+
 
 Version 2.0rc1
 --------------
 (no codename, released on June 9th 2008)
diff --git a/docs/api.rst b/docs/api.rst
index e4c7a50..e16107a 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -590,6 +590,8 @@ don't recommend using any of those.
 
 .. automethod:: Environment.parse
 
+.. automethod:: Environment.preprocess
+
 .. automethod:: Template.new_context
 
 .. method:: Template.root_render_func(context)
diff --git a/docs/extensions.rst b/docs/extensions.rst
index 8ea554a..215a684 100644
--- a/docs/extensions.rst
+++ b/docs/extensions.rst
@@ -168,7 +168,7 @@ Extension API
 Extensions always have to extend the :class:`jinja2.ext.Extension` class:
 
 .. autoclass:: Extension
-    :members: parse, attr, call_method
+    :members: preprocess, filter_stream, parse, attr, call_method
 
     .. attribute:: identifier
 
diff --git a/jinja2/environment.py b/jinja2/environment.py
index 689bc92..e24e78e 100644
--- a/jinja2/environment.py
+++ b/jinja2/environment.py
@@ -10,7 +10,7 @@
 """
 import sys
 from jinja2.defaults import *
-from jinja2.lexer import Lexer
+from jinja2.lexer import Lexer, TokenStream
 from jinja2.parser import Parser
 from jinja2.optimizer import optimize
 from jinja2.compiler import generate
@@ -339,8 +339,35 @@ class Environment(object):
         tokens as tuples in the form ``(lineno, token_type, value)``.
         This can be useful for :ref:`extension development `
         and debugging templates.
+
+        This does not perform preprocessing. If you want the preprocessing
+        of the extensions to be applied you have to filter the source through
+        the :meth:`preprocess` method.
+        """
+        return self.lexer.tokeniter(unicode(source), name, filename)
+
+    def preprocess(self, source, name=None, filename=None):
+        """Preprocesses the source with all extensions. This is automatically
+        called for all parsing and compiling methods but *not* for :meth:`lex`
+        because there you usually only want the actual source tokenized.
+        """
+        return reduce(lambda s, e: e.preprocess(s, name, filename),
+                      self.extensions.itervalues(), unicode(source))
+
+    def _tokenize(self, source, name, filename=None):
+        """Called by the parser to do the preprocessing and filtering
+        for all the extensions. Returns a :class:`~jinja2.lexer.TokenStream`.
""" - return self.lexer.tokeniter(source, name, filename) + def _stream(iterable): + if not isinstance(iterable, TokenStream): + iterable = TokenStream(iterable, name, filename) + return iterable + source = self.preprocess(source, name, filename) + tokeniter = self.lexer.tokeniter(source, name, filename) + stream = _stream(self.lexer.wrap(tokeniter, name, filename)) + for ext in self.extensions.itervalues(): + stream = _stream(ext.filter_stream(stream)) + return stream def compile(self, source, name=None, filename=None, raw=False): """Compile a node or template source code. The `name` parameter is diff --git a/jinja2/ext.py b/jinja2/ext.py index 93cde83..9dfa87c 100644 --- a/jinja2/ext.py +++ b/jinja2/ext.py @@ -16,6 +16,7 @@ from jinja2.defaults import * from jinja2.environment import get_spontaneous_environment from jinja2.runtime import Undefined, concat from jinja2.exceptions import TemplateAssertionError, TemplateSyntaxError +from jinja2.lexer import Token from jinja2.utils import contextfunction, import_string, Markup @@ -67,6 +68,21 @@ class Extension(object): rv.environment = environment return rv + def preprocess(self, source, name, filename=None): + """This method is called before the actual lexing and can be used to + preprocess the source. The `filename` is optional. The return value + must be the preprocessed source. + """ + return source + + def filter_stream(self, stream): + """It's passed a :class:`~jinja2.lexer.TokenStream` that can be used + to filter tokens returned. This method has to return an iterable of + :class:`~jinja2.lexer.Token`\s, but it doesn't have to return a + :class:`~jinja2.lexer.TokenStream`. + """ + return stream + def parse(self, parser): """If any of the :attr:`tags` matched this method is called with the parser as first argument. The token the parser stream is pointing at diff --git a/jinja2/lexer.py b/jinja2/lexer.py index 64621fd..1f22ed7 100644 --- a/jinja2/lexer.py +++ b/jinja2/lexer.py @@ -133,17 +133,17 @@ class TokenStreamIterator(object): """ def __init__(self, stream): - self._stream = stream + self.stream = stream def __iter__(self): return self def next(self): - token = self._stream.current + token = self.stream.current if token.type == 'eof': - self._stream.close() + self.stream.close() raise StopIteration() - self._stream.next(False) + self.stream.next() return token @@ -154,11 +154,12 @@ class TokenStream(object): """ def __init__(self, generator, name, filename): - self._next = generator.next + self._next = iter(generator).next self._pushed = deque() - self.current = Token(1, 'initial', '') self.name = name self.filename = filename + self.closed = False + self.current = Token(1, 'initial', '') self.next() def __iter__(self): @@ -214,6 +215,7 @@ class TokenStream(object): """Close the stream.""" self.current = Token(self.current.lineno, 'eof', '') self._next = None + self.closed = True def expect(self, expr): """Expect a given token type and return it. This accepts the same @@ -374,60 +376,60 @@ class Lexer(object): return newline_re.sub(self.newline_sequence, value) def tokenize(self, source, name=None, filename=None): - """Works like `tokeniter` but returns a tokenstream of tokens and not - a generator or token tuples. Additionally all token values are already - converted into types and postprocessed. For example comments are removed, - integers and floats converted, strings unescaped etc. + """Calls tokeniter + tokenize and wraps it in a token stream. + This is currently only used for unittests. 
""" - def generate(): - for lineno, token, value in self.tokeniter(source, name, filename): - if token in ('comment_begin', 'comment', 'comment_end', - 'whitespace'): - continue - elif token == 'linestatement_begin': - token = 'block_begin' - elif token == 'linestatement_end': - token = 'block_end' - # we are not interested in those tokens in the parser - elif token in ('raw_begin', 'raw_end'): - continue - elif token == 'data': - value = self._normalize_newlines(value) - elif token == 'keyword': - token = value - elif token == 'name': + stream = self.tokeniter(source, name, filename) + return TokenStream(self.wrap(stream, name, filename), name, filename) + + def wrap(self, stream, name=None, filename=None): + """This is called with the stream as returned by `tokenize` and wraps + every token in a :class:`Token` and converts the value. + """ + for lineno, token, value in stream: + if token in ('comment_begin', 'comment', 'comment_end', + 'whitespace'): + continue + elif token == 'linestatement_begin': + token = 'block_begin' + elif token == 'linestatement_end': + token = 'block_end' + # we are not interested in those tokens in the parser + elif token in ('raw_begin', 'raw_end'): + continue + elif token == 'data': + value = self._normalize_newlines(value) + elif token == 'keyword': + token = value + elif token == 'name': + value = str(value) + elif token == 'string': + # try to unescape string + try: + value = self._normalize_newlines(value[1:-1]) \ + .encode('ascii', 'backslashreplace') \ + .decode('unicode-escape') + except Exception, e: + msg = str(e).split(':')[-1].strip() + raise TemplateSyntaxError(msg, lineno, name, filename) + # if we can express it as bytestring (ascii only) + # we do that for support of semi broken APIs + # as datetime.datetime.strftime + try: value = str(value) - elif token == 'string': - # try to unescape string - try: - value = self._normalize_newlines(value[1:-1]) \ - .encode('ascii', 'backslashreplace') \ - .decode('unicode-escape') - except Exception, e: - msg = str(e).split(':')[-1].strip() - raise TemplateSyntaxError(msg, lineno, name, filename) - # if we can express it as bytestring (ascii only) - # we do that for support of semi broken APIs - # as datetime.datetime.strftime - try: - value = str(value) - except UnicodeError: - pass - elif token == 'integer': - value = int(value) - elif token == 'float': - value = float(value) - elif token == 'operator': - token = operators[value] - yield Token(lineno, token, value) - return TokenStream(generate(), name, filename) + except UnicodeError: + pass + elif token == 'integer': + value = int(value) + elif token == 'float': + value = float(value) + elif token == 'operator': + token = operators[value] + yield Token(lineno, token, value) def tokeniter(self, source, name, filename=None): """This method tokenizes the text and returns the tokens in a generator. Use this method if you just want to tokenize a template. - The output you get is not compatible with the input the jinja parser - wants. The parser uses the `tokenize` function with returns a - `TokenStream` and postprocessed tokens. 
""" source = '\n'.join(unicode(source).splitlines()) pos = 0 diff --git a/jinja2/parser.py b/jinja2/parser.py index 810e381..ea7391f 100644 --- a/jinja2/parser.py +++ b/jinja2/parser.py @@ -25,11 +25,10 @@ class Parser(object): def __init__(self, environment, source, name=None, filename=None): self.environment = environment - self.source = unicode(source) + self.stream = environment._tokenize(source, name, filename) self.name = name self.filename = filename self.closed = False - self.stream = environment.lexer.tokenize(self.source, name, filename) self.extensions = {} for extension in environment.extensions.itervalues(): for tag in extension.tags: -- 2.26.2