some more cythonisation in Plex scanner classes (15% faster for lxml)
author: Stefan Behnel <scoder@users.berlios.de>
Thu, 12 Mar 2009 19:24:28 +0000 (20:24 +0100)
committer: Stefan Behnel <scoder@users.berlios.de>
Thu, 12 Mar 2009 19:24:28 +0000 (20:24 +0100)
Cython/Plex/Scanners.pxd
Cython/Plex/Scanners.py

index 58c9a670282c94e14106eba8c55de35dd39fedd4..fe09cff065cf18f6ef2415ecfb08fc9eb63ca0f4 100644 (file)
@@ -6,28 +6,29 @@ cdef class Scanner:
     cdef public stream
     cdef public name
     cdef public buffer
-    cdef public long buf_start_pos
-    cdef public long next_pos
-    cdef public long cur_pos
-    cdef public long cur_line
-    cdef public long cur_line_start
-    cdef public long start_pos
-    cdef public long start_line
-    cdef public long start_col
+    cdef public Py_ssize_t buf_start_pos
+    cdef public Py_ssize_t next_pos
+    cdef public Py_ssize_t cur_pos
+    cdef public Py_ssize_t cur_line
+    cdef public Py_ssize_t cur_line_start
+    cdef public Py_ssize_t start_pos
+    cdef public Py_ssize_t start_line
+    cdef public Py_ssize_t start_col
     cdef public text
     cdef public initial_state # int?
     cdef public state_name
     cdef public list queue
     cdef public bint trace
     cdef public cur_char
-    cdef public input_state
+    cdef public int input_state
 
     cdef public level
 
     @cython.locals(input_state=long)
     cpdef next_char(self)
     cpdef read(self)
-    cpdef position(self)
+    cpdef tuple scan_a_token(self)
+    cpdef tuple position(self)
 
     @cython.locals(cur_pos=cython.long, cur_line=cython.long,
                  cur_line_start=cython.long, input_state=cython.long,
index c76ec7067843549a927d84503ac8d09c28dfc58d..c6d511edeb7bc2e5f1d3c99d043e9e6019f33075 100644 (file)
@@ -75,6 +75,8 @@ class Scanner:
       |name| is optional, and may be the name of the file being
       scanned or any other identifying string.
     """
+    self.trace = 0
+
     self.buffer = ''
     self.buf_start_pos = 0
     self.next_pos = 0
@@ -135,7 +137,7 @@ class Scanner:
 #        else:
 #            action = self.run_machine_inlined()
     action = self.run_machine_inlined()
-    if action:
+    if action is not None:
       if self.trace:
         print("Scanner: read: Performing %s %d:%d" % (
           action, self.start_pos, self.cur_pos))
@@ -144,21 +146,11 @@ class Scanner:
       return (text, action)
     else:
       if self.cur_pos == self.start_pos:
-        if self.cur_char == EOL:
+        if self.cur_char is EOL:
           self.next_char()
-        if not self.cur_char or self.cur_char == EOF:
+        if self.cur_char is None or self.cur_char is EOF:
           return ('', None)
       raise Errors.UnrecognizedInput(self, self.state_name)
-  
-  def run_machine(self):
-    """
-    Run the machine until no more transitions are possible.
-    """
-    self.state = self.initial_state
-    self.backup_state = None
-    while self.transition():
-      pass
-    return self.back_up()
 
   def run_machine_inlined(self):
     """
@@ -183,7 +175,7 @@ class Scanner:
       # Begin inlined self.save_for_backup()
       #action = state.action #@slow
       action = state['action'] #@fast
-      if action:
+      if action is not None:
         backup_state = (
           action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos)
       # End inlined self.save_for_backup()
@@ -245,7 +237,7 @@ class Scanner:
         if trace: #TRACE#
           print("blocked")  #TRACE#
         # Begin inlined: action = self.back_up()
-        if backup_state:
+        if backup_state is not None:
           (action, cur_pos, cur_line, cur_line_start, 
             cur_char, input_state, next_pos) = backup_state
         else:
@@ -259,46 +251,9 @@ class Scanner:
     self.input_state = input_state
     self.next_pos     = next_pos
     if trace: #TRACE#
-      if action: #TRACE#
-        print("Doing " + action) #TRACE#
+      if action is not None: #TRACE#
+        print("Doing %s" % action) #TRACE#
     return action
-    
-#    def transition(self):
-#        self.save_for_backup()
-#        c = self.cur_char
-#        new_state = self.state.new_state(c)
-#        if new_state:
-#            if self.trace:
-#                print "Scanner: read: State %d: %s --> State %d" % (
-#                    self.state.number, repr(c), new_state.number)
-#            self.state = new_state
-#            self.next_char()
-#            return 1
-#        else:
-#            if self.trace:
-#                print "Scanner: read: State %d: %s --> blocked" % (
-#                    self.state.number, repr(c))
-#            return 0
-  
-#    def save_for_backup(self):
-#        action = self.state.get_action()
-#        if action:
-#            if self.trace:
-#                print "Scanner: read: Saving backup point at", self.cur_pos
-#            self.backup_state = (
-#                action, self.cur_pos, self.cur_line, self.cur_line_start, 
-#                self.cur_char, self.input_state, self.next_pos)
-  
-#    def back_up(self):
-#        backup_state = self.backup_state
-#        if backup_state:
-#            (action, self.cur_pos, self.cur_line, self.cur_line_start, 
-#                self.cur_char, self.input_state, self.next_pos) = backup_state
-#            if self.trace:
-#                print "Scanner: read: Backing up to", self.cur_pos
-#            return action
-#        else:
-#            return None
 
   def next_char(self):
     input_state = self.input_state
@@ -330,26 +285,7 @@ class Scanner:
       self.cur_char = ''
     if self.trace:
       print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))
-    
-#    def read_char(self):
-#        """
-#    Get the next input character, filling the buffer if necessary.
-#    Returns '' at end of file.
-#    """
-#        next_pos = self.next_pos
-#        buf_index = next_pos - self.buf_start_pos
-#        if buf_index == len(self.buffer):
-#            discard = self.start_pos - self.buf_start_pos
-#            data = self.stream.read(0x1000)
-#            self.buffer = self.buffer[discard:] + data
-#            self.buf_start_pos = self.buf_start_pos + discard
-#            buf_index = buf_index - discard
-#            if not data:
-#                return ''
-#        c = self.buffer[buf_index]
-#        self.next_pos = next_pos + 1
-#        return c
-  
+
   def position(self):
     """
     Return a tuple (name, line, col) representing the location of