1 #=======================================================================
3 # Python Lexical Analyser
6 # Scanning an input stream
8 #=======================================================================
11 cython.declare(BOL=object, EOL=object, EOF=object)
14 from Regexps import BOL, EOL, EOF
16 class Scanner(object):
18 A Scanner is used to read tokens from a stream of characters
19 using the token set specified by a Plex.Lexicon.
23 Scanner(lexicon, stream, name = '')
25 See the docstring of the __init__ method for details.
29 See the docstrings of the individual methods for more
32 read() --> (value, text)
33 Reads the next lexical token from the stream.
35 position() --> (name, line, col)
36 Returns the position of the last token read using the
40 Causes scanner to change state.
42 produce(value [, text])
43 Causes return of a token value to the caller of the
48 # lexicon = None # Lexicon
49 # stream = None # file-like object
52 # buf_start_pos = 0 # position in input of start of buffer
53 # next_pos = 0 # position in input of next char to read
54 # cur_pos = 0 # position in input of current char
55 # cur_line = 1 # line number of current char
56 # cur_line_start = 0 # position in input of start of current line
57 # start_pos = 0 # position in input of start of token
58 # start_line = 0 # line number of start of token
59 # start_col = 0 # position in line of start of token
60 # text = None # text of last token read
61 # initial_state = None # Node
62 # state_name = '' # Name of initial state
63 # queue = None # list of tokens to be returned
66 def __init__(self, lexicon, stream, name = '', initial_pos = None):
68 Scanner(lexicon, stream, name = '')
70 |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
73 |stream| can be a file object or anything which implements a
74 compatible read() method.
76 |name| is optional, and may be the name of the file being
77 scanned or any other identifying string.
82 self.buf_start_pos = 0
90 self.state_name = None
92 self.lexicon = lexicon
96 self.initial_state = None
100 self.cur_line_start = 0
103 if initial_pos is not None:
104 self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2]
108 Read the next lexical token from the stream and return a
109 tuple (value, text), where |value| is the value associated with
110 the token as specified by the Lexicon, and |text| is the actual
111 string read from the stream. Returns (None, '') on end of file.
115 self.text, action = self.scan_a_token()
120 value = action.perform(self, self.text)
121 if value is not None:
127 def scan_a_token(self):
129 Read the next input sequence recognised by the machine
130 and return (text, action). Returns ('', None) on end of
133 self.start_pos = self.cur_pos
134 self.start_line = self.cur_line
135 self.start_col = self.cur_pos - self.cur_line_start
136 action = self.run_machine_inlined()
137 if action is not None:
139 print("Scanner: read: Performing %s %d:%d" % (
140 action, self.start_pos, self.cur_pos))
141 base = self.buf_start_pos
142 text = self.buffer[self.start_pos - base : self.cur_pos - base]
143 return (text, action)
145 if self.cur_pos == self.start_pos:
146 if self.cur_char is EOL:
148 if self.cur_char is None or self.cur_char is EOF:
150 raise Errors.UnrecognizedInput(self, self.state_name)
152 def run_machine_inlined(self):
154 Inlined version of run_machine for speed.
156 state = self.initial_state
157 cur_pos = self.cur_pos
158 cur_line = self.cur_line
159 cur_line_start = self.cur_line_start
160 cur_char = self.cur_char
161 input_state = self.input_state
162 next_pos = self.next_pos
164 buf_start_pos = self.buf_start_pos
165 buf_len = len(buffer)
170 print("State %d, %d/%d:%s -->" % ( #TRACE#
171 state['number'], input_state, cur_pos, repr(cur_char))) #TRACE#
172 # Begin inlined self.save_for_backup()
173 #action = state.action #@slow
174 action = state['action'] #@fast
175 if action is not None:
177 action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos)
178 # End inlined self.save_for_backup()
180 #new_state = state.new_state(c) #@slow
181 new_state = state.get(c, -1) #@fast
182 if new_state == -1: #@fast
183 new_state = c and state.get('else') #@fast
186 print("State %d" % new_state['number']) #TRACE#
188 # Begin inlined: self.next_char()
191 # Begin inlined: c = self.read_char()
192 buf_index = next_pos - buf_start_pos
193 if buf_index < buf_len:
194 c = buffer[buf_index]
195 next_pos = next_pos + 1
197 discard = self.start_pos - buf_start_pos
198 data = self.stream.read(0x1000)
199 buffer = self.buffer[discard:] + data
201 buf_start_pos = buf_start_pos + discard
202 self.buf_start_pos = buf_start_pos
203 buf_len = len(buffer)
204 buf_index = buf_index - discard
206 c = buffer[buf_index]
207 next_pos = next_pos + 1
210 # End inlined: c = self.read_char()
219 elif input_state == 2:
222 elif input_state == 3:
223 cur_line = cur_line + 1
224 cur_line_start = cur_pos = next_pos
227 elif input_state == 4:
230 else: # input_state = 5
232 # End inlined self.next_char()
233 else: # not new_state
235 print("blocked") #TRACE#
236 # Begin inlined: action = self.back_up()
237 if backup_state is not None:
238 (action, cur_pos, cur_line, cur_line_start,
239 cur_char, input_state, next_pos) = backup_state
243 # End inlined: action = self.back_up()
244 self.cur_pos = cur_pos
245 self.cur_line = cur_line
246 self.cur_line_start = cur_line_start
247 self.cur_char = cur_char
248 self.input_state = input_state
249 self.next_pos = next_pos
251 if action is not None: #TRACE#
252 print("Doing %s" % action) #TRACE#
256 input_state = self.input_state
258 print("Scanner: next: %s [%d] %d" % (" "*20, input_state, self.cur_pos))
260 self.cur_pos = self.next_pos
270 elif input_state == 2:
271 self.cur_char = u'\n'
273 elif input_state == 3:
274 self.cur_line = self.cur_line + 1
275 self.cur_line_start = self.cur_pos = self.next_pos
278 elif input_state == 4:
281 else: # input_state = 5
284 print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))
288 Return a tuple (name, line, col) representing the location of
289 the last token read using the read() method. |name| is the
290 name that was provided to the Scanner constructor; |line|
291 is the line number in the stream (1-based); |col| is the
292 position within the line of the first character of the token
295 return (self.name, self.start_line, self.start_col)
def begin(self, state_name):
    """
    Switch the scanner into the lexicon state called |state_name|.

    The initial state for that name is obtained from
    self.lexicon.get_initial_state() and stored in self.initial_state;
    the name itself is recorded in self.state_name.
    """
    # Resolve the state first: if the lookup raises, neither attribute
    # is modified.
    resolved_state = self.lexicon.get_initial_state(state_name)
    self.initial_state = resolved_state
    self.state_name = state_name
303 def produce(self, value, text = None):
305 Called from an action procedure, causes |value| to be returned
306 as the token value from read(). If |text| is supplied, it is
307 returned in place of the scanned text.
309 produce() can be called more than once during a single call to an action
310 procedure, in which case the tokens are queued up and returned one
311 at a time by subsequent calls to read(), until the queue is empty,
312 whereupon scanning resumes.
316 self.queue.append((value, text))
320 Override this method if you want something to be done at