00001
00002
00003 """Copyright (c) 2011, Mark Watkinson
00004 All rights reserved.
00005
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 * Redistributions of source code must retain the above copyright
00009 notice, this list of conditions and the following disclaimer.
00010 * Redistributions in binary form must reproduce the above copyright
00011 notice, this list of conditions and the following disclaimer in the
00012 documentation and/or other materials provided with the distribution.
00013 * Neither the name of Scanner nor the
00014 names of its contributors may be used to endorse or promote products
00015 derived from this software without specific prior written permission.
00016
00017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
00018 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00019 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00020 DISCLAIMED. IN NO EVENT SHALL MARK WATKINSON BE LIABLE FOR ANY
00021 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00022 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00023 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00024 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00025 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00026 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
00027
00028
00029 import re
00030
class Scanner(object):
    """A simple class to aid in lexical analysis of a string.
    Styled after, but not entirely the same as, Ruby's StringScanner class.

    Basic philosophy is simple: Scanner traverses a string left to right,
    consuming the string as it goes. The current position is the string
    pointer Scanner.pos. At each iteration, the caller uses the scanning
    methods to determine what the current piece of string actually is.

    Scanning methods:
      With the exception of get and peek, all scanning methods take a pattern
      and (optionally) flags (e.g. re.X). The patterns are assumed to be
      either strings or compiled regular expression objects (i.e. the result
      of re.compile, or equivalent). If a pattern is not a string and does not
      implement match or search (whichever is being used), a ValueError is
      raised. String patterns are compiled and cached internally; flags are
      only applied to string patterns.

      The check, scan and skip methods all try to match *at* the current scan
      pointer. check_to, scan_to and skip_to all try to find a match somewhere
      at or beyond the scan pointer and jump *to* that position. check_until,
      scan_until, and skip_until are like *_to, but also consume the match
      (so they jump to the *end* of the match).

      Lookahead:
        check()
        check_to()
        check_until()
        peek()

      Consume:
        get()
        scan()
        scan_to()
        scan_until()
        skip()
        skip_to()
        skip_until()
        skip_bytes()       (convenience wrapper)
        skip_lines()       (convenience wrapper)
        skip_whitespace()  (convenience wrapper)

      Note that scan* and check* both return either a string, in the case of
      a match, or None, in the case of no match. If the match exists but is
      zero length, the empty string is returned. Be careful handling this as
      both None and the empty string evaluate to False, but mean very
      different things.

      peek and get also return the empty string when the end of the stream is
      reached.

    Most recent match data:

      matched()         -- True/False - was the most recent match a success?

      The following methods all raise Exception if not matched()

      match()           -- matched (returned) string
      match_len()       -- matched string length
      match_pos()       -- offset at which the matched string starts (the
                           scan pointer at the time of the match)

      Wrappers around re.*
        match_info()    -- the match object returned by the re module
        match_group()
        match_groups()
        match_groupdict()

      pre_match()       -- string preceding the match
      post_match()      -- string following the match

    Misc:
      pos               -- get/set current scan pointer position

      bol()             -- beginning of line? (DOS/Unix/Mac aware)
      eol()             -- end of line? (DOS/Unix/Mac aware)
      eos()             -- end of string?
      rest()            -- remaining (unconsumed) string
      rest_len()        -- length of remaining string
      unscan()          -- revert to previous state

    Setup:
      string            -- get/set current source string

      reset()           -- reset the scanner ready to start again
      terminate()       -- trigger premature finish
    """

    def __init__(self, src=None):
        """Constructor

        Arguments:
          src -- a string to scan. This can be set later through the
                 string property.
        """
        self.__src = None
        self.__index = 0
        self.__src_len = 0

        # Compiled patterns, keyed by (pattern string, flags).
        self.__regex_cache = {}

        # The (at most two) most recent logged match records, oldest first.
        self.__match_history = []

        # (position, remaining string) memo used by rest().
        self.__rest_cache = None

        if src is not None:
            # The setter also resets the pointer, history and rest() memo.
            self.string = src

    @property
    def pos(self):
        """The current string pointer position."""
        return self.__index

    @pos.setter
    def pos(self, new_pos):
        """Set the string pointer position.

        Arguments:
          new_pos -- the new offset into the string; valid values run from 0
                     to the length of the string inclusive

        Raise Exception if new_pos is out of range.
        """
        if not 0 <= new_pos <= self.__src_len:
            raise Exception('pos set to out of range value')
        self.__index = new_pos

    def eos(self):
        """Return True iff we are at the end of the string, else False."""
        return self.__index >= self.__src_len

    def reset(self):
        """Reset the scanner's state including string pointer and match
        history."""
        self.pos = 0
        self.__match_history = []
        self.__rest_cache = None

    @property
    def string(self):
        """The source string"""
        return self.__src

    @string.setter
    def string(self, s):
        """Set the source string and reset the scanner.

        Raise Exception if s is None.
        """
        if s is None:
            raise Exception('Scanner.string called with None')
        self.__src = s
        self.__src_len = len(s)
        self.reset()

    def terminate(self):
        """Set the string pointer to the end of the input and clear the match
        history."""
        self.reset()
        self.pos = self.__src_len

    def bol(self):
        """Return whether or not the scan pointer is immediately after a
        newline character (DOS/Unix/Mac aware), or at the start of the
        string."""
        if self.__index == 0:
            return True
        previous = self.__src[self.__index - 1]
        if previous == '\n':
            return True
        # A lone '\r' ends a line; the '\r' of a '\r\n' pair does not.
        return previous == '\r' and self.peek() != '\n'

    def eol(self):
        """Return whether or not the scan pointer is immediately before a
        newline character (DOS/Unix/Mac aware) or at the end of the string."""
        if self.eos():
            return True
        upcoming = self.peek()
        if upcoming == '\r':
            return True
        if upcoming == '\n':
            # Sitting between the two halves of '\r\n' is not end-of-line.
            return self.__index == 0 or self.__src[self.__index - 1] != '\r'
        return False

    def __match(self, strict=True):
        """Return the most recent match record.

        If no match attempts are recorded, raise Exception when strict is
        true, otherwise return None.

        This method is used by most of the match_* methods, and the exception
        should be allowed to propagate back to the caller.
        """
        if self.__match_history:
            return self.__match_history[-1]
        if strict:
            raise Exception('No matches recorded')
        return None

    def matched(self):
        """Return True if the last match was successful, else False.
        Raise Exception if no match attempts have been recorded."""
        return self.__match()['matchinfo'] is not None

    def __matched_exception(self):
        """Raise an exception if the most recent match failed."""
        if not self.matched():
            raise Exception(
                'Cannot access match information: most recent match failed')

    def match(self):
        """Return the last matching string.

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        self.__matched_exception()
        return self.__match()['text']

    def match_len(self):
        """Return the length of the last matching string.
        This is equivalent to len(scanner.match()).

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        self.__matched_exception()
        return self.__match()['len']

    def match_pos(self):
        """Return the offset into the string at which the last matching
        string starts. This is the position of the scan pointer at the time
        the match was made.

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        self.__matched_exception()
        return self.__match()['index']

    def __match_info(self, strict=True):
        """Return the match object of the most recent match record.

        When the most recent match failed, raise Exception if strict is true,
        otherwise return None.
        Raise Exception if no match attempts have been recorded.
        """
        info = self.__match()['matchinfo']
        if info is None and strict:
            self.__matched_exception()
        return info

    def match_info(self):
        """Return the most recent match's match object. This is what's
        returned by the re module. Use this if the other methods here don't
        expose what you need.

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        return self.__match_info(True)

    def match_groups(self, default=None):
        """Return the most recent match's groups; this is a wrapper to the
        match object's groups() method.

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        return self.__match_info().groups(default)

    def match_groupdict(self, default=None):
        """Return a dict containing group_name => match. This is a wrapper to
        the match object's groupdict() method and as such it only works for
        named groups.

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        return self.__match_info().groupdict(default)

    def match_group(self, *args):
        """Return the contents of the given group(s) in the most recent
        match. This is a wrapper to the match object's group() method; with
        no arguments group 0 (the whole match) is returned.

        Raise IndexError if the match exists but the group does not.
        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        info = self.__match_info()
        if not args:
            args = (0,)
        return info.group(*args)

    def pre_match(self):
        """Return the string preceding the last match. This is equivalent
        to: scanner.string[:scanner.match_pos()]

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        return self.__src[:self.match_pos()]

    def post_match(self):
        """Return the string following the last match. This is equivalent
        to: scanner.string[scanner.match_pos() + scanner.match_len():]

        Raise Exception if no match attempts have been recorded.
        Raise Exception if most recent match failed.
        """
        return self.__src[self.match_pos() + self.match_len():]

    def rest(self):
        """Return the string from the current pointer onwards, i.e. the
        segment of string which has not yet been consumed.

        The slice is memoised per pointer position so that repeated calls at
        the same position do not copy the string again.

        Raise Exception if no source string has been set.
        """
        if self.__src is None:
            raise Exception('Scanner called with no string set')
        cached = self.__rest_cache
        if cached is None or cached[0] != self.__index:
            cached = (self.__index, self.__src[self.__index:])
            self.__rest_cache = cached
        return cached[1]

    def rest_len(self):
        """Return the length of string remaining.
        This is equivalent to len(rest())"""
        return len(self.rest())

    def unscan(self):
        """Discard the most recent recorded match attempt and move the scan
        pointer back to where it was when that attempt was made. At most two
        attempts are remembered. Attempts made through the skip* methods are
        not recorded and therefore cannot be undone individually.

        Raise Exception if there is no recorded attempt to revert."""
        if not self.__match_history:
            raise Exception(
                'Cannot unscan, already at earliest point in history')
        record = self.__match_history.pop()
        self.__index = record['pos']

    def __check(self, pattern, flags, consume=False, log=True,
                search_func='match', consume_match=True):
        """Perform a match and return the matching substring or None.

        Arguments:
          pattern -- the regex pattern to look for (as string or compiled)
          flags -- the regex flags to use in the match, as defined in the re
            module. Only used when pattern is a string.
          consume -- whether or not to consume the matching string
          log -- whether or not to write to the match history
          search_func -- either 'match' or 'search'. The former looks for
            matches immediately at the string pointer, the latter will look
            for matches anywhere from the string pointer onwards.
          consume_match -- whether the text of the match itself is included
            in the result (and consumed, if consume is True) in addition to
            whatever preceded it.

        Raise Exception if no source string has been set.
        Raise ValueError if pattern is not a string and has no method named
        by search_func.
        """
        if self.__src is None:
            raise Exception('Scanner called with no string set')

        if isinstance(pattern, str):
            key = (pattern, flags)
            regex = self.__regex_cache.get(key)
            if regex is None:
                regex = re.compile(pattern, flags)
                self.__regex_cache[key] = regex
        else:
            regex = pattern

        try:
            func = getattr(regex, search_func)
        except AttributeError:
            raise ValueError("Object passed as 'pattern' to scan/check/skip does not implement a {0} method".format(search_func))

        start = self.__index
        info = func(self.__src, start)
        found = info is not None

        text = None
        length = None
        if found:
            # Everything between the pointer and the match (empty when the
            # match begins at the pointer), optionally plus the match itself.
            text = self.__src[start:info.start(0)]
            if consume_match:
                text += info.group(0)
            length = len(text)

        if log:
            record = {
                'index': start if found else None,
                'text': text,
                'len': length,
                'pos': start,
                'matchinfo': info,
            }
            # Keep only the previous record alongside the new one.
            self.__match_history = self.__match_history[-1:] + [record]

        if consume and found:
            self.__index = start + length

        return text

    def check(self, pattern, flags=0):
        """Return a match for the pattern (or None) at the scan pointer
        without actually consuming the string.
        If the pattern matched but was zero length, the empty string is
        returned.
        If the pattern did not match, None is returned.
        """
        return self.__check(pattern, flags)

    def check_to(self, pattern, flags=0):
        """Return all text up until the beginning of the first match for the
        pattern after the scan pointer without consuming the string.
        If the pattern matched at the scan pointer, the empty string is
        returned.
        If the pattern did not match, None is returned.
        """
        return self.__check(pattern, flags, search_func='search',
                            consume_match=False)

    def check_until(self, pattern, flags=0):
        """Return all text up until the end of the first match for the
        pattern after the scan pointer without consuming the string.
        If the result is zero length, the empty string is returned.
        If the pattern did not match, None is returned.
        """
        return self.__check(pattern, flags, consume=False,
                            search_func='search')

    def scan(self, pattern, flags=0):
        """Return a match for the pattern at the scan pointer and consume
        the string.
        Return None if no match is found."""
        return self.__check(pattern, flags, consume=True)

    def scan_to(self, pattern, flags=0):
        """Return all text up until the beginning of the first match for the
        pattern after the scan pointer.
        The pattern is not included in the match.
        The scan pointer will be moved such that it immediately precedes the
        pattern.
        Return None if no match is found."""
        return self.__check(pattern, flags, consume=True,
                            consume_match=False, search_func='search')

    def scan_until(self, pattern, flags=0):
        """Return all text up until the end of the first match for the
        pattern after the scan pointer and consume it.
        Return None if no match is found."""
        return self.__check(pattern, flags, consume=True,
                            search_func='search')

    def skip(self, pattern, flags=0):
        """Scan ahead over the given pattern and return how many characters
        were consumed, or None.
        Similar to scan, but does not return the string or record the
        match."""
        text = self.__check(pattern, flags, log=False, consume=True)
        return None if text is None else len(text)

    def skip_to(self, pattern, flags=0):
        """Scan ahead until the beginning of the first occurrence of the
        given pattern and return how many characters were skipped, or None
        if the match failed.
        The match is not recorded.
        """
        start = self.__index
        text = self.__check(pattern, flags, log=False, consume=True,
                            consume_match=False, search_func='search')
        return None if text is None else self.__index - start

    def skip_until(self, pattern, flags=0):
        """Scan ahead until the end of the first occurrence of the given
        pattern and return how many characters were consumed, or None if the
        match failed.
        The match is not recorded.
        """
        start = self.__index
        text = self.__check(pattern, flags, log=False, consume=True,
                            search_func='search')
        return None if text is None else self.__index - start

    def skip_lines(self, n=1):
        """Skip up to n lines and return the number of lines consumed.

        A line is consumed up to and including its terminating '\\n'. A final
        line without a trailing '\\n' is not consumed. If n is zero or
        negative nothing is consumed and 0 is returned.
        """
        skipped = 0
        while skipped < n:
            # '.^' with re.M|re.S: one character followed by a start-of-line
            # position, i.e. the next '\n'.
            if not self.skip_until('.^', re.M | re.S):
                break
            skipped += 1
        return skipped

    def skip_bytes(self, n):
        """Skip the given number of characters and return the number
        actually consumed (fewer than n near the end of the string)."""
        return len(self.get(n))

    def skip_whitespace(self, n=None, multiline=True):
        """Skip over whitespace characters and return the number of
        characters consumed.

        Arguments:
          n -- maximum number of characters to consume (default None,
               meaning no limit)
          multiline -- whether or not to consume newline characters
               (default True)
        """
        chars = r'\s' if multiline else '[\b\f\t ]'
        chars += ('+' if n is None else '{{,{0}}}'.format(n))
        skipped = self.skip(chars)
        return 0 if skipped is None else skipped

    def peek(self, length=1):
        """Return the given number of characters from the current string
        pointer without consuming them.
        Fewer characters are returned near the end of the string; at the end
        of the string, or if length is not positive, the empty string is
        returned.

        Raise Exception if no source string has been set.
        """
        if self.__src is None:
            raise Exception('Scanner called with no string set')
        # Clamp so that a negative length cannot turn into a slice end index
        # that wraps around from the end of the string.
        end = self.__index + max(length, 0)
        return self.__src[self.__index:end]

    def get(self, length=1):
        """Return the given number of characters from the current string
        pointer and consume them.
        Fewer characters are returned near the end of the string; at the end
        of the string, or if length is not positive, the empty string is
        returned and nothing is consumed.

        Raise Exception if no source string has been set.
        """
        text = self.peek(length)
        self.__index += len(text)
        return text