scanner.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 """Copyright (c) 2011, Mark Watkinson
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008     * Redistributions of source code must retain the above copyright
00009       notice, this list of conditions and the following disclaimer.
00010     * Redistributions in binary form must reproduce the above copyright
00011       notice, this list of conditions and the following disclaimer in the
00012       documentation and/or other materials provided with the distribution.
00013     * Neither the name of Scanner nor the
00014       names of its contributors may be used to endorse or promote products
00015       derived from this software without specific prior written permission.
00016 
00017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
00018 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00019 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00020 DISCLAIMED. IN NO EVENT SHALL MARK WATKINSON BE LIABLE FOR ANY
00021 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00022 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00023 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00024 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00025 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00026 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
00027 
00028 
00029 import re
00030 
class Scanner(object):
  """A simple class to aid in lexical analysis of a string.
  Styled after, but not entirely the same as, Ruby's StringScanner class.

  Basic philosophy is simple: Scanner traverses a string left to right,
  consuming the string as it goes. The current position is the string pointer
  Scanner.pos. At each iteration, the caller uses the scanning methods to
  determine what the current piece of string actually is.

  Scanning methods:
    With the exception of get and peek, all scanning methods take a pattern and
    (optionally) flags (e.g. re.X). The patterns are assumed to be either
    strings or compiled regular expression objects (i.e. the result of
    re.compile, or equivalent). If a pattern is not a string but does not
    implement match or search (whichever is being used), a ValueError is raised.
    String patterns are compiled and cached internally.

    The check, scan and skip methods all try to match *at* the current scan
    pointer. check_to, scan_to and skip_to all try to find a match somewhere
    beyond the scan pointer and jump *to* that position. check_until,
    scan_until and skip_until are like *_to, but also consume the match (so
    they jump to the *end* of the match).

    Lookahead:
      check()
      check_to()
      check_until()
      peek()

    Consume:
      get()
      scan()
      scan_to()
      scan_until()
      skip()
      skip_to()
      skip_until()
      skip_bytes()      (convenience wrapper)
      skip_lines()      (convenience wrapper)
      skip_whitespace() (convenience wrapper)

    Note that scan* and check* both return either a string, in the case of a
    match, or None, in the case of no match. If the match exists but is zero
    length, the empty string is returned. Be careful handling this as both
    None and the empty string evaluate to False, but mean very different things.

    peek and get also return the empty string when the end of the stream is
    reached.

    The skip* methods do not record anything in the match history, so the
    "most recent match" methods below and unscan() ignore them.


  Most recent match data:

    matched() -- True/False - was the most recent match a success?

    The following methods all throw Exception if not matched()

    match() -- matched string
    match_len() -- matched string length
    match_pos() -- offset of match

    Wrappers around re.*
    match_info()  -- the re match object
    match_group()
    match_groups()
    match_groupdict()

    pre_match() -- string preceding the match
    post_match() -- string following the match

  Misc:
    pos -- get/set current scan pointer position

    bol() -- beginning of line? (DOS/Unix/Mac aware)
    eol() -- end of line? (DOS/Unix/Mac aware)
    eos() -- end of string?
    rest() -- remaining (unconsumed) string
    rest_len() -- length of remaining string
    unscan() -- revert to previous state

  Setup:
    string -- get/set current source string

    reset() -- reset the scanner ready to start again
    terminate() -- trigger premature finish
  """

  def __init__(self, src=None):
    """Constructor

    Arguments:
    src -- a string to scan. This can be set later via the string property

    """
    self.__src = None
    self.__index = None
    self.__src_len = 0

    # Cached regex objects, keyed by (pattern string, flags).
    self.__regex_cache = {}

    # A list of dicts (at most the two most recent logged match attempts).
    # Each dict has keys: 'index' (position of match), 'pos' (pos pointer at
    # time the match was run), 'text' (text of the match), 'len' (length of the
    # text of the match), and 'matchinfo', as returned by re.search or re.match
    self.__match_history = None

    # Cache for rest(): the position the cached substring was cut at, and the
    # substring itself. Invalidated by reset().
    self.__rest_pos = None
    self.__rest_str = None

    if src is not None:
      # the string setter resets the scanner state itself
      self.string = src
    else:
      self.reset()

  @property
  def pos(self):
    """The current string pointer position."""
    return self.__index

  @pos.setter
  def pos(self, new_pos):
    """Set the string pointer position.

    Arguments:
    new_pos -- The new offset into the string

    Throw Exception if new_pos is out of range (valid range is 0 to the
    length of the string, inclusive)

    """
    if not 0 <= new_pos <= self.__src_len:
      raise Exception('pos set to out of range value')
    self.__index = new_pos

  def eos(self):
    """Return True iff we are at the end of the string, else False."""
    return self.__index >= self.__src_len

  def reset(self):
    """Reset the scanner's state including string pointer and match history."""
    self.pos = 0
    self.__match_history = []
    self.__rest_pos = None
    self.__rest_str = None

  @property
  def string(self):
    """The source string"""
    return self.__src

  @string.setter
  def string(self, s):
    """Set the source string and reset the scanner.

    Throw Exception if s is None
    """
    if s is None:
      raise Exception('Scanner.string called with None')
    self.__src = s
    self.__src_len = len(s)
    self.reset()

  def terminate(self):
    """Set the string pointer to the end of the input and clear the match
    history."""
    self.reset()
    self.pos = self.__src_len

  def bol(self):
    """Return whether or not the scan pointer is immediately after a newline
    character (DOS/Unix/Mac aware), or at the start of the string. """
    if self.__index == 0:
      return True
    prev = self.__src[self.__index - 1]
    # A '\r' only ends a line if it is not the first half of a '\r\n' pair.
    return prev == '\n' or (prev == '\r' and self.peek() != '\n')

  def eol(self):
    """Return whether or not the scan pointer is immediately before a newline
    character (DOS/Unix/Mac aware) or at the end of the string."""
    if self.eos():
      return True
    nxt = self.peek()
    if nxt == '\r':
      return True
    if nxt == '\n':
      # In the middle of a '\r\n' pair the line has already ended at the '\r'.
      return self.__index == 0 or self.__src[self.__index - 1] != '\r'
    return False

  def __match(self, strict=True):
    """ Return the most recent match data.
    Raise Exception if no matches are known (and strict is True, otherwise
    return None).

    This method is used by most of the match_* methods, and the exception
    should be allowed to propagate back to the caller
    """
    if self.__match_history:
      return self.__match_history[-1]
    if strict:
      raise Exception('No matches recorded')
    return None

  def matched(self):
    """Return True if the last match was successful, else False.
    Raise Exception if no match attempts have been recorded."""
    return self.__match()['matchinfo'] is not None

  def __matched_exception(self):
    """raise an exception if the most recent match failed"""
    if not self.matched():
      raise Exception('Cannot access match information: most recent match failed')

  def match(self):
    """Return the last matching string
    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed
    """
    self.__matched_exception()
    return self.__match()['text']

  def match_len(self):
    """Return the length of the last matching string
    This is equivalent to len(scanner.match()).

    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed
    """
    self.__matched_exception()
    return self.__match()['len']

  def match_pos(self):
    """Return the offset into the string of the last match
    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed
    """
    self.__matched_exception()
    return self.__match()['index']

  def __match_info(self, strict=True):
    """Return the most recent match object.
    If the most recent match failed, raise Exception when strict is True,
    otherwise return None.
    Raise Exception if no match attempts have been recorded."""
    m = self.__match()['matchinfo']
    if m is None and strict:
      self.__matched_exception()
    return m

  def match_info(self):
    """Return the most recent match's match object. This is what's returned by
    the re module. Use this if the other methods here don't expose what you
    need.
    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed

    """
    return self.__match_info(True)

  def match_groups(self, default=None):
    """Return the most recent match's groups, this is a wrapper to
    the re match object's groups()

    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed
    """
    return self.__match_info().groups(default)

  def match_groupdict(self, default=None):
    """Return a dict containing group_name => match. This is a wrapper to
    the re match object's groupdict() and as such it only works for named
    groups

    Raise Exception if no match attempts have been recorded.
    Raise Exception if most recent match failed
    """
    return self.__match_info().groupdict(default)

  def match_group(self, *args):
    """Return the contents of the given group in the most recent match.
    This is a wrapper to the re match object's group(); with no arguments
    the whole match (group 0) is returned.
    raise IndexError if the match exists but the group does not
    raise Exception if no match attempts have been recorded
    raise Exception if most recent match failed
    """
    m = self.__match_info()
    if not args:
      args = (0,)
    # throws IndexError, allow it to propagate
    return m.group(*args)

  def pre_match(self):
    """Return the string preceding the last match. This is equivalent
    to:  scanner.string[:scanner.match_pos()]

    raise Exception if no match attempts have been recorded
    raise Exception if most recent match failed
    """
    return self.__src[:self.match_pos()]

  def post_match(self):
    """Return the string following the last match. This is equivalent
    to:  scanner.string[scanner.match_pos() + scanner.match_len() : ]

    raise Exception if no match attempts have been recorded
    raise Exception if most recent match failed
    """
    return self.__src[self.match_pos() + self.match_len():]

  def rest(self):
    """Return the string from the current pointer onwards, i.e. the segment of
    string which has not yet been consumed."""
    # Only re-slice when the pointer has moved since the last call.
    if self.__rest_pos != self.__index:
      self.__rest_str = self.__src[self.__index:]
      self.__rest_pos = self.__index
    return self.__rest_str

  def rest_len(self):
    """Return the length of string remaining.
    This is equivalent to len(rest())"""
    return len(self.rest())

  def unscan(self):
    """Revert the scan pointer to where it was when the most recent logged
    match attempt was made, and drop that attempt from the history.
    Throw Exception if there is no previous known state to restore"""
    if not self.__match_history:
      raise Exception('Cannot unscan, already at earliest point in history')
    m = self.__match_history.pop()
    self.__index = m['pos']

  def __check(self, pattern, flags, consume=False, log=True,
    search_func='match', consume_match=True):
    """ Perform a match and return the matching substring or None

    Arguments:
    pattern -- the regex pattern to look for (as string or compiled)
    flags -- the regex flags to use in the match, as defined in the re module
      (only used when pattern is a string)
    consume -- whether or not to consume the matching string
    log -- whether or not to write to the __match_history
    search_func -- Either 'match' or 'search'. The former looks for matches
      immediately at the beginning of the string pointer, the latter will look
      for matches anywhere after the string pointer.
    consume_match -- whether the returned (and, if consume is True, consumed)
      text includes the text of the match itself as well as what preceded it

    Raise Exception if no source string has been set.
    Raise ValueError if pattern is not a string and has no search_func method.
    """
    if self.__src is None:
      raise Exception('Scanner called with no string set')

    # if the pattern is a string we need to compile it (and cache the result)
    # if it's not a string we assume it's a compiled regular expression
    if isinstance(pattern, str):
      key = (pattern, flags)
      regex = self.__regex_cache.get(key)
      if regex is None:
        regex = re.compile(pattern, flags)
        self.__regex_cache[key] = regex
    else:
      regex = pattern

    try:
      func = getattr(regex, search_func)
    except AttributeError:
      raise ValueError("Object passed as 'pattern' to scan/check/skip does not implement a {0} method".format(search_func))

    start = self.__index
    m = func(self.__src, start)

    substr = None
    substr_len = None
    match_pos = None

    if m is not None:
      # The reported text always begins at the scan pointer: it is whatever
      # precedes the match (empty for 'match'), plus optionally the match.
      match_pos = start
      substr = self.__src[start:m.start(0)]
      if consume_match:
        substr += m.group(0)
      substr_len = len(substr)

    if log:
      entry = {
        'index': match_pos,
        'text': substr,
        'len': substr_len,
        'pos': start,
        'matchinfo': m,
      }
      # keep only the previous entry and this one
      self.__match_history = self.__match_history[-1:] + [entry]

    if consume and m is not None:
      self.__index = match_pos + substr_len

    return substr

  def check(self, pattern, flags=0):
    """Return a match for the pattern (or None) at the scan pointer without
    actually consuming the string
    If the pattern matched but was zero length, the empty string is returned
    If the pattern did not match, None is returned

    """
    return self.__check(pattern, flags)

  def check_to(self, pattern, flags=0):
    """Return all text up until the beginning of the first match for the pattern
    after the scan pointer without consuming the string
    If the match is at the scan pointer, the empty string is returned
    If the pattern did not match, None is returned
    """
    return self.__check(pattern, flags, search_func='search', consume_match=False)

  def check_until(self, pattern, flags=0):
    """Return all text up until the end of the first match for the pattern
    after the scan pointer without consuming the string
    If the pattern matched at the scan pointer but was zero length, the empty
    string is returned
    If the pattern did not match, None is returned
    """
    return self.__check(pattern, flags, consume=False, search_func='search')

  def scan(self, pattern, flags=0):
    """Return a match for the pattern at the scan pointer and consume the
    string.
    Return None if no match is found"""
    return self.__check(pattern, flags, consume=True)

  def scan_to(self, pattern, flags=0):
    """Return all text up until the beginning of the first match for the pattern
    after the scan pointer.
    The pattern is not included in the match.
    The scan pointer will be moved such that it immediately precedes the pattern
    Return None if no match is found"""
    return self.__check(pattern, flags, consume=True, consume_match=False,
      search_func='search')

  def scan_until(self, pattern, flags=0):
    """Return all text from the scan pointer up to and including the first
    match for the pattern, and consume the string up until the end of the
    match.
    Return None if no match is found"""
    return self.__check(pattern, flags, consume=True, search_func='search')

  def skip(self, pattern, flags=0):
    """Scan ahead over the given pattern and return how many characters were
    consumed, or None.
    Similar to scan, but does not return the string or record the match """
    m = self.__check(pattern, flags, log=False, consume=True)
    return None if m is None else len(m)

  def skip_to(self, pattern, flags=0):
    """Scan ahead until the beginning of first occurrence of the given pattern
    and return how many characters were skipped, or None if the match
    failed
    The match is not recorded.
    """
    start = self.__index
    m = self.__check(pattern, flags, log=False, consume=True, consume_match=False,
      search_func='search')
    return None if m is None else self.__index - start

  def skip_until(self, pattern, flags=0):
    """Scan ahead until the end of first occurrence of the given pattern and
    return how many characters were consumed, or None if the match failed
    The match is not recorded
    """
    start = self.__index
    m = self.__check(pattern, flags, log=False, consume=True, search_func='search')
    return None if m is None else self.__index - start

  def skip_lines(self, n=1):
    """ Skip the given number of lines and return the number of lines consumed.

    A line is only consumed if it is terminated by a newline; a trailing
    unterminated line is not skipped. If n is zero or negative nothing is
    consumed and 0 is returned.
    """
    skipped = 0
    while skipped < n:
      # any character (including a newline, due to re.S) which is immediately
      # followed by the start of a line (re.M)
      if not self.skip_until('.^', re.M|re.S):
        break
      skipped += 1
    return skipped

  def skip_bytes(self, n):
    """Skip the given number of characters and return the number actually
    consumed (which is smaller than n if the end of the string is reached)"""
    return len(self.get(n))

  def skip_whitespace(self, n=None, multiline=True):
    """Skip over whitespace characters and return the number of characters
    consumed

    Arguments:
    n -- maximum number of characters to consume (default None)
    multiline -- whether or not to consume newline characters (default True)
    """
    chars = r'\s' if multiline else '[\b\f\t ]'
    chars += ('+' if n is None else '{{,{0}}}'.format(n))
    skipped = self.skip(chars)
    return 0 if skipped is None else skipped

  def peek(self, length=1):
    """Return the given number of characters from the current string pointer
    without consuming them.
    Fewer characters are returned if the end of the stream is reached first;
    at the end of the stream the empty string is returned.
    A zero or negative length yields the empty string."""
    # Clamp so a negative length cannot turn into an end-relative slice index.
    end = self.__index + max(length, 0)
    return self.__src[self.__index:end]

  def get(self, length=1):
    """Return the given number of characters from the current string pointer
    and consume them
    Fewer characters are returned if the end of the stream is reached first;
    at the end of the stream the empty string is returned.
    A zero or negative length consumes nothing and yields the empty string.
    """
    s = self.peek(length)
    self.__index += len(s)
    return s


mtconnect_ros_bridge
Author(s): Stephen L. Wiedmann
autogenerated on Mon Jan 6 2014 11:30:45