streams.py
00001 """ANTLR3 runtime package"""
00002 
00003 # begin[licence]
00004 #
00005 # [The "BSD licence"]
00006 # Copyright (c) 2005-2008 Terence Parr
00007 # All rights reserved.
00008 #
00009 # Redistribution and use in source and binary forms, with or without
00010 # modification, are permitted provided that the following conditions
00011 # are met:
00012 # 1. Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 # 2. Redistributions in binary form must reproduce the above copyright
00015 #    notice, this list of conditions and the following disclaimer in the
00016 #    documentation and/or other materials provided with the distribution.
00017 # 3. The name of the author may not be used to endorse or promote products
00018 #    derived from this software without specific prior written permission.
00019 #
00020 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00021 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00022 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00023 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00024 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00025 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00026 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00027 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00028 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00029 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 #
00031 # end[licence]
00032 
00033 import codecs
00034 from StringIO import StringIO
00035 
00036 from constants import DEFAULT_CHANNEL, EOF
00037 from tokens import Token, EOF_TOKEN
00038 
00039 
00040 ############################################################################
00041 #
00042 # basic interfaces
00043 #   IntStream
00044 #    +- CharStream
00045 #    \- TokenStream
00046 #
# subclasses must implement all methods
#
############################################################################

class IntStream(object):
    """
    @brief Base interface for streams of integer values.

    A simple stream of integers used when all I care about is the char
    or token type sequence (such as interpretation).
    """

    def consume(self):
        raise NotImplementedError


    def LA(self, i):
        """Get int at current input pointer + i ahead where i=1 is next int.

        Negative indexes are allowed.  LA(-1) is previous token (token
        just matched).  LA(-i) where i is before first token should
        yield -1, invalid char / EOF.
        """

        raise NotImplementedError

    def mark(self):
        """
        Tell the stream to start buffering if it hasn't already.  Return
        current input position, index(), or some other marker so that
        when passed to rewind() you get back to the same spot.
        rewind(mark()) should not affect the input cursor.  The Lexer
        tracks line/col info as well as input index so its markers are
        not pure input indexes.  Same for tree node streams.
        """

        raise NotImplementedError


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the symbol about to be
        read, not the most recently read symbol.
        """

        raise NotImplementedError


    def rewind(self, marker=None):
        """
        Reset the stream so that next call to index would return marker.
        The marker will usually be index() but it doesn't have to be.  It's
        just a marker to indicate what state the stream was in.  This is
        essentially calling release() and seek().  If there are markers
        created after this marker argument, this routine must unroll them
        like a stack.  Assume the state the stream was in when this marker
        was created.

        If marker is None:
        Rewind to the input position of the last marker.
        Used currently only after a cyclic DFA and just
        before starting a sem/syn predicate to get the
        input position back to the start of the decision.
        Do not "pop" the marker off the state.  mark(i)
        and rewind(i) should still balance.  It is
        like invoking rewind(last marker) but it should not "pop"
        the marker off.  It's like seek(last marker's input position).
        """

        raise NotImplementedError


    def release(self, marker=None):
        """
        You may want to commit to a backtrack but don't want to force the
        stream to keep bookkeeping objects around for a marker that is
        no longer necessary.  This will have the same behavior as
        rewind() except it releases resources without the backward seek.
        This must throw away resources for all markers back to the marker
        argument.  So if you're nested 5 levels of mark(), and then release(2)
        you have to release resources for depths 2..5.
        """

        raise NotImplementedError


    def seek(self, index):
        """
        Set the input cursor to the position indicated by index.  This is
        normally used to seek ahead in the input stream.  No buffering is
        required to do this unless you know your stream will use seek to
        move backwards such as when backtracking.

        This is different from rewind in its multi-directional
        requirement and in that its argument is strictly an input cursor
        (index).

        For char streams, seeking forward must update the stream state such
        as line number.  For seeking backwards, you will presumably be
        backtracking using the mark/rewind mechanism that restores state and
        so this method does not need to update state when seeking backwards.

        Currently, this method is only used for efficient backtracking using
        memoization, but in the future it may be used for incremental parsing.

        The index is 0..n-1.  A seek to position i means that LA(1) will
        return the ith symbol.  So, seeking to 0 means LA(1) will return the
        first element in the stream.
        """

        raise NotImplementedError


    def size(self):
        """
        Only makes sense for streams that buffer everything up, but it
        might be useful to display the entire stream or for testing.  This
        value includes a single EOF.
        """

        raise NotImplementedError


    def getSourceName(self):
        """
        Where are you getting symbols from?  Normally, implementations will
        pass the buck all the way to the lexer who can ask its input stream
        for the file name or whatever.
        """

        raise NotImplementedError


class CharStream(IntStream):
    """
    @brief A source of characters for an ANTLR lexer.

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    EOF = -1


    def substring(self, start, stop):
        """
        For infinite streams, you don't need this; primarily I'm providing
        a useful interface for action code.  Just make sure actions don't
        use this on streams that don't support it.
        """

        raise NotImplementedError


    def LT(self, i):
        """
        Get the ith character of lookahead.  This is usually the same as
        LA(i).  This will be used for labels in the generated
        lexer code.  I'd prefer to return a char here type-wise, but it's
        probably better to be 32-bit clean and be consistent with LA.
        """

        raise NotImplementedError


    def getLine(self):
        """ANTLR tracks the line information automatically"""

        raise NotImplementedError


    def setLine(self, line):
        """
        Because this stream can rewind, we need to be able to reset the line
        """

        raise NotImplementedError


    def getCharPositionInLine(self):
        """
        The index of the character relative to the beginning of the line 0..n-1
        """

        raise NotImplementedError


    def setCharPositionInLine(self, pos):
        raise NotImplementedError


class TokenStream(IntStream):
    """
    @brief A stream of tokens accessing tokens from a TokenSource

    This is an abstract class that must be implemented by a subclass.

    """

    # pylint does not realize that this is an interface, too
    #pylint: disable-msg=W0223

    def LT(self, k):
        """
        Get the Token at current input pointer + k ahead where k=1 is the
        next Token.  k<0 indicates tokens in the past.  So -1 is the
        previous token and -2 is two tokens ago.  LT(0) is undefined.  For
        k>=n, return EOF_TOKEN.  Return None for LT(0) and any index that
        results in an absolute address that is negative.
        """

        raise NotImplementedError


    def get(self, i):
        """
        Get a token at an absolute index i; 0..n-1.  This is really only
        needed for profiling and debugging and token stream rewriting.
        If you don't want to buffer up tokens, then this method makes no
        sense for you.  Naturally you can't use the rewrite stream feature.
        I believe DebugTokenStream can easily be altered to not use
        this method, removing the dependency.
        """

        raise NotImplementedError


    def getTokenSource(self):
        """
        Where is this stream pulling tokens from?  This is not the name, but
        the object that provides Token objects.
        """

        raise NotImplementedError


    def toString(self, start=None, stop=None):
        """
        Return the text of all tokens from start to stop, inclusive.
        If the stream does not buffer all the tokens then it can just
        return "" or None; users should of course not access
        $ruleLabel.text in an action in that case.

        Because the user is not required to use a token with an index stored
        in it, we must provide a means for two token objects themselves to
        indicate the start/end location.  Most often this will just delegate
        to the other toString(int,int).  This is also parallel with
        the TreeNodeStream.toString(Object,Object).
        """

        raise NotImplementedError


############################################################################
#
# character streams for use in lexers
#   CharStream
#   \- ANTLRStringStream
#
############################################################################


class ANTLRStringStream(CharStream):
    """
    @brief CharStream that pulls data from a unicode string.

    A pretty quick CharStream that pulls all data from an array
    directly.  Every method call counts in the lexer.

    """


    def __init__(self, data):
        """
        @param data This should be a unicode string holding the data you want
           to parse. If you pass in a byte string, the Lexer will choke on
           non-ascii data.

        """

        CharStream.__init__(self)

        # The data being scanned
        self.strdata = unicode(data)
        self.data = [ord(c) for c in self.strdata]

        # How many characters are actually in the buffer
        self.n = len(data)

        # 0..n-1 index into string of next char
        self.p = 0

        # line number 1..n within the input
        self.line = 1

        # The index of the character relative to the beginning of the
        # line 0..n-1
        self.charPositionInLine = 0

        # A list of CharStreamState objects that tracks the stream state
        # values line, charPositionInLine, and p that can change as you
        # move through the input stream.  Indexed from 0..markDepth-1.
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0

        # What is the name or source of this char stream?
        self.name = None


    def reset(self):
        """
        Reset the stream so that it's in the same state it was
        when the object was created *except* the data array is not
        touched.
        """

        self.p = 0
        self.line = 1
        self.charPositionInLine = 0
        self._markers = [ ]


    def consume(self):
        try:
            if self.data[self.p] == 10: # \n
                self.line += 1
                self.charPositionInLine = 0
            else:
                self.charPositionInLine += 1

            self.p += 1

        except IndexError:
            # happens when we have reached EOF and self.data[self.p] fails;
            # just do nothing
            pass


    def LA(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        try:
            return self.data[self.p+i-1]
        except IndexError:
            return EOF


    def LT(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0; then data[p+0-1]

        try:
            return self.strdata[self.p+i-1]
        except IndexError:
            return EOF


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read.  The index is the index of the char to
        be returned from LA(1).
        """

        return self.p


    def size(self):
        return self.n


    def mark(self):
        state = (self.p, self.line, self.charPositionInLine)
        try:
            self._markers[self.markDepth] = state
        except IndexError:
            self._markers.append(state)
        self.markDepth += 1

        self.lastMarker = self.markDepth

        return self.lastMarker


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        p, line, charPositionInLine = self._markers[marker-1]

        self.seek(p)
        self.line = line
        self.charPositionInLine = charPositionInLine
        self.release(marker)


    def release(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.markDepth = marker-1


    def seek(self, index):
        """
        consume() ahead until p==index; can't just set p=index as we must
        update line and charPositionInLine.
        """

        if index <= self.p:
            self.p = index # just jump; don't update stream state (line, ...)
            return

        # seek forward, consume until p hits index
        while self.p < index:
            self.consume()


    def substring(self, start, stop):
        return self.strdata[start:stop+1]


    def getLine(self):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        return self.line


    def getCharPositionInLine(self):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        return self.charPositionInLine


    def setLine(self, line):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        self.line = line


    def setCharPositionInLine(self, pos):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        self.charPositionInLine = pos


    def getSourceName(self):
        return self.name


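# --- Editorial sketch, not part of the original runtime -------------------
# A hypothetical demo of the seek() semantics documented above: seeking
# forward consume()s characters so that line/column bookkeeping stays
# correct, while seeking backward just jumps, on the assumption that
# mark()/rewind() already restored the state.

def _seek_demo():
    stream = ANTLRStringStream(u'ab\ncd')

    stream.seek(3)   # forward: consumes 'a', 'b' and '\n'
    assert stream.line == 2 and stream.charPositionInLine == 0

    stream.seek(1)   # backward: plain jump, line/column are left untouched
    assert stream.index() == 1 and stream.line == 2

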
class ANTLRFileStream(ANTLRStringStream):
    """
    @brief CharStream that opens a file to read the data.

    This is a char buffer stream that is loaded from a file
    all at once when you construct the object.
    """

    def __init__(self, fileName, encoding=None):
        """
        @param fileName The path to the file to be opened. The file will be
           opened with mode 'rb'.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.

        """

        self.fileName = fileName

        fp = codecs.open(fileName, 'rb', encoding)
        try:
            data = fp.read()
        finally:
            fp.close()

        ANTLRStringStream.__init__(self, data)


    def getSourceName(self):
        """Deprecated, access o.fileName directly."""

        return self.fileName


class ANTLRInputStream(ANTLRStringStream):
    """
    @brief CharStream that reads data from a file-like object.

    This is a char buffer stream that is loaded from a file-like object
    all at once when you construct the object.

    All input is consumed from the file, but it is not closed.
    """

    def __init__(self, file, encoding=None):
        """
        @param file A file-like object holding your input. Only the read()
           method must be implemented.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.

        """

        if encoding is not None:
            # wrap input in a decoding reader
            reader = codecs.lookup(encoding)[2]
            file = reader(file)

        data = file.read()

        ANTLRStringStream.__init__(self, data)


# I guess the ANTLR prefix exists only to avoid a name clash with some Java
# mumbojumbo.  A plain "StringStream" looks better to me and is the
# preferred name in Python.
StringStream = ANTLRStringStream
FileStream = ANTLRFileStream
InputStream = ANTLRInputStream


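# --- Editorial sketch, not part of the original runtime -------------------
# A minimal tour of the CharStream API on a StringStream: LA()/LT() look
# ahead without consuming (LA returns integer char codes, LT characters),
# mark()/rewind() implement backtracking, and substring() slices the
# original text with an inclusive stop index.

def _charstream_demo():
    stream = StringStream(u'hello\nworld')

    assert stream.LT(1) == u'h'        # 1-based lookahead, nothing consumed
    assert stream.LA(1) == ord('h')    # LA is the integer code of LT

    marker = stream.mark()             # remember (p, line, column)
    for _ in range(6):                 # consume 'hello\n'
        stream.consume()
    assert stream.line == 2 and stream.charPositionInLine == 0

    stream.rewind(marker)              # back to the marked position
    assert stream.index() == 0 and stream.line == 1

    assert stream.substring(0, 4) == u'hello'  # stop index is inclusive

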
############################################################################
#
# Token streams
#   TokenStream
#   +- CommonTokenStream
#   \- TokenRewriteStream
#
############################################################################


class CommonTokenStream(TokenStream):
    """
    @brief The most common stream of tokens

    The most common stream of tokens is one where every token is buffered up
    and tokens are prefiltered for a certain channel (the parser will only
    see these tokens and cannot change the filter channel number during the
    parse).
    """

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        """
        @param tokenSource A TokenSource instance (usually a Lexer) to pull
            the tokens from.

        @param channel Skip tokens on any channel but this one; this is how we
            skip whitespace...

        """

        TokenStream.__init__(self)

        self.tokenSource = tokenSource

        # Record every single token pulled from the source so we can reproduce
        # chunks of it later.
        self.tokens = []

        # Map<tokentype, channel> to override some Tokens' channel numbers
        self.channelOverrideMap = {}

        # Set<tokentype>; discard any tokens with this type
        self.discardSet = set()

        # Skip tokens on any channel but this one; this is how we skip whitespace...
        self.channel = channel

        # By default, track all incoming tokens
        self.discardOffChannelTokens = False

        # The index into the tokens list of the current token (next token
        # to consume).  p==-1 indicates that the tokens list is empty
        self.p = -1

        # Remember last marked position
        self.lastMarker = None


    def setTokenSource(self, tokenSource):
        """Reset this token stream by setting its token source."""

        self.tokenSource = tokenSource
        self.tokens = []
        self.p = -1
        self.channel = DEFAULT_CHANNEL


    def reset(self):
        self.p = 0
        self.lastMarker = None


    def fillBuffer(self):
        """
        Load all tokens from the token source and put them in self.tokens.
        This is done upon first LT request because you might want to
        set some token type / channel overrides before filling the buffer.
        """

        index = 0
        t = self.tokenSource.nextToken()
        while t is not None and t.type != EOF:
            discard = False

            if self.discardSet is not None and t.type in self.discardSet:
                discard = True

            elif self.discardOffChannelTokens and t.channel != self.channel:
                discard = True

            # is there a channel override for token type?
            try:
                overrideChannel = self.channelOverrideMap[t.type]

            except KeyError:
                # no override for this type
                pass

            else:
                if overrideChannel == self.channel:
                    t.channel = overrideChannel
                else:
                    discard = True

            if not discard:
                t.index = index
                self.tokens.append(t)
                index += 1

            t = self.tokenSource.nextToken()

        # leave p pointing at the first on-channel token
        self.p = 0
        self.p = self.skipOffTokenChannels(self.p)


    def consume(self):
        """
        Move the input pointer to the next incoming token.  The stream
        must become active with LT(1) available.  consume() simply
        moves the input pointer so that LT(1) points at the next
        input symbol.  Consume at least one token.

        Walk past any token not on the channel the parser is listening to.
        """

        if self.p < len(self.tokens):
            self.p += 1

            self.p = self.skipOffTokenChannels(self.p) # leave p on valid token


    def skipOffTokenChannels(self, i):
        """
        Given a starting index, return the index of the first on-channel
        token.
        """

        try:
            while self.tokens[i].channel != self.channel:
                i += 1
        except IndexError:
            # hit the end of token stream
            pass

        return i


    def skipOffTokenChannelsReverse(self, i):
        while i >= 0 and self.tokens[i].channel != self.channel:
            i -= 1

        return i


    def setTokenTypeChannel(self, ttype, channel):
        """
        A simple filter mechanism whereby you can tell this token stream
        to force all tokens of type ttype to be on channel.  For example,
        when interpreting, we cannot exec actions so we need to tell
        the stream to force all WS and NEWLINE to be a different, ignored
        channel.
        """

        self.channelOverrideMap[ttype] = channel


    def discardTokenType(self, ttype):
        self.discardSet.add(ttype)


    def getTokens(self, start=None, stop=None, types=None):
        """
        Given a start and stop index, return a list of all tokens in
        the token type set.  Return None if no tokens were found.  This
        method looks at both on and off channel tokens.
        """

        if self.p == -1:
            self.fillBuffer()

        if stop is None or stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        if start is None or start < 0:
            start = 0

        if start > stop:
            return None

        if isinstance(types, (int, long)):
            # called with a single type, wrap into set
            types = set([types])

        filteredTokens = [
            token for token in self.tokens[start:stop+1] # stop is inclusive
            if types is None or token.type in types
            ]

        if len(filteredTokens) == 0:
            return None

        return filteredTokens


    def LT(self, k):
        """
        Get the kth token from the current position 1..n where k=1 is the
        first symbol of lookahead.
        """

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if k < 0:
            return self.LB(-k)

        i = self.p
        n = 1
        # find k good tokens
        while n < k:
            # skip off-channel tokens
            i = self.skipOffTokenChannels(i+1) # leave p on valid token
            n += 1

        try:
            return self.tokens[i]
        except IndexError:
            return EOF_TOKEN


    def LB(self, k):
        """Look backwards k on-channel tokens."""

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if self.p - k < 0:
            return None

        i = self.p
        n = 1
        # find k good tokens looking backwards
        while n <= k:
            # skip off-channel tokens
            i = self.skipOffTokenChannelsReverse(i-1) # leave p on valid token
            n += 1

        if i < 0:
            return None

        return self.tokens[i]


    def get(self, i):
        """
        Return absolute token i; ignore which channel the tokens are on;
        that is, count all tokens not just on-channel tokens.
        """

        return self.tokens[i]


    def LA(self, i):
        return self.LT(i).type


    def mark(self):
        self.lastMarker = self.index()
        return self.lastMarker


    def release(self, marker=None):
        # no resources to release
        pass


    def size(self):
        return len(self.tokens)


    def index(self):
        return self.p


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.seek(marker)


    def seek(self, index):
        self.p = index


    def getTokenSource(self):
        return self.tokenSource


    def getSourceName(self):
        return self.tokenSource.getSourceName()


    def toString(self, start=None, stop=None):
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = 0
        elif not isinstance(start, int):
            start = start.index

        if stop is None:
            stop = len(self.tokens) - 1
        elif not isinstance(stop, int):
            stop = stop.index

        if stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        return ''.join([t.text for t in self.tokens[start:stop+1]])


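# --- Editorial sketch, not part of the original runtime -------------------
# It shows how CommonTokenStream buffers every token from its source on the
# first LT() call and then hides off-channel tokens (e.g. whitespace) from
# the parser.  _StubToken and _ListTokenSource are hypothetical stand-ins
# for a real Lexer; they provide only the attributes the stream touches.

class _StubToken(object):
    def __init__(self, type, text, channel=DEFAULT_CHANNEL):
        self.type = type
        self.text = text
        self.channel = channel
        self.index = -1


class _ListTokenSource(object):
    """Yield a fixed list of tokens, then EOF forever."""

    def __init__(self, tokens):
        self._iter = iter(tokens)

    def nextToken(self):
        try:
            return self._iter.next()
        except StopIteration:
            return EOF_TOKEN


def _tokenstream_demo():
    HIDDEN = DEFAULT_CHANNEL + 99            # any non-default channel number
    source = _ListTokenSource([
        _StubToken(4, 'a'),
        _StubToken(5, ' ', channel=HIDDEN),  # off-channel, invisible to LT/LA
        _StubToken(4, 'b'),
        ])
    stream = CommonTokenStream(source)

    assert stream.LT(1).text == 'a'     # first LT() triggers fillBuffer()
    assert stream.LT(2).text == 'b'     # the whitespace token is skipped
    stream.consume()
    assert stream.LA(1) == 4            # LA(1) is the type of LT(1)
    assert stream.toString() == 'a b'   # but the buffer keeps all tokens

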
class RewriteOperation(object):
    """@brief Internal helper class."""

    def __init__(self, stream, index, text):
        self.stream = stream
        self.index = index
        self.text = text

    def execute(self, buf):
        """Execute the rewrite operation by possibly adding to the buffer.
        Return the index of the next token to operate on.
        """

        return self.index

    def toString(self):
        opName = self.__class__.__name__
        return '<%s@%d:"%s">' % (opName, self.index, self.text)

    __str__ = toString
    __repr__ = toString


class InsertBeforeOp(RewriteOperation):
    """@brief Internal helper class."""

    def execute(self, buf):
        buf.write(self.text)
        buf.write(self.stream.tokens[self.index].text)
        return self.index + 1


class ReplaceOp(RewriteOperation):
    """
    @brief Internal helper class.

    I'm going to try replacing range from x..y with (y-x)+1 ReplaceOp
    instructions.
    """

    def __init__(self, stream, first, last, text):
        RewriteOperation.__init__(self, stream, first, text)
        self.lastIndex = last


    def execute(self, buf):
        if self.text is not None:
            buf.write(self.text)

        return self.lastIndex + 1


    def toString(self):
        return '<ReplaceOp@%d..%d:"%s">' % (
            self.index, self.lastIndex, self.text)

    __str__ = toString
    __repr__ = toString


class DeleteOp(ReplaceOp):
    """
    @brief Internal helper class.
    """

    def __init__(self, stream, first, last):
        ReplaceOp.__init__(self, stream, first, last, None)


    def toString(self):
        return '<DeleteOp@%d..%d>' % (self.index, self.lastIndex)

    __str__ = toString
    __repr__ = toString


class TokenRewriteStream(CommonTokenStream):
    """@brief CommonTokenStream that can be modified.

    Useful for dumping out the input stream after doing some
    augmentation or other manipulations.

    You can insert stuff, replace, and delete chunks.  Note that the
    operations are done lazily--only if you convert the buffer to a
    String.  This is very efficient because you are not moving data around
    all the time.  As the buffer of tokens is converted to strings, the
    toString() method(s) check to see if there is an operation at the
    current index.  If so, the operation is done and then normal String
    rendering continues on the buffer.  This is like having multiple Turing
    machine instruction streams (programs) operating on a single input tape. :)

    Since the operations are done lazily at toString-time, operations do not
    screw up the token index values.  That is, an insert operation at token
    index i does not change the index values for tokens i+1..n-1.

    Because operations never actually alter the buffer, you may always get
    the original token stream back without undoing anything.  Since
    the instructions are queued up, you can easily simulate transactions and
    roll back any changes if there is an error just by removing instructions.
    For example (in Java),

     CharStream input = new ANTLRFileStream("input");
     TLexer lex = new TLexer(input);
     TokenRewriteStream tokens = new TokenRewriteStream(lex);
     T parser = new T(tokens);
     parser.startRule();

    Then in the rules, you can execute
        Token t,u;
        ...
        input.insertAfter(t, "text to put after t");
        input.insertAfter(u, "text after u");
        System.out.println(tokens.toString());

    Actually, you have to cast the 'input' to a TokenRewriteStream. :(

    You can also have multiple "instruction streams" and get multiple
    rewrites from a single pass over the input.  Just name the instruction
    streams and use that name again when printing the buffer.  This could be
    useful for generating a C file and also its header file--all from the
    same buffer:

        tokens.insertAfter("pass1", t, "text to put after t");
        tokens.insertAfter("pass2", u, "text after u");
        System.out.println(tokens.toString("pass1"));
        System.out.println(tokens.toString("pass2"));

    If you don't use named rewrite streams, a "default" stream is used as
    the first example shows.
    """

    DEFAULT_PROGRAM_NAME = "default"
    MIN_TOKEN_INDEX = 0

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        CommonTokenStream.__init__(self, tokenSource, channel)

        # You may have multiple, named streams of rewrite operations.
        # I'm calling these things "programs."
        #  Maps String (name) -> rewrite (List)
        self.programs = {}
        self.programs[self.DEFAULT_PROGRAM_NAME] = []

        # Map String (program name) -> Integer index
        self.lastRewriteTokenIndexes = {}


    def rollback(self, *args):
        """
        Rollback the instruction stream for a program so that
        the indicated instruction (via instructionIndex) is no
        longer in the stream.  UNTESTED!
        """

        if len(args) == 2:
            programName = args[0]
            instructionIndex = args[1]
        elif len(args) == 1:
            programName = self.DEFAULT_PROGRAM_NAME
            instructionIndex = args[0]
        else:
            raise TypeError("Invalid arguments")

        p = self.programs.get(programName, None)
        if p is not None:
            self.programs[programName] = (
                p[self.MIN_TOKEN_INDEX:instructionIndex])


    def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME):
        """Reset the program so that no instructions exist"""

        self.rollback(programName, self.MIN_TOKEN_INDEX)


    def insertAfter(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        # to insert after, just insert before next index (even if past end)
        self.insertBefore(programName, index+1, text)


    def insertBefore(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        op = InsertBeforeOp(self, index, text)
        rewrites = self.getProgram(programName)
        rewrites.append(op)


    def replace(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[1]
            text = args[2]

        elif len(args) == 4:
            programName = args[0]
            first = args[1]
            last = args[2]
            text = args[3]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(first, Token):
            # first is a Token, grab the stream index from it
            first = first.index

        if isinstance(last, Token):
            # last is a Token, grab the stream index from it
            last = last.index

        if first > last or first < 0 or last < 0 or last >= len(self.tokens):
            raise ValueError(
                "replace: range invalid: %s..%s (size=%d)"
                % (first, last, len(self.tokens)))

        op = ReplaceOp(self, first, last, text)
        rewrites = self.getProgram(programName)
        rewrites.append(op)


    def delete(self, *args):
        self.replace(*(list(args) + [None]))


    def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME):
        return self.lastRewriteTokenIndexes.get(programName, -1)


    def setLastRewriteTokenIndex(self, programName, i):
        self.lastRewriteTokenIndexes[programName] = i


    def getProgram(self, name):
        p = self.programs.get(name, None)
        if p is None:
            p = self.initializeProgram(name)

        return p


    def initializeProgram(self, name):
        p = []
        self.programs[name] = p
        return p


    def toOriginalString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            buf.write(self.get(i).text)
            i += 1

        return buf.getvalue()


    def toString(self, *args):
        if len(args) == 0:
            programName = self.DEFAULT_PROGRAM_NAME
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 1:
            programName = args[0]
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            start = args[0]
            end = args[1]

        else:
            raise TypeError("Invalid arguments")

        if start is None:
            start = self.MIN_TOKEN_INDEX
        elif not isinstance(start, int):
            start = start.index

        if end is None:
            end = len(self.tokens) - 1
        elif not isinstance(end, int):
            end = end.index

        # ensure start/end are in range
        if end >= len(self.tokens):
            end = len(self.tokens) - 1

        if start < 0:
            start = 0

        rewrites = self.programs.get(programName)
        if rewrites is None or len(rewrites) == 0:
            # no instructions to execute
            return self.toOriginalString(start, end)

        buf = StringIO()

        # First, optimize instruction stream
        indexToOp = self.reduceToSingleOperationPerIndex(rewrites)

        # Walk buffer, executing instructions and emitting tokens
        i = start
        while i <= end and i < len(self.tokens):
            op = indexToOp.get(i)
            # remove so any ops left after the loop are past the last token
            try:
                del indexToOp[i]
            except KeyError:
                pass

            t = self.tokens[i]
            if op is None:
                # no operation at that index, just dump token
                buf.write(t.text)
                i += 1 # move to next token

            else:
                i = op.execute(buf) # execute operation and skip

        # include stuff after end if it's the last index in the buffer
        # So, if they did an insertAfter(lastValidIndex, "foo"), include
        # foo if end==lastValidIndex.
        if end == len(self.tokens) - 1:
            # Scan any remaining operations after the last token; they
            # should be included (they will be inserts).
            for i in sorted(indexToOp.keys()):
                op = indexToOp[i]
                if op.index >= len(self.tokens)-1:
                    buf.write(op.text)

        return buf.getvalue()

    __str__ = toString


    def reduceToSingleOperationPerIndex(self, rewrites):
        """
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested).  Inserts to
        the same index need to be combined, etc...   Here are the cases:

        I.i.u I.j.v                           leave alone, nonoverlapping
        I.i.u I.i.v                           combine: Iivu

        R.i-j.u R.x-y.v | i-j in x-y          delete first R
        R.i-j.u R.i-j.v                       delete first R
        R.i-j.u R.x-y.v | x-y in i-j          ERROR
        R.i-j.u R.x-y.v | boundaries overlap  ERROR

        I.i.u R.x-y.v   | i in x-y            delete I
        I.i.u R.x-y.v   | i not in x-y        leave alone, nonoverlapping
        R.x-y.v I.i.u   | i in x-y            ERROR
        R.x-y.v I.x.u                         R.x-y.uv (combine, delete I)
        R.x-y.v I.i.u   | i not in x-y        leave alone, nonoverlapping

        I.i.u = insert u before op @ index i
        R.x-y.u = replace x-y indexed tokens with u

        First we need to examine replaces.  For any replace op:

          1. wipe out any insertions before op within that range.
          2. Drop any replace op before that is contained completely within
             that range.
          3. Throw exception upon boundary overlap with any previous replace.

        Then we can deal with inserts:

          1. for any inserts to same index, combine even if not adjacent.
          2. for any prior replace with same left boundary, combine this
             insert with replace and delete this replace.
          3. throw exception if index in same range as previous replace

        Don't actually delete; make op null in list.  Easier to walk list.
        Later we can throw as we add to index -> op map.

        Note that I.2 R.2-2 will wipe out I.2 even though, technically, the
        inserted stuff would be before the replace range.  But, if you
        add tokens in front of a method body '{' and then delete the method
        body, I think the stuff before the '{' you added should disappear too.

        Return a map from token index to operation.
        """

        # WALK REPLACES
        for i, rop in enumerate(rewrites):
            if rop is None:
                continue

            if not isinstance(rop, ReplaceOp):
                continue

            # Wipe prior inserts within range
            for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    rewrites[j] = None  # delete insert as it's a no-op.

            # Drop any prior replaces contained within
            for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if (prevRop.index >= rop.index
                    and prevRop.lastIndex <= rop.lastIndex):
                    rewrites[j] = None  # delete replace as it's a no-op.
                    continue

                # throw exception unless disjoint or identical
                disjoint = (prevRop.lastIndex < rop.index
                            or prevRop.index > rop.lastIndex)
                same = (prevRop.index == rop.index
                        and prevRop.lastIndex == rop.lastIndex)
                if not disjoint and not same:
                    raise ValueError(
                        "replace op boundaries of %s overlap with previous %s"
                        % (rop, prevRop))

        # WALK INSERTS
        for i, iop in enumerate(rewrites):
            if iop is None:
                continue

            if not isinstance(iop, InsertBeforeOp):
                continue

            # combine current insert with prior if any at same index
            for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if prevIop.index == iop.index: # combine objects
                    # convert to strings...we're in process of toString'ing
                    # whole token buffer so no lazy eval issue with any
                    # templates
                    iop.text = self.catOpText(iop.text, prevIop.text)
                    rewrites[j] = None  # delete redundant prior insert

            # look for replaces where iop.index is in range; error
            for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if iop.index == rop.index:
                    rop.text = self.catOpText(iop.text, rop.text)
                    rewrites[i] = None  # delete current insert
                    continue

                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    raise ValueError(
                        "insert op %s within boundaries of previous %s"
                        % (iop, rop))

        m = {}
        for i, op in enumerate(rewrites):
            if op is None:
                continue # ignore deleted ops

            assert op.index not in m, "should only be one op per index"
            m[op.index] = op

        return m


    def catOpText(self, a, b):
        x = ""
        y = ""
        if a is not None:
            x = a
        if b is not None:
            y = b
        return x + y


    def getKindOfOps(self, rewrites, kind, before=None):
        if before is None:
            before = len(rewrites)
        elif before > len(rewrites):
            before = len(rewrites)

        for i, op in enumerate(rewrites[:before]):
            if op is None:
                # ignore deleted
                continue
            if op.__class__ == kind:
                yield i, op


    def toDebugString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            buf.write(str(self.get(i)))  # write the token's string form, not just its text
            i += 1

        return buf.getvalue()
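
# --- Editorial sketch, not part of the original runtime -------------------
# It reuses the hypothetical _StubToken/_ListTokenSource helpers from the
# CommonTokenStream sketch above to show that rewrite operations are queued
# lazily and only applied when the buffer is rendered with toString(); the
# underlying tokens are never modified.

def _rewritestream_demo():
    source = _ListTokenSource([
        _StubToken(4, 'x'),
        _StubToken(5, '='),
        _StubToken(6, '1'),
        ])
    tokens = TokenRewriteStream(source)
    tokens.fillBuffer()

    tokens.insertBefore(0, 'int ')    # queue: insert before token 0
    tokens.replace(2, '42')           # queue: replace token 2 (single index)
    tokens.insertAfter(2, ';')        # queue: insert after the last token

    assert tokens.toString() == 'int x=42;'
    assert tokens.toOriginalString() == 'x=1'   # original text is untouched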

