00001 """ANTLR3 runtime package"""
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 import codecs
00034 from StringIO import StringIO
00035
00036 from constants import DEFAULT_CHANNEL, EOF
00037 from tokens import Token, EOF_TOKEN
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
class IntStream(object):
    """
    @brief Base interface for streams of integer values.

    A simple stream of integers used when all I care about is the char
    or token type sequence (such as interpretation).
    """

    def consume(self):
        raise NotImplementedError


    def LA(self, i):
        """Get int at current input pointer + i ahead, where i=1 is the next int.

        Negative indexes are allowed. LA(-1) is the previous token (the
        token just matched). LA(-i), where -i is before the first token,
        should yield -1, the invalid char / EOF.
        """

        raise NotImplementedError


    def mark(self):
        """
        Tell the stream to start buffering if it hasn't already. Return
        current input position, index(), or some other marker so that
        when passed to rewind() you get back to the same spot.
        rewind(mark()) should not affect the input cursor. The Lexer
        tracks line/col info as well as input index, so its markers are
        not pure input indexes. Same for tree node streams.
        """

        raise NotImplementedError


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read. The index is the symbol about to be
        read, not the most recently read symbol.
        """

        raise NotImplementedError


    def rewind(self, marker=None):
        """
        Reset the stream so that the next call to index would return marker.
        The marker will usually be index() but it doesn't have to be. It's
        just a marker to indicate what state the stream was in. This is
        essentially calling release() and seek(). If there are markers
        created after this marker argument, this routine must unroll them
        like a stack. Assume the state the stream was in when this marker
        was created.

        If marker is None:
            Rewind to the input position of the last marker. Used currently
            only after a cyclic DFA and just before starting a sem/syn
            predicate, to get the input position back to the start of the
            decision. Does not "pop" the marker off the state: mark(i) and
            rewind(i) should still balance. It is like invoking
            rewind(last marker), but it should not "pop" the marker off.
            It's like seek(last marker's input position).
        """

        raise NotImplementedError


    def release(self, marker=None):
        """
        You may want to commit to a backtrack but don't want to force the
        stream to keep bookkeeping objects around for a marker that is
        no longer necessary. This will have the same behavior as
        rewind() except it releases resources without the backward seek.
        This must throw away resources for all markers back to the marker
        argument. So if you're nested 5 levels of mark() and then
        release(2), you have to release resources for depths 2..5.
        """

        raise NotImplementedError


    def seek(self, index):
        """
        Set the input cursor to the position indicated by index. This is
        normally used to seek ahead in the input stream. No buffering is
        required to do this unless you know your stream will use seek to
        move backwards, such as when backtracking.

        This is different from rewind in its multi-directional
        requirement and in that its argument is strictly an input cursor
        (index).

        For char streams, seeking forward must update the stream state such
        as line number. For seeking backwards, you will presumably be
        backtracking using the mark/rewind mechanism that restores state,
        so this method does not need to update state when seeking backwards.

        Currently, this method is only used for efficient backtracking using
        memoization, but in the future it may be used for incremental parsing.

        The index is 0..n-1. A seek to position i means that LA(1) will
        return the ith symbol. So, seeking to 0 means LA(1) will return the
        first element in the stream.
        """

        raise NotImplementedError


    def size(self):
        """
        Only makes sense for streams that buffer everything up, but it
        might be useful to display the entire stream or for testing. This
        value includes a single EOF.
        """

        raise NotImplementedError


    def getSourceName(self):
        """
        Where are you getting symbols from? Normally, implementations will
        pass the buck all the way to the lexer, which can ask its input
        stream for the file name or whatever.
        """

        raise NotImplementedError


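# A minimal sketch of the mark/rewind contract, using the concrete
# ANTLRStringStream defined below. Markers balance like a stack: rewind()
# restores the state saved by the matching mark().
#
#   stream = ANTLRStringStream(u'abc')
#   stream.consume()           # LA(1) is now ord('b')
#   marker = stream.mark()
#   stream.consume()           # LA(1) is now ord('c')
#   stream.rewind(marker)      # back to LA(1) == ord('b')

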
class CharStream(IntStream):
    """
    @brief A source of characters for an ANTLR lexer.

    This is an abstract class that must be implemented by a subclass.
    """

    # The symbol returned at the end of the stream.
    EOF = -1


    def substring(self, start, stop):
        """
        For infinite streams, you don't need this; primarily I'm providing
        a useful interface for action code. Just make sure actions don't
        use this on streams that don't support it.
        """

        raise NotImplementedError


    def LT(self, i):
        """
        Get the ith character of lookahead. This is usually the same as
        LA(i). This will be used for labels in the generated
        lexer code. I'd prefer to return a char here type-wise, but it's
        probably better to be 32-bit clean and be consistent with LA.
        """

        raise NotImplementedError


    def getLine(self):
        """ANTLR tracks the line information automatically."""

        raise NotImplementedError


    def setLine(self, line):
        """
        Because this stream can rewind, we need to be able to reset the line.
        """

        raise NotImplementedError


    def getCharPositionInLine(self):
        """
        The index of the character relative to the beginning of the line,
        0..n-1.
        """

        raise NotImplementedError


    def setCharPositionInLine(self, pos):
        raise NotImplementedError


class TokenStream(IntStream):
    """
    @brief A stream of tokens accessing tokens from a TokenSource

    This is an abstract class that must be implemented by a subclass.
    """

    def LT(self, k):
        """
        Get the Token at current input pointer + k ahead, where k=1 is the
        next Token. k<0 indicates tokens in the past; so -1 is the previous
        token and -2 is two tokens ago. LT(0) is undefined. For k>=n,
        return the EOF Token. Return None for LT(0) and any index that
        results in an absolute address that is negative.
        """

        raise NotImplementedError


    def get(self, i):
        """
        Get a token at an absolute index i; 0..n-1. This is really only
        needed for profiling, debugging and token stream rewriting.
        If you don't want to buffer up tokens, then this method makes no
        sense for you. Naturally you can't use the rewrite stream feature.
        I believe DebugTokenStream can easily be altered to not use
        this method, removing the dependency.
        """

        raise NotImplementedError


    def getTokenSource(self):
        """
        Where is this stream pulling tokens from? This is not the name, but
        the object that provides Token objects.
        """

        raise NotImplementedError


    def toString(self, start=None, stop=None):
        """
        Return the text of all tokens from start to stop, inclusive.
        If the stream does not buffer all the tokens then it can just
        return '' or None; users should not access $ruleLabel.text in
        an action in that case, of course.

        Because the user is not required to use a token with an index stored
        in it, we must provide a means for two token objects themselves to
        indicate the start/end location. Most often this will just delegate
        to the other toString(int,int). This is also parallel with
        the TreeNodeStream.toString(Object,Object).
        """

        raise NotImplementedError


class ANTLRStringStream(CharStream):
    """
    @brief CharStream that pulls data from a unicode string.

    A pretty quick CharStream that pulls all data from an array
    directly. Every method call counts in the lexer.
    """

    def __init__(self, data):
        """
        @param data This should be a unicode string holding the data you
           want to parse. If you pass in a byte string, the Lexer will
           choke on non-ascii data.
        """

        CharStream.__init__(self)

        # The data being scanned, as string and as list of char codes
        self.strdata = unicode(data)
        self.data = [ord(c) for c in self.strdata]

        # How many characters are actually in the buffer
        self.n = len(data)

        # 0..n-1 index into string of next char
        self.p = 0

        # line number 1..n within the input
        self.line = 1

        # The index of the character relative to the beginning of the
        # line 0..n-1
        self.charPositionInLine = 0

        # A list of (p, line, charPositionInLine) tuples that tracks the
        # stream state at each mark, indexed from 0..markDepth-1
        self._markers = [ ]
        self.lastMarker = None
        self.markDepth = 0

        # What is the name or source of this char stream?
        self.name = None


    def reset(self):
        """
        Reset the stream so that it's in the same state it was
        when the object was created *except* the data array is not
        touched.
        """

        self.p = 0
        self.line = 1
        self.charPositionInLine = 0
        self._markers = [ ]


    def consume(self):
        try:
            if self.data[self.p] == 10: # ord('\n')
                self.line += 1
                self.charPositionInLine = 0
            else:
                self.charPositionInLine += 1

            self.p += 1

        except IndexError:
            # happens when we reached EOF and self.p would be len(self.data)
            pass


    def LA(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LA(-1) to use offset i=0

        try:
            return self.data[self.p+i-1]
        except IndexError:
            return EOF


    def LT(self, i):
        if i == 0:
            return 0 # undefined

        if i < 0:
            i += 1 # e.g., translate LT(-1) to use offset i=0

        try:
            return self.strdata[self.p+i-1]
        except IndexError:
            return EOF


    def index(self):
        """
        Return the current input symbol index 0..n where n indicates the
        last symbol has been read. The index is the index of the char to
        be returned from LA(1).
        """

        return self.p


    def size(self):
        return self.n


    def mark(self):
        state = (self.p, self.line, self.charPositionInLine)
        try:
            self._markers[self.markDepth] = state
        except IndexError:
            self._markers.append(state)
        self.markDepth += 1

        self.lastMarker = self.markDepth

        return self.lastMarker


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        p, line, charPositionInLine = self._markers[marker-1]

        self.seek(p)
        self.line = line
        self.charPositionInLine = charPositionInLine
        self.release(marker)


    def release(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.markDepth = marker-1


    def seek(self, index):
        """
        consume() ahead until p==index; can't just set p=index as we must
        update line and charPositionInLine.
        """

        if index <= self.p:
            self.p = index # just jump; don't update stream state (line, ...)
            return

        # seek forward, consume until p hits index
        while self.p < index:
            self.consume()


    def substring(self, start, stop):
        return self.strdata[start:stop+1]


    def getLine(self):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        return self.line


    def getCharPositionInLine(self):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        return self.charPositionInLine


    def setLine(self, line):
        """Using setter/getter methods is deprecated. Use o.line instead."""
        self.line = line


    def setCharPositionInLine(self, pos):
        """
        Using setter/getter methods is deprecated. Use o.charPositionInLine
        instead.
        """
        self.charPositionInLine = pos


    def getSourceName(self):
        return self.name


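# A quick sketch of driving ANTLRStringStream by hand; normally a generated
# lexer does this. LA() returns character codes, LT() the characters
# themselves, and consuming a newline bumps the line counter.
#
#   stream = ANTLRStringStream(u'hi\nho')
#   assert stream.LA(1) == ord('h') and stream.LT(1) == u'h'
#   stream.consume(); stream.consume(); stream.consume()
#   assert (stream.line, stream.charPositionInLine) == (2, 0)

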
class ANTLRFileStream(ANTLRStringStream):
    """
    @brief CharStream that opens a file to read the data.

    This is a char buffer stream that is loaded from a file
    all at once when you construct the object.
    """

    def __init__(self, fileName, encoding=None):
        """
        @param fileName The path to the file to be opened. The file will be
           opened with mode 'rb'.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.
        """

        self.fileName = fileName

        fp = codecs.open(fileName, 'rb', encoding)
        try:
            data = fp.read()
        finally:
            fp.close()

        ANTLRStringStream.__init__(self, data)


    def getSourceName(self):
        """Deprecated, access o.fileName directly."""

        return self.fileName


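# Typical use, as a sketch (the file name is hypothetical): the whole file
# is loaded up front and decoded on the fly when an encoding is given.
#
#   stream = ANTLRFileStream('input.txt', encoding='utf-8')

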
class ANTLRInputStream(ANTLRStringStream):
    """
    @brief CharStream that reads data from a file-like object.

    This is a char buffer stream that is loaded from a file-like object
    all at once when you construct the object.

    All input is consumed from the file, but it is not closed.
    """

    def __init__(self, file, encoding=None):
        """
        @param file A file-like object holding your input. Only the read()
           method must be implemented.

        @param encoding If you set the optional encoding argument, then the
           data will be decoded on the fly.
        """

        if encoding is not None:
            # wrap input in a decoding StreamReader; codecs.lookup()[2]
            # is the StreamReader class for the given encoding
            reader = codecs.lookup(encoding)[2]
            file = reader(file)

        data = file.read()

        ANTLRStringStream.__init__(self, data)


# Aliases without the ANTLR prefix
StringStream = ANTLRStringStream
FileStream = ANTLRFileStream
InputStream = ANTLRInputStream


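# A sketch of ANTLRInputStream with any object exposing read(); StringIO
# stands in for a real file here.
#
#   stream = ANTLRInputStream(StringIO('grammar input'))

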
class CommonTokenStream(TokenStream):
    """
    @brief The most common stream of tokens

    The most common stream of tokens is one where every token is buffered up
    and tokens are prefiltered for a certain channel (the parser will only
    see these tokens and cannot change the filter channel number during the
    parse).
    """

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        """
        @param tokenSource A TokenSource instance (usually a Lexer) to pull
            the tokens from.

        @param channel Skip tokens on any channel but this one; this is how we
            skip whitespace...
        """

        TokenStream.__init__(self)

        self.tokenSource = tokenSource

        # Record every single token pulled from the source so we can
        # reproduce chunks of it later.
        self.tokens = []

        # Map<tokentype, channel> to override some Tokens' channel numbers
        self.channelOverrideMap = {}

        # Set<tokentype>; discard any tokens with this type
        self.discardSet = set()

        # Skip tokens on any channel but this one
        self.channel = channel

        # By default, track all incoming tokens
        self.discardOffChannelTokens = False

        # The index into the tokens list of the current token (next token
        # to consume). p==-1 indicates that the tokens list is empty.
        self.p = -1

        # Remember last marked position
        self.lastMarker = None


    def setTokenSource(self, tokenSource):
        """Reset this token stream by setting its token source."""

        self.tokenSource = tokenSource
        self.tokens = []
        self.p = -1
        self.channel = DEFAULT_CHANNEL


    def reset(self):
        self.p = 0
        self.lastMarker = None


    def fillBuffer(self):
        """
        Load all tokens from the token source and put in tokens.
        This is done upon first LT request because you might want to
        set some token type / channel overrides before filling buffer.
        """

        index = 0
        t = self.tokenSource.nextToken()
        while t is not None and t.type != EOF:
            discard = False

            if self.discardSet is not None and t.type in self.discardSet:
                discard = True

            elif self.discardOffChannelTokens and t.channel != self.channel:
                discard = True

            # is there a channel override for this token type?
            try:
                overrideChannel = self.channelOverrideMap[t.type]

            except KeyError:
                # no override for this type
                pass

            else:
                if overrideChannel == self.channel:
                    t.channel = overrideChannel
                else:
                    discard = True

            if not discard:
                t.index = index
                self.tokens.append(t)
                index += 1

            t = self.tokenSource.nextToken()

        # leave p pointing at first on-channel token
        self.p = 0
        self.p = self.skipOffTokenChannels(self.p)


    def consume(self):
        """
        Move the input pointer to the next incoming token. The stream
        must become active with LT(1) available. consume() simply
        moves the input pointer so that LT(1) points at the next
        input symbol. Consume at least one token.

        Walk past any token not on the channel the parser is listening to.
        """

        if self.p < len(self.tokens):
            self.p += 1

            # leave p on a valid token
            self.p = self.skipOffTokenChannels(self.p)


    def skipOffTokenChannels(self, i):
        """
        Given a starting index, return the index of the first on-channel
        token.
        """

        try:
            while self.tokens[i].channel != self.channel:
                i += 1
        except IndexError:
            # hit the end of the token list
            pass

        return i


    def skipOffTokenChannelsReverse(self, i):
        while i >= 0 and self.tokens[i].channel != self.channel:
            i -= 1

        return i


    def setTokenTypeChannel(self, ttype, channel):
        """
        A simple filter mechanism whereby you can tell this token stream
        to force all tokens of type ttype to be on channel. For example,
        when interpreting, we cannot exec actions, so we need to tell
        the stream to force all WS and NEWLINE tokens to a different,
        ignored channel.
        """

        self.channelOverrideMap[ttype] = channel


    def discardTokenType(self, ttype):
        self.discardSet.add(ttype)


    def getTokens(self, start=None, stop=None, types=None):
        """
        Given a start and stop index, return a list of all tokens in
        the token type set. Return None if no tokens were found. This
        method looks at both on and off channel tokens.
        """

        if self.p == -1:
            self.fillBuffer()

        if stop is None or stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        if start is None or start < 0:
            start = 0

        if start > stop:
            return None

        if isinstance(types, (int, long)):
            # called with a single token type: wrap it into a set
            types = set([types])

        # stop index is inclusive
        filteredTokens = [
            token for token in self.tokens[start:stop+1]
            if types is None or token.type in types
            ]

        if len(filteredTokens) == 0:
            return None

        return filteredTokens


    def LT(self, k):
        """
        Get the kth token from the current position 1..n where k=1 is the
        first symbol of lookahead.
        """

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if k < 0:
            return self.LB(-k)

        i = self.p
        n = 1

        # find k good tokens
        while n < k:
            # skip off-channel tokens
            i = self.skipOffTokenChannels(i+1)
            n += 1

        try:
            return self.tokens[i]
        except IndexError:
            return EOF_TOKEN


    def LB(self, k):
        """Look backwards k on-channel tokens."""

        if self.p == -1:
            self.fillBuffer()

        if k == 0:
            return None

        if self.p - k < 0:
            return None

        i = self.p
        n = 1

        # find k good tokens looking backwards
        while n <= k:
            # skip off-channel tokens
            i = self.skipOffTokenChannelsReverse(i-1)
            n += 1

        if i < 0:
            return None

        return self.tokens[i]


    def get(self, i):
        """
        Return absolute token i; ignore which channel the tokens are on;
        that is, count all tokens, not just on-channel tokens.
        """

        return self.tokens[i]


    def LA(self, i):
        return self.LT(i).type


    def mark(self):
        self.lastMarker = self.index()
        return self.lastMarker


    def release(self, marker=None):
        # no resources to release
        pass


    def size(self):
        return len(self.tokens)


    def index(self):
        return self.p


    def rewind(self, marker=None):
        if marker is None:
            marker = self.lastMarker

        self.seek(marker)


    def seek(self, index):
        self.p = index


    def getTokenSource(self):
        return self.tokenSource


    def getSourceName(self):
        return self.tokenSource.getSourceName()


    def toString(self, start=None, stop=None):
        if self.p == -1:
            self.fillBuffer()

        if start is None:
            start = 0
        elif not isinstance(start, int):
            start = start.index

        if stop is None:
            stop = len(self.tokens) - 1
        elif not isinstance(stop, int):
            stop = stop.index

        if stop >= len(self.tokens):
            stop = len(self.tokens) - 1

        return ''.join([t.text for t in self.tokens[start:stop+1]])


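# A sketch of typical CommonTokenStream use; TLexer is a hypothetical
# generated lexer that puts whitespace on a hidden channel, so the parser
# never sees those tokens.
#
#   lexer = TLexer(ANTLRStringStream(u'a b'))
#   tokens = CommonTokenStream(lexer)
#   first = tokens.LT(1)       # first on-channel token
#   tokens.consume()
#   assert tokens.LT(-1) is first

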
class RewriteOperation(object):
    """@brief Internal helper class."""

    def __init__(self, stream, index, text):
        self.stream = stream
        self.index = index
        self.text = text

    def execute(self, buf):
        """Execute the rewrite operation by possibly adding to the buffer.
        Return the index of the next token to operate on.
        """

        return self.index

    def toString(self):
        opName = self.__class__.__name__
        return '<%s@%d:"%s">' % (opName, self.index, self.text)

    __str__ = toString
    __repr__ = toString


class InsertBeforeOp(RewriteOperation):
    """@brief Internal helper class."""

    def execute(self, buf):
        # emit the inserted text, then the original token it goes before
        buf.write(self.text)
        buf.write(self.stream.tokens[self.index].text)
        return self.index + 1


class ReplaceOp(RewriteOperation):
    """
    @brief Internal helper class.

    I'm going to try replacing the range from x..y with (y-x)+1 ReplaceOp
    instructions.
    """

    def __init__(self, stream, first, last, text):
        RewriteOperation.__init__(self, stream, first, text)
        self.lastIndex = last


    def execute(self, buf):
        if self.text is not None:
            buf.write(self.text)

        # skip past the replaced token range
        return self.lastIndex + 1


    def toString(self):
        return '<ReplaceOp@%d..%d:"%s">' % (
            self.index, self.lastIndex, self.text)

    __str__ = toString
    __repr__ = toString


class DeleteOp(ReplaceOp):
    """
    @brief Internal helper class.

    A delete is a replace with no replacement text.
    """

    def __init__(self, stream, first, last):
        ReplaceOp.__init__(self, stream, first, last, None)


    def toString(self):
        return '<DeleteOp@%d..%d>' % (self.index, self.lastIndex)

    __str__ = toString
    __repr__ = toString


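# A sketch of how these ops render during toString(): each op writes its
# contribution to a buffer and returns the index of the next token to emit,
# so a ReplaceOp swallows its whole token range in one step ('stream' is a
# hypothetical TokenRewriteStream).
#
#   buf = StringIO()
#   op = ReplaceOp(stream, 2, 4, 'new text')
#   next_index = op.execute(buf)   # writes 'new text', returns 5

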
class TokenRewriteStream(CommonTokenStream):
    """@brief CommonTokenStream that can be modified.

    Useful for dumping out the input stream after doing some
    augmentation or other manipulations.

    You can insert stuff, replace, and delete chunks. Note that the
    operations are done lazily--only if you convert the buffer to a
    String. This is very efficient because you are not moving data around
    all the time. As the buffer of tokens is converted to strings, the
    toString() method(s) check to see if there is an operation at the
    current index. If so, the operation is done and then normal String
    rendering continues on the buffer. This is like having multiple Turing
    machine instruction streams (programs) operating on a single input tape. :)

    Since the operations are done lazily at toString-time, operations do not
    screw up the token index values. That is, an insert operation at token
    index i does not change the index values for tokens i+1..n-1.

    Because operations never actually alter the buffer, you may always get
    the original token stream back without undoing anything. Since
    the instructions are queued up, you can easily simulate transactions and
    roll back any changes if there is an error, just by removing
    instructions. For example,

      CharStream input = new ANTLRFileStream("input");
      TLexer lex = new TLexer(input);
      TokenRewriteStream tokens = new TokenRewriteStream(lex);
      T parser = new T(tokens);
      parser.startRule();

    Then in the rules, you can execute

      Token t,u;
      ...
      input.insertAfter(t, "text to put after t");
      input.insertAfter(u, "text after u");
      System.out.println(tokens.toString());

    Actually, you have to cast the 'input' to a TokenRewriteStream. :(

    You can also have multiple "instruction streams" and get multiple
    rewrites from a single pass over the input. Just name the instruction
    streams and use that name again when printing the buffer. This could be
    useful for generating a C file and also its header file--all from the
    same buffer:

      tokens.insertAfter("pass1", t, "text to put after t");
      tokens.insertAfter("pass2", u, "text after u");
      System.out.println(tokens.toString("pass1"));
      System.out.println(tokens.toString("pass2"));

    If you don't use named rewrite streams, a "default" stream is used as
    the first example shows.
    """

    DEFAULT_PROGRAM_NAME = "default"
    MIN_TOKEN_INDEX = 0

    def __init__(self, tokenSource=None, channel=DEFAULT_CHANNEL):
        CommonTokenStream.__init__(self, tokenSource, channel)

        # You may have multiple, named streams of rewrite operations;
        # I'm calling these things "programs."
        # Maps program name (string) -> list of rewrite operations
        self.programs = {}
        self.programs[self.DEFAULT_PROGRAM_NAME] = []

        # Maps program name (string) -> last rewrite token index
        self.lastRewriteTokenIndexes = {}


    def rollback(self, *args):
        """
        Rollback the instruction stream for a program so that
        the indicated instruction (via instructionIndex) is no
        longer in the stream. UNTESTED!
        """

        if len(args) == 2:
            programName = args[0]
            instructionIndex = args[1]
        elif len(args) == 1:
            programName = self.DEFAULT_PROGRAM_NAME
            instructionIndex = args[0]
        else:
            raise TypeError("Invalid arguments")

        p = self.programs.get(programName, None)
        if p is not None:
            self.programs[programName] = (
                p[self.MIN_TOKEN_INDEX:instructionIndex])


    def deleteProgram(self, programName=DEFAULT_PROGRAM_NAME):
        """Reset the program so that no instructions exist."""

        self.rollback(programName, self.MIN_TOKEN_INDEX)


    def insertAfter(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        # to insert after, just insert before next index (even if past end)
        self.insertBefore(programName, index+1, text)


    def insertBefore(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            index = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = args[0]
            index = args[1]
            text = args[2]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(index, Token):
            # index is a Token, grab the stream index from it
            index = index.index

        op = InsertBeforeOp(self, index, text)
        rewrites = self.getProgram(programName)
        rewrites.append(op)


    def replace(self, *args):
        if len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[0]
            text = args[1]

        elif len(args) == 3:
            programName = self.DEFAULT_PROGRAM_NAME
            first = args[0]
            last = args[1]
            text = args[2]

        elif len(args) == 4:
            programName = args[0]
            first = args[1]
            last = args[2]
            text = args[3]

        else:
            raise TypeError("Invalid arguments")

        if isinstance(first, Token):
            # first is a Token, grab the stream index from it
            first = first.index

        if isinstance(last, Token):
            # last is a Token, grab the stream index from it
            last = last.index

        if first > last or first < 0 or last < 0 or last >= len(self.tokens):
            raise ValueError(
                "replace: range invalid: %d..%d (size=%d)"
                % (first, last, len(self.tokens)))

        op = ReplaceOp(self, first, last, text)
        rewrites = self.getProgram(programName)
        rewrites.append(op)


    def delete(self, *args):
        self.replace(*(list(args) + [None]))


    def getLastRewriteTokenIndex(self, programName=DEFAULT_PROGRAM_NAME):
        return self.lastRewriteTokenIndexes.get(programName, -1)


    def setLastRewriteTokenIndex(self, programName, i):
        self.lastRewriteTokenIndexes[programName] = i


    def getProgram(self, name):
        p = self.programs.get(name, None)
        if p is None:
            p = self.initializeProgram(name)

        return p


    def initializeProgram(self, name):
        p = []
        self.programs[name] = p
        return p


    def toOriginalString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            buf.write(self.get(i).text)
            i += 1

        return buf.getvalue()


    def toString(self, *args):
        if len(args) == 0:
            programName = self.DEFAULT_PROGRAM_NAME
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 1:
            programName = args[0]
            start = self.MIN_TOKEN_INDEX
            end = self.size() - 1

        elif len(args) == 2:
            programName = self.DEFAULT_PROGRAM_NAME
            start = args[0]
            end = args[1]

        elif len(args) == 3:
            programName = args[0]
            start = args[1]
            end = args[2]

        else:
            raise TypeError("Invalid arguments")

        if start is None:
            start = self.MIN_TOKEN_INDEX
        elif not isinstance(start, int):
            start = start.index

        if end is None:
            end = len(self.tokens) - 1
        elif not isinstance(end, int):
            end = end.index

        # ensure start/end are in range
        if end >= len(self.tokens):
            end = len(self.tokens) - 1

        if start < 0:
            start = 0

        rewrites = self.programs.get(programName)
        if rewrites is None or len(rewrites) == 0:
            # no instructions to execute
            return self.toOriginalString(start, end)

        buf = StringIO()

        # First, optimize instruction stream
        indexToOp = self.reduceToSingleOperationPerIndex(rewrites)

        # Walk buffer, executing instructions and emitting tokens
        i = start
        while i <= end and i < len(self.tokens):
            op = indexToOp.get(i)
            # remove the op so that any left over are past the last token
            try:
                del indexToOp[i]
            except KeyError:
                pass

            t = self.tokens[i]
            if op is None:
                # no operation at that index, just dump token
                buf.write(t.text)
                i += 1 # move to next token

            else:
                i = op.execute(buf) # execute operation and skip

        # Include stuff after end if it's the last index in the buffer; so
        # if they did insertAfter(lastValidIndex, "foo"), include foo when
        # end == lastValidIndex. Any remaining ops must be inserts.
        if end == len(self.tokens) - 1:
            for i in sorted(indexToOp.keys()):
                op = indexToOp[i]
                if op.index >= len(self.tokens)-1:
                    buf.write(op.text)

        return buf.getvalue()

    __str__ = toString


    def reduceToSingleOperationPerIndex(self, rewrites):
        """
        We need to combine operations and report invalid operations (like
        overlapping replaces that are not completely nested). Inserts to
        the same index need to be combined, etc... Here are the cases:

        I.i.u I.j.v                     leave alone, nonoverlapping
        I.i.u I.i.v                     combine: I.i.vu

        R.i-j.u R.x-y.v | i-j in x-y    delete first R
        R.i-j.u R.i-j.v                 delete first R
        R.i-j.u R.x-y.v | x-y in i-j    ERROR
        R.i-j.u R.x-y.v | boundaries overlap  ERROR

        I.i.u R.x-y.v   | i in x-y      delete I
        I.i.u R.x-y.v   | i not in x-y  leave alone, nonoverlapping
        R.x-y.v I.i.u   | i in x-y      ERROR
        R.x-y.v I.x.u                   R.x-y.uv (combine, delete I)
        R.x-y.v I.i.u   | i not in x-y  leave alone, nonoverlapping

        I.i.u = insert u before op @ index i
        R.x-y.u = replace x-y indexed tokens with u

        First we need to examine replaces. For any replace op:

          1. wipe out any insertions before op within that range.
          2. Drop any replace op before that is contained completely within
             that range.
          3. Throw exception upon boundary overlap with any previous replace.

        Then we can deal with inserts:

          1. for any inserts to same index, combine even if not adjacent.
          2. for any prior replace with same left boundary, combine this
             insert with replace and delete this replace.
          3. throw exception if index in same range as previous replace

        Don't actually delete; make the op None in the list. Easier to walk
        the list. Later we can throw as we add to the index -> op map.

        Note that I.2 R.2-2 will wipe out I.2 even though, technically, the
        inserted stuff would be before the replace range. But, if you
        add tokens in front of a method body '{' and then delete the method
        body, I think the stuff before the '{' you added should disappear too.

        Return a map from token index to operation.
        """

        # WALK REPLACES
        for i, rop in enumerate(rewrites):
            if rop is None:
                continue

            if not isinstance(rop, ReplaceOp):
                continue

            # Wipe prior inserts within the replaced range
            for j, iop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    rewrites[j] = None # delete insert as it's a no-op

            # Drop any prior replace contained completely within this one
            for j, prevRop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if (prevRop.index >= rop.index
                    and prevRop.lastIndex <= rop.lastIndex):
                    rewrites[j] = None # delete replace as it's a no-op
                    continue

                # throw exception unless disjoint or identical
                disjoint = (prevRop.lastIndex < rop.index
                            or prevRop.index > rop.lastIndex)
                same = (prevRop.index == rop.index
                        and prevRop.lastIndex == rop.lastIndex)
                if not disjoint and not same:
                    raise ValueError(
                        "replace op boundaries of %s overlap with previous %s"
                        % (rop, prevRop))

        # WALK INSERTS
        for i, iop in enumerate(rewrites):
            if iop is None:
                continue

            if not isinstance(iop, InsertBeforeOp):
                continue

            # combine current insert with any prior insert at same index
            for j, prevIop in self.getKindOfOps(rewrites, InsertBeforeOp, i):
                if prevIop.index == iop.index:
                    # current text comes first, then the earlier insert's
                    iop.text = self.catOpText(iop.text, prevIop.text)
                    rewrites[j] = None # delete redundant prior insert

            # look for replaces where iop.index is in range; error
            for j, rop in self.getKindOfOps(rewrites, ReplaceOp, i):
                if iop.index == rop.index:
                    # same left boundary: merge insert into the replace
                    rop.text = self.catOpText(iop.text, rop.text)
                    rewrites[i] = None # delete current insert
                    continue

                if iop.index >= rop.index and iop.index <= rop.lastIndex:
                    raise ValueError(
                        "insert op %s within boundaries of previous %s"
                        % (iop, rop))

        m = {}
        for i, op in enumerate(rewrites):
            if op is None:
                # ignore deleted ops
                continue

            assert op.index not in m, "should only be one op per index"
            m[op.index] = op

        return m


    def catOpText(self, a, b):
        x = ""
        y = ""
        if a is not None:
            x = a
        if b is not None:
            y = b
        return x + y


    def getKindOfOps(self, rewrites, kind, before=None):
        """Yield (index, op) for all ops of the given kind before an index."""

        if before is None:
            before = len(rewrites)
        elif before > len(rewrites):
            before = len(rewrites)

        for i, op in enumerate(rewrites[:before]):
            if op is None:
                # ignore deleted ops
                continue
            if op.__class__ == kind:
                yield i, op


    def toDebugString(self, start=None, end=None):
        if start is None:
            start = self.MIN_TOKEN_INDEX
        if end is None:
            end = self.size() - 1

        buf = StringIO()
        i = start
        while i >= self.MIN_TOKEN_INDEX and i <= end and i < len(self.tokens):
            buf.write(str(self.get(i)))
            i += 1

        return buf.getvalue()
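

# A Python sketch of the Java example from the TokenRewriteStream docstring;
# TLexer and TParser are hypothetical generated classes, t and u are tokens
# captured in rule actions.
#
#   lexer = TLexer(ANTLRFileStream('input'))
#   tokens = TokenRewriteStream(lexer)
#   parser = TParser(tokens)
#   parser.startRule()
#
#   tokens.insertAfter(t, "text to put after t")
#   tokens.insertAfter(u, "text after u")
#   print tokens.toString()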