00001 """ANTLR3 runtime package"""
00033 import sys
00034 import inspect
00035
00036 from rve_interface_gen.antlr3 import runtime_version, runtime_version_str
00037 from constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \
00038 EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
00039 from exceptions import RecognitionException, MismatchedTokenException, \
00040 MismatchedRangeException, MismatchedTreeNodeException, \
00041 NoViableAltException, EarlyExitException, MismatchedSetException, \
00042 MismatchedNotSetException, FailedPredicateException, \
00043 BacktrackingFailed, UnwantedTokenException, MissingTokenException
00044 from tokens import CommonToken, EOF_TOKEN, SKIP_TOKEN
00045 from compat import set, frozenset, reversed
00046
00047
00048 class RecognizerSharedState(object):
    """
    The set of fields needed by an abstract recognizer to recognize input
    and recover from errors etc... As a separate state object, it can be
    shared among multiple grammars; e.g., when one grammar imports another.

    These fields are publicly visible but the actual state pointer per
    parser is protected.
    """
00057
    def __init__(self):
        # Tracks the set of token types that can follow any rule invocation.
        self.following = []

        # True when the recognizer has seen an error and has not yet
        # successfully matched a token; prevents more than one error
        # message per error.
        self.errorRecovery = False

        # The index into the input stream where the last error occurred.
        # Used to prevent infinite loops where an error is found but no
        # token is consumed during recovery and another error is found.
        self.lastErrorIndex = -1

        # If 0, no backtracking is going on; safe to execute actions.
        # If >0, it is the level of backtracking.
        self.backtracking = 0

        # Maps rule index -> (rule start index -> stop token index); only
        # used when rule memoization is turned on.
        self.ruleMemo = None

        # The number of syntax errors reported so far.
        self.syntaxErrors = 0

        # Lexer-specific fields (kept in the shared state object so that
        # generated lexer code can use the same state instance):

        # The token object the lexer is currently building; nextToken()
        # returns this object after the lexer rule(s) have matched.
        self.token = None

        # The character index of the first character of the current token.
        self.tokenStartCharIndex = -1

        # The line on which the first character of the current token sits.
        self.tokenStartLine = None

        # The character position within the line of the first character of
        # the current token.
        self.tokenStartCharPositionInLine = None

        # The channel number for the current token.
        self.channel = None

        # The token type for the current token.
        self.type = None

        # Text override for the current token; set via setText().
        self.text = None
00124
00125
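# A small usage sketch (hypothetical generated classes, not part of this
# module): because the shared state lives outside the recognizer, a composite
# grammar can hand the same instance to an imported grammar so that error
# recovery, backtracking level and rule memoization are shared:
#
#   state = RecognizerSharedState()
#   parser = MyParser(token_stream, state=state)             # hypothetical
#   delegate = MyImportedParser(token_stream, state=state)   # hypothetical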
00126 class BaseRecognizer(object):
00127 """
00128 @brief Common recognizer functionality.
00129
00130 A generic recognizer that can handle recognizers generated from
00131 lexer, parser, and tree grammars. This is all the parsing
00132 support code essentially; most of it is error recovery stuff and
00133 backtracking.
00134 """
00135
    MEMO_RULE_FAILED = -2
    MEMO_RULE_UNKNOWN = -1

    # Copy of DEFAULT_CHANNEL for convenience in actions.
    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL

    # Copy of HIDDEN_CHANNEL for convenience in actions.
    HIDDEN = HIDDEN_CHANNEL

    # Overridden by the generated recognizer with its list of token names.
    tokenNames = None

    # Version of ANTLR that generated this recognizer; generated subclasses
    # override these.  The default corresponds to pre-3.1 recognizers that
    # do not set the attribute.
    antlr_version = (3, 0, 1, 0)
    antlr_version_str = "3.0.1"
00152
    def __init__(self, state=None):
        # Input stream of the recognizer; set by setInput() or by the
        # Lexer/Parser constructors.
        self.input = None

        # The state of a lexer, parser, or tree parser is collected into a
        # separate state object so that it can be shared, e.g. when one
        # grammar imports another.
        if state is None:
            state = RecognizerSharedState()
        self._state = state

        if self.antlr_version > runtime_version:
            raise RuntimeError(
                "ANTLR version mismatch: "
                "The recognizer has been generated by V%s, but this runtime "
                "is V%s. Please use the V%s runtime or higher."
                % (self.antlr_version_str,
                   runtime_version_str,
                   self.antlr_version_str))
        elif (self.antlr_version < (3, 1, 0, 0) and
              self.antlr_version != runtime_version):
            # Before V3.1 the runtime version had to match the generator
            # version exactly.
            raise RuntimeError(
                "ANTLR version mismatch: "
                "The recognizer has been generated by V%s, but this runtime "
                "is V%s. Please use the V%s runtime."
                % (self.antlr_version_str,
                   runtime_version_str,
                   self.antlr_version_str))
00185
00186
00187 def setInput(self, input):
00188 self.input = input
00189
00190
    def reset(self):
        """
        Reset the parser's state; subclasses must rewind the input stream.
        """

        # wipe everything related to error recovery
        if self._state is None:
            # no shared state to reset
            return

        self._state.following = []
        self._state.errorRecovery = False
        self._state.lastErrorIndex = -1
        self._state.syntaxErrors = 0

        # wipe everything related to backtracking and memoization
        self._state.backtracking = 0
        if self._state.ruleMemo is not None:
            # wipe the memoization cache
            self._state.ruleMemo = {}
00209
00210
00211 def match(self, input, ttype, follow):
00212 """
00213 Match current input symbol against ttype. Attempt
00214 single token insertion or deletion error recovery. If
00215 that fails, throw MismatchedTokenException.
00216
00217 To turn off single token insertion or deletion error
00218 recovery, override recoverFromMismatchedToken() and have it
00219 throw an exception. See TreeParser.recoverFromMismatchedToken().
00220 This way any error in a rule will cause an exception and
00221 immediate exit from rule. Rule would recover by resynchronizing
00222 to the set of symbols that can follow rule ref.
00223 """
00224
00225 matchedSymbol = self.getCurrentInputSymbol(input)
00226 if self.input.LA(1) == ttype:
00227 self.input.consume()
00228 self._state.errorRecovery = False
00229 return matchedSymbol
00230
00231 if self._state.backtracking > 0:
00232
00233 raise BacktrackingFailed
00234
00235 matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
00236 return matchedSymbol
00237
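    # Generated rule methods drive match() roughly like the sketch below;
    # the token type constants and FOLLOW set names are illustrative, not
    # the exact identifiers ANTLR emits:
    #
    #   def assignment(self):
    #       name = self.match(self.input, ID, FOLLOW_ID_in_assignment)
    #       self.match(self.input, EQ, FOLLOW_EQ_in_assignment)
    #       ...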
00238
    def matchAny(self, input):
        """Match the wildcard: consume the current input symbol, whatever it is."""

        self._state.errorRecovery = False
        self.input.consume()
00244
00245
00246 def mismatchIsUnwantedToken(self, input, ttype):
00247 return input.LA(2) == ttype
00248
00249
00250 def mismatchIsMissingToken(self, input, follow):
00251 if follow is None:
00252
00253
00254 return False
00255
00256
00257 if EOR_TOKEN_TYPE in follow:
00258 viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
00259 follow = follow | viableTokensFollowingThisRule
00260
00261 if len(self._state.following) > 0:
00262
00263 follow = follow - set([EOR_TOKEN_TYPE])
00264
00265
00266
00267
00268 if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
00269 return True
00270
00271 return False
00272
00273
00274 def reportError(self, e):
00275 """Report a recognition problem.
00276
        This method sets errorRecovery to indicate the parser is recovering,
        not parsing. Once in recovery mode, no errors are generated.
00279 To get out of recovery mode, the parser must successfully match
00280 a token (after a resync). So it will go:
00281
00282 1. error occurs
00283 2. enter recovery mode, report error
00284 3. consume until token found in resynch set
00285 4. try to resume parsing
00286 5. next match() will reset errorRecovery mode
00287
00288 If you override, make sure to update syntaxErrors if you care about
00289 that.
00290
00291 """
        # If we've already reported an error and have not matched a token
        # yet successfully, don't report any more errors.
        if self._state.errorRecovery:
            return

        self._state.syntaxErrors += 1
        self._state.errorRecovery = True
00300
00301 self.displayRecognitionError(self.tokenNames, e)
00302
00303
00304 def displayRecognitionError(self, tokenNames, e):
00305 hdr = self.getErrorHeader(e)
00306 msg = self.getErrorMessage(e, tokenNames)
00307 self.emitErrorMessage(hdr+" "+msg)
00308
00309
00310 def getErrorMessage(self, e, tokenNames):
00311 """
00312 What error message should be generated for the various
00313 exception types?
00314
00315 Not very object-oriented code, but I like having all error message
00316 generation within one method rather than spread among all of the
00317 exception classes. This also makes it much easier for the exception
00318 handling because the exception classes do not have to have pointers back
        to this object to access utility routines and so on. Also, changing
        the message for an exception type would be difficult because you
        would have to subclass the exception class, and then somehow get
        ANTLR to make those kinds of exception objects instead of the
        default ones.
00323 This looks weird, but trust me--it makes the most sense in terms
00324 of flexibility.
00325
00326 For grammar debugging, you will want to override this to add
00327 more information such as the stack frame with
00328 getRuleInvocationStack(e, this.getClass().getName()) and,
00329 for no viable alts, the decision description and state etc...
00330
00331 Override this to change the message generated for one or more
00332 exception types.
00333 """
00334
00335 if isinstance(e, UnwantedTokenException):
00336 tokenName = "<unknown>"
00337 if e.expecting == EOF:
00338 tokenName = "EOF"
00339
00340 else:
00341 tokenName = self.tokenNames[e.expecting]
00342
00343 msg = "extraneous input %s expecting %s" % (
00344 self.getTokenErrorDisplay(e.getUnexpectedToken()),
00345 tokenName
00346 )
00347
00348 elif isinstance(e, MissingTokenException):
00349 tokenName = "<unknown>"
00350 if e.expecting == EOF:
00351 tokenName = "EOF"
00352
00353 else:
00354 tokenName = self.tokenNames[e.expecting]
00355
00356 msg = "missing %s at %s" % (
00357 tokenName, self.getTokenErrorDisplay(e.token)
00358 )
00359
00360 elif isinstance(e, MismatchedTokenException):
00361 tokenName = "<unknown>"
00362 if e.expecting == EOF:
00363 tokenName = "EOF"
00364 else:
00365 tokenName = self.tokenNames[e.expecting]
00366
00367 msg = "mismatched input " \
00368 + self.getTokenErrorDisplay(e.token) \
00369 + " expecting " \
00370 + tokenName
00371
00372 elif isinstance(e, MismatchedTreeNodeException):
00373 tokenName = "<unknown>"
00374 if e.expecting == EOF:
00375 tokenName = "EOF"
00376 else:
00377 tokenName = self.tokenNames[e.expecting]
00378
00379 msg = "mismatched tree node: %s expecting %s" \
00380 % (e.node, tokenName)
00381
00382 elif isinstance(e, NoViableAltException):
00383 msg = "no viable alternative at input " \
00384 + self.getTokenErrorDisplay(e.token)
00385
00386 elif isinstance(e, EarlyExitException):
00387 msg = "required (...)+ loop did not match anything at input " \
00388 + self.getTokenErrorDisplay(e.token)
00389
00390 elif isinstance(e, MismatchedSetException):
00391 msg = "mismatched input " \
00392 + self.getTokenErrorDisplay(e.token) \
00393 + " expecting set " \
00394 + repr(e.expecting)
00395
00396 elif isinstance(e, MismatchedNotSetException):
00397 msg = "mismatched input " \
00398 + self.getTokenErrorDisplay(e.token) \
00399 + " expecting set " \
00400 + repr(e.expecting)
00401
00402 elif isinstance(e, FailedPredicateException):
00403 msg = "rule " \
00404 + e.ruleName \
00405 + " failed predicate: {" \
00406 + e.predicateText \
00407 + "}?"
00408
00409 else:
00410 msg = str(e)
00411
00412 return msg
00413
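    # For grammar debugging, the docstring above suggests enriching the
    # message with the rule invocation stack.  A sketch of such an override
    # in a subclass (it only uses methods defined in this class):
    #
    #   def getErrorMessage(self, e, tokenNames):
    #       msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
    #       return "%s (rule stack: %s)" % (msg, self.getRuleInvocationStack())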
00414
    def getNumberOfSyntaxErrors(self):
        """
        Get the number of recognition errors (lexer, parser, tree parser).
        Each recognizer tracks its own count, so parser and lexer have
        separate counts. Spurious errors found between an error and the
        next valid token match are not counted.

        See also reportError().
        """
        return self._state.syntaxErrors
00425
00426
00427 def getErrorHeader(self, e):
00428 """
00429 What is the error header, normally line/character position information?
00430 """
00431
00432 return "line %d:%d" % (e.line, e.charPositionInLine)
00433
00434
    def getTokenErrorDisplay(self, t):
        """
        How should a token be displayed in an error message? The default
        is to display just the text, but during development you might
        want to have a lot of information spit out. Override in that case
        to use t.toString() (which, for CommonToken, dumps everything about
        the token). This is better than forcing you to override a method in
        your token objects because you don't have to go modify your lexer
        so that it creates a new token type.
        """

        s = t.text
        if s is None:
            if t.type == EOF:
                s = "<EOF>"
            else:
                # t.type is an integer, so convert it before concatenating
                s = "<" + str(t.type) + ">"

        return repr(s)
00454
00455
00456 def emitErrorMessage(self, msg):
00457 """Override this method to change where error messages go"""
00458 sys.stderr.write(msg + '\n')
00459
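    # emitErrorMessage() is the single funnel for error output, so a subclass
    # can redirect it.  A sketch that collects messages in memory instead of
    # writing to stderr (class and attribute names are illustrative):
    #
    #   class CollectingParser(Parser):
    #       def __init__(self, tokens, state=None):
    #           Parser.__init__(self, tokens, state)
    #           self.error_messages = []
    #
    #       def emitErrorMessage(self, msg):
    #           self.error_messages.append(msg)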
00460
00461 def recover(self, input, re):
00462 """
00463 Recover from an error found on the input stream. This is
00464 for NoViableAlt and mismatched symbol exceptions. If you enable
00465 single token insertion and deletion, this will usually not
00466 handle mismatched symbol exceptions but there could be a mismatched
00467 token that the match() routine could not recover from.
00468 """
00469
00470
00471
00472 if self._state.lastErrorIndex == input.index():
00473
00474
00475
00476
00477 input.consume()
00478
00479 self._state.lastErrorIndex = input.index()
00480 followSet = self.computeErrorRecoverySet()
00481
00482 self.beginResync()
00483 self.consumeUntil(input, followSet)
00484 self.endResync()
00485
00486
    def beginResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def endResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass
00503
00504
00505 def computeErrorRecoverySet(self):
00506 """
00507 Compute the error recovery set for the current rule. During
00508 rule invocation, the parser pushes the set of tokens that can
00509 follow that rule reference on the stack; this amounts to
00510 computing FIRST of what follows the rule reference in the
00511 enclosing rule. This local follow set only includes tokens
00512 from within the rule; i.e., the FIRST computation done by
00513 ANTLR stops at the end of a rule.
00514
00515 EXAMPLE
00516
00517 When you find a "no viable alt exception", the input is not
00518 consistent with any of the alternatives for rule r. The best
00519 thing to do is to consume tokens until you see something that
00520 can legally follow a call to r *or* any rule that called r.
00521 You don't want the exact set of viable next tokens because the
00522 input might just be missing a token--you might consume the
00523 rest of the input looking for one of the missing tokens.
00524
00525 Consider grammar:
00526
00527 a : '[' b ']'
00528 | '(' b ')'
00529 ;
00530 b : c '^' INT ;
00531 c : ID
00532 | INT
00533 ;
00534
00535 At each rule invocation, the set of tokens that could follow
00536 that rule is pushed on a stack. Here are the various "local"
00537 follow sets:
00538
00539 FOLLOW(b1_in_a) = FIRST(']') = ']'
00540 FOLLOW(b2_in_a) = FIRST(')') = ')'
00541 FOLLOW(c_in_b) = FIRST('^') = '^'
00542
00543 Upon erroneous input "[]", the call chain is
00544
00545 a -> b -> c
00546
00547 and, hence, the follow context stack is:
00548
        depth     local follow set     after call to rule
          0            <EOF>           a (from main())
          1            ']'             b
          2            '^'             c
00553
00554 Notice that ')' is not included, because b would have to have
00555 been called from a different context in rule a for ')' to be
00556 included.
00557
00558 For error recovery, we cannot consider FOLLOW(c)
00559 (context-sensitive or otherwise). We need the combined set of
00560 all context-sensitive FOLLOW sets--the set of all tokens that
00561 could follow any reference in the call chain. We need to
00562 resync to one of those tokens. Note that FOLLOW(c)='^' and if
00563 we resync'd to that token, we'd consume until EOF. We need to
00564 sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
00565 In this case, for input "[]", LA(1) is in this set so we would
00566 not consume anything and after printing an error rule c would
00567 return normally. It would not find the required '^' though.
00568 At this point, it gets a mismatched token error and throws an
00569 exception (since LA(1) is not in the viable following token
00570 set). The rule exception handler tries to recover, but finds
00571 the same recovery set and doesn't consume anything. Rule b
00572 exits normally returning to rule a. Now it finds the ']' (and
00573 with the successful match exits errorRecovery mode).
00574
        So, you can see that the parser walks up the call chain looking
        for the token that was a member of the recovery set.
00577
00578 Errors are not generated in errorRecovery mode.
00579
00580 ANTLR's error recovery mechanism is based upon original ideas:
00581
00582 "Algorithms + Data Structures = Programs" by Niklaus Wirth
00583
00584 and
00585
00586 "A note on error recovery in recursive descent parsers":
00587 http://portal.acm.org/citation.cfm?id=947902.947905
00588
00589 Later, Josef Grosch had some good ideas:
00590
00591 "Efficient and Comfortable Error Recovery in Recursive Descent
00592 Parsers":
00593 ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
00594
00595 Like Grosch I implemented local FOLLOW sets that are combined
00596 at run-time upon error to avoid overhead during parsing.
00597 """
00598
00599 return self.combineFollows(False)
00600
00601
00602 def computeContextSensitiveRuleFOLLOW(self):
00603 """
00604 Compute the context-sensitive FOLLOW set for current rule.
00605 This is set of token types that can follow a specific rule
00606 reference given a specific call chain. You get the set of
00607 viable tokens that can possibly come next (lookahead depth 1)
00608 given the current call chain. Contrast this with the
00609 definition of plain FOLLOW for rule r:
00610
00611 FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
00612
00613 where x in T* and alpha, beta in V*; T is set of terminals and
00614 V is the set of terminals and nonterminals. In other words,
00615 FOLLOW(r) is the set of all tokens that can possibly follow
00616 references to r in *any* sentential form (context). At
00617 runtime, however, we know precisely which context applies as
00618 we have the call chain. We may compute the exact (rather
00619 than covering superset) set of following tokens.
00620
00621 For example, consider grammar:
00622
00623 stat : ID '=' expr ';' // FOLLOW(stat)=={EOF}
00624 | "return" expr '.'
00625 ;
00626 expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'}
00627 atom : INT // FOLLOW(atom)=={'+',')',';','.'}
00628 | '(' expr ')'
00629 ;
00630
00631 The FOLLOW sets are all inclusive whereas context-sensitive
00632 FOLLOW sets are precisely what could follow a rule reference.
        For input "i=(3);", here is the derivation:
00634
00635 stat => ID '=' expr ';'
00636 => ID '=' atom ('+' atom)* ';'
00637 => ID '=' '(' expr ')' ('+' atom)* ';'
00638 => ID '=' '(' atom ')' ('+' atom)* ';'
00639 => ID '=' '(' INT ')' ('+' atom)* ';'
00640 => ID '=' '(' INT ')' ';'
00641
00642 At the "3" token, you'd have a call chain of
00643
00644 stat -> expr -> atom -> expr -> atom
00645
00646 What can follow that specific nested ref to atom? Exactly ')'
00647 as you can see by looking at the derivation of this specific
00648 input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
00649
00650 You want the exact viable token set when recovering from a
00651 token mismatch. Upon token mismatch, if LA(1) is member of
00652 the viable next token set, then you know there is most likely
00653 a missing token in the input stream. "Insert" one by just not
00654 throwing an exception.
00655 """
00656
00657 return self.combineFollows(True)
00658
00659
00660 def combineFollows(self, exact):
00661 followSet = set()
00662 for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
00663 followSet |= localFollowSet
00664 if exact:
00665
00666 if EOR_TOKEN_TYPE in localFollowSet:
00667
00668
00669 if idx > 0:
00670 followSet.remove(EOR_TOKEN_TYPE)
00671
00672 else:
00673
00674 break
00675
00676 return followSet
00677
00678
00679 def recoverFromMismatchedToken(self, input, ttype, follow):
00680 """Attempt to recover from a single missing or extra token.
00681
00682 EXTRA TOKEN
00683
        LA(1) is not what we are looking for. If LA(2) has the right token,
        however, then assume LA(1) is some extra spurious token. Delete it
        and use LA(2) as if we were doing a normal match(), which advances
        the input.
00688
00689 MISSING TOKEN
00690
        If the current token is consistent with what could come after
        ttype, then it is ok to 'insert' the missing token, else throw an
        exception. For example, input 'i=(3;' is clearly missing the
        ')'. When the parser returns from the nested call to expr, it
00695 will have call chain:
00696
00697 stat -> expr -> atom
00698
00699 and it will be trying to match the ')' at this point in the
00700 derivation:
00701
00702 => ID '=' '(' INT ')' ('+' atom)* ';'
00703 ^
00704 match() will see that ';' doesn't match ')' and report a
00705 mismatched token error. To recover, it sees that LA(1)==';'
00706 is in the set of tokens that can follow the ')' token
00707 reference in rule atom. It can assume that you forgot the ')'.
00708 """
00709
        e = None

        # If the next token is what we are looking for, then LA(1) is just
        # an unwanted extra token: "delete" it.
        if self.mismatchIsUnwantedToken(input, ttype):
            e = UnwantedTokenException(ttype, input)

            self.beginResync()
            input.consume()  # simply delete the extra token
            self.endResync()

            # report after consuming so the exception carries the token
            self.reportError(e)

            # we want to return the token we're actually matching
            matchedSymbol = self.getCurrentInputSymbol(input)

            # move past the ttype token as if all were ok
            input.consume()
            return matchedSymbol

        # Can't recover with single token deletion; try insertion instead.
        if self.mismatchIsMissingToken(input, follow):
            inserted = self.getMissingSymbol(input, e, ttype, follow)
            e = MissingTokenException(ttype, input, inserted)

            # report after inserting so the exception carries the token
            self.reportError(e)
            return inserted

        # Even that didn't work; throw the exception.
        e = MismatchedTokenException(ttype, input)
        raise e
00742
00743
00744 def recoverFromMismatchedSet(self, input, e, follow):
00745 """Not currently used"""
00746
00747 if self.mismatchIsMissingToken(input, follow):
00748 self.reportError(e)
00749
00750 return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)
00751
00752
00753 raise e
00754
00755
00756 def getCurrentInputSymbol(self, input):
00757 """
00758 Match needs to return the current input symbol, which gets put
00759 into the label for the associated token ref; e.g., x=ID. Token
00760 and tree parsers need to return different objects. Rather than test
00761 for input stream type or change the IntStream interface, I use
00762 a simple method to ask the recognizer to tell me what the current
00763 input symbol is.
00764
00765 This is ignored for lexers.
00766 """
00767
00768 return None
00769
00770
00771 def getMissingSymbol(self, input, e, expectedTokenType, follow):
00772 """Conjure up a missing token during error recovery.
00773
00774 The recognizer attempts to recover from single missing
00775 symbols. But, actions might refer to that missing symbol.
00776 For example, x=ID {f($x);}. The action clearly assumes
00777 that there has been an identifier matched previously and that
00778 $x points at that token. If that token is missing, but
00779 the next token in the stream is what we want we assume that
00780 this token is missing and we keep going. Because we
00781 have to return some token to replace the missing token,
00782 we have to conjure one up. This method gives the user control
00783 over the tokens returned for missing tokens. Mostly,
00784 you will want to create something special for identifier
00785 tokens. For literals such as '{' and ',', the default
00786 action in the parser or tree parser works. It simply creates
00787 a CommonToken of the appropriate type. The text will be the token.
00788 If you change what tokens must be created by the lexer,
00789 override this method to create the appropriate tokens.
00790 """
00791
00792 return None
00811 def consumeUntil(self, input, tokenTypes):
00812 """
00813 Consume tokens until one matches the given token or token set
00814
00815 tokenTypes can be a single token type or a set of token types
00816
00817 """
00818
00819 if not isinstance(tokenTypes, (set, frozenset)):
00820 tokenTypes = frozenset([tokenTypes])
00821
00822 ttype = input.LA(1)
00823 while ttype != EOF and ttype not in tokenTypes:
00824 input.consume()
00825 ttype = input.LA(1)
00826
00827
00828 def getRuleInvocationStack(self):
00829 """
00830 Return List<String> of the rules in your parser instance
00831 leading up to a call to this method. You could override if
00832 you want more details such as the file/line info of where
00833 in the parser java code a rule is invoked.
00834
00835 This is very useful for error messages and for context-sensitive
00836 error recovery.
00837
        You must be careful if you subclass a generated recognizer: the
        default implementation will only search the module of self for
        rules, but the subclass will not contain any rules.
00841 You probably want to override this method to look like
00842
00843 def getRuleInvocationStack(self):
00844 return self._getRuleInvocationStack(<class>.__module__)
00845
00846 where <class> is the class of the generated recognizer, e.g.
00847 the superclass of self.
00848 """
00849
00850 return self._getRuleInvocationStack(self.__module__)
00851
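    # Concretely, the override suggested in the docstring above looks like
    # this for a hand-written subclass of a generated parser (MyParser is a
    # hypothetical generated class):
    #
    #   class MyCustomParser(MyParser):
    #       def getRuleInvocationStack(self):
    #           return self._getRuleInvocationStack(MyParser.__module__)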
00852
00853 def _getRuleInvocationStack(cls, module):
00854 """
00855 A more general version of getRuleInvocationStack where you can
        pass in, for example, a RecognitionException to get its rule
00857 stack trace. This routine is shared with all recognizers, hence,
00858 static.
00859
00860 TODO: move to a utility class or something; weird having lexer call
00861 this
00862 """
00863
00864
00865
00866
00867
00868 rules = []
00869 for frame in reversed(inspect.stack()):
00870 code = frame[0].f_code
00871 codeMod = inspect.getmodule(code)
00872 if codeMod is None:
00873 continue
00874
00875
00876 if codeMod.__name__ != module:
00877 continue
00878
00879
00880 if code.co_name in ('nextToken', '<module>'):
00881 continue
00882
00883 rules.append(code.co_name)
00884
00885 return rules
00886
00887 _getRuleInvocationStack = classmethod(_getRuleInvocationStack)
00888
00889
00890 def getBacktrackingLevel(self):
00891 return self._state.backtracking
00892
00893 def setBacktrackingLevel(self, n):
00894 self._state.backtracking = n
00895
00896
00897 def failed(self):
00898 """Return whether or not a backtracking attempt failed."""
00899
00900 return self._state.failed
00901
00902
00903 def getGrammarFileName(self):
00904 """For debugging and other purposes, might want the grammar name.
00905
00906 Have ANTLR generate an implementation for this method.
00907 """
00908
00909 return self.grammarFileName
00910
00911
00912 def getSourceName(self):
00913 raise NotImplementedError
00914
00915
00916 def toStrings(self, tokens):
00917 """A convenience method for use most often with template rewrites.
00918
00919 Convert a List<Token> to List<String>
00920 """
00921
00922 if tokens is None:
00923 return None
00924
00925 return [token.text for token in tokens]
00926
00927
00928 def getRuleMemoization(self, ruleIndex, ruleStartIndex):
00929 """
00930 Given a rule number and a start token index number, return
00931 MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
00932 start index. If this rule has parsed input starting from the
00933 start index before, then return where the rule stopped parsing.
00934 It returns the index of the last token matched by the rule.
00935 """
00936
00937 if ruleIndex not in self._state.ruleMemo:
00938 self._state.ruleMemo[ruleIndex] = {}
00939
00940 return self._state.ruleMemo[ruleIndex].get(
00941 ruleStartIndex, self.MEMO_RULE_UNKNOWN
00942 )
00943
00944
00945 def alreadyParsedRule(self, input, ruleIndex):
00946 """
00947 Has this rule already parsed input at the current index in the
00948 input stream? Return the stop token index or MEMO_RULE_UNKNOWN.
00949 If we attempted but failed to parse properly before, return
00950 MEMO_RULE_FAILED.
00951
00952 This method has a side-effect: if we have seen this input for
00953 this rule and successfully parsed before, then seek ahead to
00954 1 past the stop token matched for this rule last time.
00955 """
00956
00957 stopIndex = self.getRuleMemoization(ruleIndex, input.index())
00958 if stopIndex == self.MEMO_RULE_UNKNOWN:
00959 return False
00960
00961 if stopIndex == self.MEMO_RULE_FAILED:
00962 raise BacktrackingFailed
00963
00964 else:
00965 input.seek(stopIndex + 1)
00966
00967 return True
00968
00969
00970 def memoize(self, input, ruleIndex, ruleStartIndex, success):
00971 """
00972 Record whether or not this rule parsed the input at this position
00973 successfully.
00974 """
00975
00976 if success:
00977 stopTokenIndex = input.index() - 1
00978 else:
00979 stopTokenIndex = self.MEMO_RULE_FAILED
00980
00981 if ruleIndex in self._state.ruleMemo:
00982 self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex
00983
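    # A sketch of how the memoization protocol above is meant to be used
    # around a rule when backtracking (loosely modelled on generated code;
    # the rule index 1 and the rule name are illustrative):
    #
    #   def my_rule(self):
    #       if self._state.backtracking > 0 and \
    #               self.alreadyParsedRule(self.input, 1):
    #           return
    #       start = self.input.index()
    #       try:
    #           # ... match the rule body ...
    #           self.memoize(self.input, 1, start, True)
    #       except BacktrackingFailed:
    #           self.memoize(self.input, 1, start, False)
    #           raise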
00984
00985 def traceIn(self, ruleName, ruleIndex, inputSymbol):
00986 sys.stdout.write("enter %s %s" % (ruleName, inputSymbol))
00987
00988 if self._state.backtracking > 0:
00989 sys.stdout.write(" backtracking=%s" % self._state.backtracking)
00990
00991 sys.stdout.write('\n')
00992
00993
00994 def traceOut(self, ruleName, ruleIndex, inputSymbol):
00995 sys.stdout.write("exit %s %s" % (ruleName, inputSymbol))
00996
00997 if self._state.backtracking > 0:
00998 sys.stdout.write(" backtracking=%s" % self._state.backtracking)
00999
01000 if self._state.failed:
01001 sys.stdout.write(" failed")
01002 else:
01003 sys.stdout.write(" succeeded")
01004
01005 sys.stdout.write('\n')
01006
01007
01008 class TokenSource(object):
01009 """
01010 @brief Abstract baseclass for token producers.
01011
    A source of tokens must provide a sequence of tokens via nextToken()
    and also must reveal its source of characters; CommonToken's text is
    computed from a CharStream; it only stores indices into the char stream.

    Errors from the lexer are never passed to the parser. Either you want
    to keep going or you do not upon a token recognition error. If you do
    not want to continue lexing then you do not want to continue parsing.
    Just throw an exception not derived from RecognitionException and it
    will propagate all the way out of the recognizers. If you want to
    continue lexing then you should not throw an exception to the
    parser--it has already requested a token. Keep lexing until you get a
    valid one; just report errors and keep going, looking for a valid token.
01024 """
01025
01026 def nextToken(self):
01027 """Return a Token object from your input stream (usually a CharStream).
01028
01029 Do not fail/return upon lexing error; keep chewing on the characters
01030 until you get a good one; errors are not passed through to the parser.
01031 """
01032
01033 raise NotImplementedError
01034
01035
01036 def __iter__(self):
        """The TokenSource is an iterator.
01038
01039 The iteration will not include the final EOF token, see also the note
01040 for the next() method.
01041
01042 """
01043
01044 return self
01045
01046
01047 def next(self):
01048 """Return next token or raise StopIteration.
01049
01050 Note that this will raise StopIteration when hitting the EOF token,
01051 so EOF will not be part of the iteration.
01052
01053 """
01054
01055 token = self.nextToken()
01056 if token is None or token.type == EOF:
01057 raise StopIteration
01058 return token
01059
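# Because a TokenSource implements the iterator protocol, a lexer (defined
# below) can be walked directly; the trailing EOF token is not yielded.  A
# sketch using a hypothetical generated lexer class (ANTLRStringStream lives
# in the streams module of this runtime):
#
#   lexer = MyLexer(ANTLRStringStream("some input"))
#   for token in lexer:
#       print token.type, repr(token.text)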
01060
01061 class Lexer(BaseRecognizer, TokenSource):
01062 """
01063 @brief Baseclass for generated lexer classes.
01064
    A lexer is a recognizer that draws input symbols from a character
    stream. Lexer grammars result in a subclass of this object. A Lexer
    object uses simplified match() and error recovery mechanisms in the
    interest of speed.
01069 """
01070
01071 def __init__(self, input, state=None):
01072 BaseRecognizer.__init__(self, state)
01073 TokenSource.__init__(self)
01074
01075
01076 self.input = input
01077
01078
01079 def reset(self):
01080 BaseRecognizer.reset(self)
01081
01082 if self.input is not None:
01083
01084 self.input.seek(0)
01085
01086 if self._state is None:
01087
01088 return
01089
01090
01091 self._state.token = None
01092 self._state.type = INVALID_TOKEN_TYPE
01093 self._state.channel = DEFAULT_CHANNEL
01094 self._state.tokenStartCharIndex = -1
01095 self._state.tokenStartLine = -1
01096 self._state.tokenStartCharPositionInLine = -1
01097 self._state.text = None
01098
01099
01100 def nextToken(self):
01101 """
01102 Return a token from this source; i.e., match a token on the char
01103 stream.
01104 """
01105
01106 while 1:
01107 self._state.token = None
01108 self._state.channel = DEFAULT_CHANNEL
01109 self._state.tokenStartCharIndex = self.input.index()
01110 self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
01111 self._state.tokenStartLine = self.input.line
01112 self._state.text = None
01113 if self.input.LA(1) == EOF:
01114 return EOF_TOKEN
01115
01116 try:
01117 self.mTokens()
01118
01119 if self._state.token is None:
01120 self.emit()
01121
01122 elif self._state.token == SKIP_TOKEN:
01123 continue
01124
01125 return self._state.token
01126
            except NoViableAltException, re:
                self.reportError(re)
                self.recover(re)  # throw out the current char and try again

            except RecognitionException, re:
                self.reportError(re)
                # match() routine has already called recover()
01134
01135
01136 def skip(self):
01137 """
01138 Instruct the lexer to skip creating a token for current lexer rule
01139 and look for another token. nextToken() knows to keep looking when
        a lexer rule finishes with token set to SKIP_TOKEN. Recall that
        if token is None at the end of any token rule, nextToken() creates
        one for you and emits it.
01143 """
01144
01145 self._state.token = SKIP_TOKEN
01146
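    # A whitespace rule in a lexer grammar typically calls skip() from its
    # action, e.g.  WS : (' '|'\t'|'\n')+ { self.skip() } ;  -- the generated
    # rule method then ends roughly like this sketch:
    #
    #   def mWS(self):
    #       # ... match one or more blanks ...
    #       self.skip()   # suppress the token; nextToken() keeps scanning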
01147
01148 def mTokens(self):
01149 """This is the lexer entry point that sets instance var 'token'"""
01150
01151
01152 raise NotImplementedError
01153
01154
01155 def setCharStream(self, input):
01156 """Set the char stream and reset the lexer"""
01157 self.input = None
01158 self.reset()
01159 self.input = input
01160
01161
01162 def getSourceName(self):
01163 return self.input.getSourceName()
01164
01165
01166 def emit(self, token=None):
01167 """
01168 The standard method called to automatically emit a token at the
01169 outermost lexical rule. The token object should point into the
01170 char buffer start..stop. If there is a text override in 'text',
01171 use that to set the token's text. Override this method to emit
01172 custom Token objects.
01173
01174 If you are building trees, then you should also override
01175 Parser or TreeParser.getMissingSymbol().
01176 """
01177
01178 if token is None:
01179 token = CommonToken(
01180 input=self.input,
01181 type=self._state.type,
01182 channel=self._state.channel,
01183 start=self._state.tokenStartCharIndex,
01184 stop=self.getCharIndex()-1
01185 )
01186 token.line = self._state.tokenStartLine
01187 token.text = self._state.text
01188 token.charPositionInLine = self._state.tokenStartCharPositionInLine
01189
01190 self._state.token = token
01191
01192 return token
01193
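    # emit() is the hook for producing custom token objects; a sketch of an
    # override (MyToken is a hypothetical CommonToken subclass):
    #
    #   def emit(self, token=None):
    #       if token is None:
    #           token = MyToken(
    #               input=self.input,
    #               type=self._state.type,
    #               channel=self._state.channel,
    #               start=self._state.tokenStartCharIndex,
    #               stop=self.getCharIndex() - 1)
    #           token.line = self._state.tokenStartLine
    #           token.text = self._state.text
    #           token.charPositionInLine = self._state.tokenStartCharPositionInLine
    #       self._state.token = token
    #       return token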
01194
01195 def match(self, s):
01196 if isinstance(s, basestring):
01197 for c in s:
01198 if self.input.LA(1) != ord(c):
01199 if self._state.backtracking > 0:
01200 raise BacktrackingFailed
01201
01202 mte = MismatchedTokenException(c, self.input)
01203 self.recover(mte)
01204 raise mte
01205
01206 self.input.consume()
01207
01208 else:
01209 if self.input.LA(1) != s:
01210 if self._state.backtracking > 0:
01211 raise BacktrackingFailed
01212
01213 mte = MismatchedTokenException(unichr(s), self.input)
01214 self.recover(mte)
01215 raise mte
01216
01217 self.input.consume()
01218
01219
01220 def matchAny(self):
01221 self.input.consume()
01222
01223
01224 def matchRange(self, a, b):
01225 if self.input.LA(1) < a or self.input.LA(1) > b:
01226 if self._state.backtracking > 0:
01227 raise BacktrackingFailed
01228
01229 mre = MismatchedRangeException(unichr(a), unichr(b), self.input)
01230 self.recover(mre)
01231 raise mre
01232
01233 self.input.consume()
01234
01235
01236 def getLine(self):
01237 return self.input.line
01238
01239
01240 def getCharPositionInLine(self):
01241 return self.input.charPositionInLine
01242
01243
01244 def getCharIndex(self):
01245 """What is the index of the current character of lookahead?"""
01246
01247 return self.input.index()
01248
01249
01250 def getText(self):
01251 """
01252 Return the text matched so far for the current token or any
01253 text override.
01254 """
01255 if self._state.text is not None:
01256 return self._state.text
01257
01258 return self.input.substring(
01259 self._state.tokenStartCharIndex,
01260 self.getCharIndex()-1
01261 )
01262
01263
01264 def setText(self, text):
01265 """
01266 Set the complete text of this token; it wipes any previous
01267 changes to the text.
01268 """
01269 self._state.text = text
01270
01271
01272 text = property(getText, setText)
01273
01274
01275 def reportError(self, e):
01276
01277
01278
01279
01280
01281
01282
01283
01284
01285
01286 self.displayRecognitionError(self.tokenNames, e)
01287
01288
01289 def getErrorMessage(self, e, tokenNames):
01290 msg = None
01291
01292 if isinstance(e, MismatchedTokenException):
01293 msg = "mismatched character " \
01294 + self.getCharErrorDisplay(e.c) \
01295 + " expecting " \
01296 + self.getCharErrorDisplay(e.expecting)
01297
01298 elif isinstance(e, NoViableAltException):
01299 msg = "no viable alternative at character " \
01300 + self.getCharErrorDisplay(e.c)
01301
01302 elif isinstance(e, EarlyExitException):
01303 msg = "required (...)+ loop did not match anything at character " \
01304 + self.getCharErrorDisplay(e.c)
01305
01306 elif isinstance(e, MismatchedNotSetException):
01307 msg = "mismatched character " \
01308 + self.getCharErrorDisplay(e.c) \
01309 + " expecting set " \
01310 + repr(e.expecting)
01311
01312 elif isinstance(e, MismatchedSetException):
01313 msg = "mismatched character " \
01314 + self.getCharErrorDisplay(e.c) \
01315 + " expecting set " \
01316 + repr(e.expecting)
01317
01318 elif isinstance(e, MismatchedRangeException):
01319 msg = "mismatched character " \
01320 + self.getCharErrorDisplay(e.c) \
01321 + " expecting set " \
01322 + self.getCharErrorDisplay(e.a) \
01323 + ".." \
01324 + self.getCharErrorDisplay(e.b)
01325
01326 else:
01327 msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
01328
01329 return msg
01330
01331
01332 def getCharErrorDisplay(self, c):
01333 if c == EOF:
01334 c = '<EOF>'
01335 return repr(c)
01336
01337
01338 def recover(self, re):
01339 """
        A lexer can normally match any char in its vocabulary after matching
01341 a token, so do the easy thing and just kill a character and hope
01342 it all works out. You can instead use the rule invocation stack
01343 to do sophisticated error recovery if you are in a fragment rule.
01344 """
01345
01346 self.input.consume()
01347
01348
01349 def traceIn(self, ruleName, ruleIndex):
01350 inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
01351 self.getLine(),
01352 self.getCharPositionInLine()
01353 )
01354
01355 BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)
01356
01357
01358 def traceOut(self, ruleName, ruleIndex):
01359 inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
01360 self.getLine(),
01361 self.getCharPositionInLine()
01362 )
01363
01364 BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)
01365
01366
01367
01368 class Parser(BaseRecognizer):
01369 """
01370 @brief Baseclass for generated parser classes.
01371 """
01372
01373 def __init__(self, lexer, state=None):
01374 BaseRecognizer.__init__(self, state)
01375
01376 self.setTokenStream(lexer)
01377
01378
01379 def reset(self):
01380 BaseRecognizer.reset(self)
01381 if self.input is not None:
01382 self.input.seek(0)
01383
01384
01385 def getCurrentInputSymbol(self, input):
01386 return input.LT(1)
01387
01388
01389 def getMissingSymbol(self, input, e, expectedTokenType, follow):
01390 if expectedTokenType == EOF:
01391 tokenText = "<missing EOF>"
01392 else:
01393 tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"
01394 t = CommonToken(type=expectedTokenType, text=tokenText)
01395 current = input.LT(1)
01396 if current.type == EOF:
01397 current = input.LT(-1)
01398
01399 if current is not None:
01400 t.line = current.line
01401 t.charPositionInLine = current.charPositionInLine
01402 t.channel = DEFAULT_CHANNEL
01403 return t
01404
01405
01406 def setTokenStream(self, input):
01407 """Set the token stream and reset the parser"""
01408
01409 self.input = None
01410 self.reset()
01411 self.input = input
01412
01413
01414 def getTokenStream(self):
01415 return self.input
01416
01417
01418 def getSourceName(self):
01419 return self.input.getSourceName()
01420
01421
01422 def traceIn(self, ruleName, ruleIndex):
01423 BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))
01424
01425
01426 def traceOut(self, ruleName, ruleIndex):
01427 BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
01428
01429
01430 class RuleReturnScope(object):
01431 """
01432 Rules can return start/stop info as well as possible trees and templates.
01433 """
01434
01435 def getStart(self):
01436 """Return the start token or tree."""
01437 return None
01438
01439
01440 def getStop(self):
01441 """Return the stop token or tree."""
01442 return None
01443
01444
01445 def getTree(self):
01446 """Has a value potentially if output=AST."""
01447 return None
01448
01449
01450 def getTemplate(self):
01451 """Has a value potentially if output=template."""
01452 return None
01453
01454
01455 class ParserRuleReturnScope(RuleReturnScope):
01456 """
01457 Rules that return more than a single value must return an object
01458 containing all the values. Besides the properties defined in
01459 RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
01460 return values. This class simply defines the minimum properties that
01461 are always defined and methods to access the others that might be
01462 available depending on output option such as template and tree.
01463
    Note that text is not an actual property of the return value; it is
    computed from start and stop using the input stream's toString() method. I
01466 could add a ctor to this so that we can pass in and store the input
01467 stream, but I'm not sure we want to do that. It would seem to be undefined
01468 to get the .text property anyway if the rule matches tokens from multiple
01469 input streams.
01470
01471 I do not use getters for fields of objects that are used simply to
01472 group values such as this aggregate. The getters/setters are there to
01473 satisfy the superclass interface.
01474 """
01475
01476 def __init__(self):
01477 self.start = None
01478 self.stop = None
01479
01480
01481 def getStart(self):
01482 return self.start
01483
01484
01485 def getStop(self):
01486 return self.stop
01487