1*16467b97STreehugger Robot"""ANTLR3 runtime package""" 2*16467b97STreehugger Robot 3*16467b97STreehugger Robot# begin[licence] 4*16467b97STreehugger Robot# 5*16467b97STreehugger Robot# [The "BSD licence"] 6*16467b97STreehugger Robot# Copyright (c) 2005-2008 Terence Parr 7*16467b97STreehugger Robot# All rights reserved. 8*16467b97STreehugger Robot# 9*16467b97STreehugger Robot# Redistribution and use in source and binary forms, with or without 10*16467b97STreehugger Robot# modification, are permitted provided that the following conditions 11*16467b97STreehugger Robot# are met: 12*16467b97STreehugger Robot# 1. Redistributions of source code must retain the above copyright 13*16467b97STreehugger Robot# notice, this list of conditions and the following disclaimer. 14*16467b97STreehugger Robot# 2. Redistributions in binary form must reproduce the above copyright 15*16467b97STreehugger Robot# notice, this list of conditions and the following disclaimer in the 16*16467b97STreehugger Robot# documentation and/or other materials provided with the distribution. 17*16467b97STreehugger Robot# 3. The name of the author may not be used to endorse or promote products 18*16467b97STreehugger Robot# derived from this software without specific prior written permission. 19*16467b97STreehugger Robot# 20*16467b97STreehugger Robot# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21*16467b97STreehugger Robot# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22*16467b97STreehugger Robot# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
23*16467b97STreehugger Robot# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24*16467b97STreehugger Robot# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25*16467b97STreehugger Robot# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26*16467b97STreehugger Robot# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27*16467b97STreehugger Robot# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28*16467b97STreehugger Robot# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29*16467b97STreehugger Robot# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30*16467b97STreehugger Robot# 31*16467b97STreehugger Robot# end[licence] 32*16467b97STreehugger Robot 33*16467b97STreehugger Robotimport sys 34*16467b97STreehugger Robotimport inspect 35*16467b97STreehugger Robot 36*16467b97STreehugger Robotfrom antlr3 import compatible_api_versions 37*16467b97STreehugger Robotfrom antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \ 38*16467b97STreehugger Robot EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE 39*16467b97STreehugger Robotfrom antlr3.exceptions import RecognitionException, MismatchedTokenException, \ 40*16467b97STreehugger Robot MismatchedRangeException, MismatchedTreeNodeException, \ 41*16467b97STreehugger Robot NoViableAltException, EarlyExitException, MismatchedSetException, \ 42*16467b97STreehugger Robot MismatchedNotSetException, FailedPredicateException, \ 43*16467b97STreehugger Robot BacktrackingFailed, UnwantedTokenException, MissingTokenException 44*16467b97STreehugger Robotfrom antlr3.tokens import CommonToken, SKIP_TOKEN 45*16467b97STreehugger Robotfrom antlr3.compat import set, frozenset, reversed 46*16467b97STreehugger Robot 47*16467b97STreehugger Robot 48*16467b97STreehugger Robotclass RecognizerSharedState(object): 49*16467b97STreehugger Robot """ 50*16467b97STreehugger Robot The set of fields needed by an abstract 
recognizer to recognize input 51*16467b97STreehugger Robot and recover from errors etc... As a separate state object, it can be 52*16467b97STreehugger Robot shared among multiple grammars; e.g., when one grammar imports another. 53*16467b97STreehugger Robot 54*16467b97STreehugger Robot These fields are publically visible but the actual state pointer per 55*16467b97STreehugger Robot parser is protected. 56*16467b97STreehugger Robot """ 57*16467b97STreehugger Robot 58*16467b97STreehugger Robot def __init__(self): 59*16467b97STreehugger Robot # Track the set of token types that can follow any rule invocation. 60*16467b97STreehugger Robot # Stack grows upwards. 61*16467b97STreehugger Robot self.following = [] 62*16467b97STreehugger Robot 63*16467b97STreehugger Robot # This is true when we see an error and before having successfully 64*16467b97STreehugger Robot # matched a token. Prevents generation of more than one error message 65*16467b97STreehugger Robot # per error. 66*16467b97STreehugger Robot self.errorRecovery = False 67*16467b97STreehugger Robot 68*16467b97STreehugger Robot # The index into the input stream where the last error occurred. 69*16467b97STreehugger Robot # This is used to prevent infinite loops where an error is found 70*16467b97STreehugger Robot # but no token is consumed during recovery...another error is found, 71*16467b97STreehugger Robot # ad naseum. This is a failsafe mechanism to guarantee that at least 72*16467b97STreehugger Robot # one token/tree node is consumed for two errors. 73*16467b97STreehugger Robot self.lastErrorIndex = -1 74*16467b97STreehugger Robot 75*16467b97STreehugger Robot # If 0, no backtracking is going on. Safe to exec actions etc... 76*16467b97STreehugger Robot # If >0 then it's the level of backtracking. 
77*16467b97STreehugger Robot self.backtracking = 0 78*16467b97STreehugger Robot 79*16467b97STreehugger Robot # An array[size num rules] of Map<Integer,Integer> that tracks 80*16467b97STreehugger Robot # the stop token index for each rule. ruleMemo[ruleIndex] is 81*16467b97STreehugger Robot # the memoization table for ruleIndex. For key ruleStartIndex, you 82*16467b97STreehugger Robot # get back the stop token for associated rule or MEMO_RULE_FAILED. 83*16467b97STreehugger Robot # 84*16467b97STreehugger Robot # This is only used if rule memoization is on (which it is by default). 85*16467b97STreehugger Robot self.ruleMemo = None 86*16467b97STreehugger Robot 87*16467b97STreehugger Robot ## Did the recognizer encounter a syntax error? Track how many. 88*16467b97STreehugger Robot self.syntaxErrors = 0 89*16467b97STreehugger Robot 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot # LEXER FIELDS (must be in same state object to avoid casting 92*16467b97STreehugger Robot # constantly in generated code and Lexer object) :( 93*16467b97STreehugger Robot 94*16467b97STreehugger Robot 95*16467b97STreehugger Robot ## The goal of all lexer rules/methods is to create a token object. 96*16467b97STreehugger Robot # This is an instance variable as multiple rules may collaborate to 97*16467b97STreehugger Robot # create a single token. nextToken will return this object after 98*16467b97STreehugger Robot # matching lexer rule(s). If you subclass to allow multiple token 99*16467b97STreehugger Robot # emissions, then set this to the last token to be matched or 100*16467b97STreehugger Robot # something nonnull so that the auto token emit mechanism will not 101*16467b97STreehugger Robot # emit another token. 102*16467b97STreehugger Robot self.token = None 103*16467b97STreehugger Robot 104*16467b97STreehugger Robot ## What character index in the stream did the current token start at? 105*16467b97STreehugger Robot # Needed, for example, to get the text for current token. 
Set at 106*16467b97STreehugger Robot # the start of nextToken. 107*16467b97STreehugger Robot self.tokenStartCharIndex = -1 108*16467b97STreehugger Robot 109*16467b97STreehugger Robot ## The line on which the first character of the token resides 110*16467b97STreehugger Robot self.tokenStartLine = None 111*16467b97STreehugger Robot 112*16467b97STreehugger Robot ## The character position of first character within the line 113*16467b97STreehugger Robot self.tokenStartCharPositionInLine = None 114*16467b97STreehugger Robot 115*16467b97STreehugger Robot ## The channel number for the current token 116*16467b97STreehugger Robot self.channel = None 117*16467b97STreehugger Robot 118*16467b97STreehugger Robot ## The token type for the current token 119*16467b97STreehugger Robot self.type = None 120*16467b97STreehugger Robot 121*16467b97STreehugger Robot ## You can set the text for the current token to override what is in 122*16467b97STreehugger Robot # the input char buffer. Use setText() or can set this instance var. 123*16467b97STreehugger Robot self.text = None 124*16467b97STreehugger Robot 125*16467b97STreehugger Robot 126*16467b97STreehugger Robotclass BaseRecognizer(object): 127*16467b97STreehugger Robot """ 128*16467b97STreehugger Robot @brief Common recognizer functionality. 129*16467b97STreehugger Robot 130*16467b97STreehugger Robot A generic recognizer that can handle recognizers generated from 131*16467b97STreehugger Robot lexer, parser, and tree grammars. This is all the parsing 132*16467b97STreehugger Robot support code essentially; most of it is error recovery stuff and 133*16467b97STreehugger Robot backtracking. 
134*16467b97STreehugger Robot """ 135*16467b97STreehugger Robot 136*16467b97STreehugger Robot MEMO_RULE_FAILED = -2 137*16467b97STreehugger Robot MEMO_RULE_UNKNOWN = -1 138*16467b97STreehugger Robot 139*16467b97STreehugger Robot # copies from Token object for convenience in actions 140*16467b97STreehugger Robot DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL 141*16467b97STreehugger Robot 142*16467b97STreehugger Robot # for convenience in actions 143*16467b97STreehugger Robot HIDDEN = HIDDEN_CHANNEL 144*16467b97STreehugger Robot 145*16467b97STreehugger Robot # overridden by generated subclasses 146*16467b97STreehugger Robot tokenNames = None 147*16467b97STreehugger Robot 148*16467b97STreehugger Robot # The api_version attribute has been introduced in 3.3. If it is not 149*16467b97STreehugger Robot # overwritten in the generated recognizer, we assume a default of v0. 150*16467b97STreehugger Robot api_version = 0 151*16467b97STreehugger Robot 152*16467b97STreehugger Robot def __init__(self, state=None): 153*16467b97STreehugger Robot # Input stream of the recognizer. Must be initialized by a subclass. 154*16467b97STreehugger Robot self.input = None 155*16467b97STreehugger Robot 156*16467b97STreehugger Robot ## State of a lexer, parser, or tree parser are collected into a state 157*16467b97STreehugger Robot # object so the state can be shared. This sharing is needed to 158*16467b97STreehugger Robot # have one grammar import others and share same error variables 159*16467b97STreehugger Robot # and other state variables. It's a kind of explicit multiple 160*16467b97STreehugger Robot # inheritance via delegation of methods and shared state. 
161*16467b97STreehugger Robot if state is None: 162*16467b97STreehugger Robot state = RecognizerSharedState() 163*16467b97STreehugger Robot self._state = state 164*16467b97STreehugger Robot 165*16467b97STreehugger Robot if self.api_version not in compatible_api_versions: 166*16467b97STreehugger Robot raise RuntimeError( 167*16467b97STreehugger Robot ("ANTLR version mismatch: " 168*16467b97STreehugger Robot "The recognizer has been generated with API V%s, " 169*16467b97STreehugger Robot "but this runtime does not support this.") 170*16467b97STreehugger Robot % self.api_version) 171*16467b97STreehugger Robot 172*16467b97STreehugger Robot # this one only exists to shut up pylint :( 173*16467b97STreehugger Robot def setInput(self, input): 174*16467b97STreehugger Robot self.input = input 175*16467b97STreehugger Robot 176*16467b97STreehugger Robot 177*16467b97STreehugger Robot def reset(self): 178*16467b97STreehugger Robot """ 179*16467b97STreehugger Robot reset the parser's state; subclasses must rewinds the input stream 180*16467b97STreehugger Robot """ 181*16467b97STreehugger Robot 182*16467b97STreehugger Robot # wack everything related to error recovery 183*16467b97STreehugger Robot if self._state is None: 184*16467b97STreehugger Robot # no shared state work to do 185*16467b97STreehugger Robot return 186*16467b97STreehugger Robot 187*16467b97STreehugger Robot self._state.following = [] 188*16467b97STreehugger Robot self._state.errorRecovery = False 189*16467b97STreehugger Robot self._state.lastErrorIndex = -1 190*16467b97STreehugger Robot self._state.syntaxErrors = 0 191*16467b97STreehugger Robot # wack everything related to backtracking and memoization 192*16467b97STreehugger Robot self._state.backtracking = 0 193*16467b97STreehugger Robot if self._state.ruleMemo is not None: 194*16467b97STreehugger Robot self._state.ruleMemo = {} 195*16467b97STreehugger Robot 196*16467b97STreehugger Robot 197*16467b97STreehugger Robot def match(self, input, ttype, follow): 
198*16467b97STreehugger Robot """ 199*16467b97STreehugger Robot Match current input symbol against ttype. Attempt 200*16467b97STreehugger Robot single token insertion or deletion error recovery. If 201*16467b97STreehugger Robot that fails, throw MismatchedTokenException. 202*16467b97STreehugger Robot 203*16467b97STreehugger Robot To turn off single token insertion or deletion error 204*16467b97STreehugger Robot recovery, override recoverFromMismatchedToken() and have it 205*16467b97STreehugger Robot throw an exception. See TreeParser.recoverFromMismatchedToken(). 206*16467b97STreehugger Robot This way any error in a rule will cause an exception and 207*16467b97STreehugger Robot immediate exit from rule. Rule would recover by resynchronizing 208*16467b97STreehugger Robot to the set of symbols that can follow rule ref. 209*16467b97STreehugger Robot """ 210*16467b97STreehugger Robot 211*16467b97STreehugger Robot matchedSymbol = self.getCurrentInputSymbol(input) 212*16467b97STreehugger Robot if self.input.LA(1) == ttype: 213*16467b97STreehugger Robot self.input.consume() 214*16467b97STreehugger Robot self._state.errorRecovery = False 215*16467b97STreehugger Robot return matchedSymbol 216*16467b97STreehugger Robot 217*16467b97STreehugger Robot if self._state.backtracking > 0: 218*16467b97STreehugger Robot # FIXME: need to return matchedSymbol here as well. damn!! 
219*16467b97STreehugger Robot raise BacktrackingFailed 220*16467b97STreehugger Robot 221*16467b97STreehugger Robot matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow) 222*16467b97STreehugger Robot return matchedSymbol 223*16467b97STreehugger Robot 224*16467b97STreehugger Robot 225*16467b97STreehugger Robot def matchAny(self, input): 226*16467b97STreehugger Robot """Match the wildcard: in a symbol""" 227*16467b97STreehugger Robot 228*16467b97STreehugger Robot self._state.errorRecovery = False 229*16467b97STreehugger Robot self.input.consume() 230*16467b97STreehugger Robot 231*16467b97STreehugger Robot 232*16467b97STreehugger Robot def mismatchIsUnwantedToken(self, input, ttype): 233*16467b97STreehugger Robot return input.LA(2) == ttype 234*16467b97STreehugger Robot 235*16467b97STreehugger Robot 236*16467b97STreehugger Robot def mismatchIsMissingToken(self, input, follow): 237*16467b97STreehugger Robot if follow is None: 238*16467b97STreehugger Robot # we have no information about the follow; we can only consume 239*16467b97STreehugger Robot # a single token and hope for the best 240*16467b97STreehugger Robot return False 241*16467b97STreehugger Robot 242*16467b97STreehugger Robot # compute what can follow this grammar element reference 243*16467b97STreehugger Robot if EOR_TOKEN_TYPE in follow: 244*16467b97STreehugger Robot viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW() 245*16467b97STreehugger Robot follow = follow | viableTokensFollowingThisRule 246*16467b97STreehugger Robot 247*16467b97STreehugger Robot if len(self._state.following) > 0: 248*16467b97STreehugger Robot # remove EOR if we're not the start symbol 249*16467b97STreehugger Robot follow = follow - set([EOR_TOKEN_TYPE]) 250*16467b97STreehugger Robot 251*16467b97STreehugger Robot # if current token is consistent with what could come after set 252*16467b97STreehugger Robot # then we know we're missing a token; error recovery is free to 253*16467b97STreehugger Robot 
# "insert" the missing token 254*16467b97STreehugger Robot if input.LA(1) in follow or EOR_TOKEN_TYPE in follow: 255*16467b97STreehugger Robot return True 256*16467b97STreehugger Robot 257*16467b97STreehugger Robot return False 258*16467b97STreehugger Robot 259*16467b97STreehugger Robot 260*16467b97STreehugger Robot def reportError(self, e): 261*16467b97STreehugger Robot """Report a recognition problem. 262*16467b97STreehugger Robot 263*16467b97STreehugger Robot This method sets errorRecovery to indicate the parser is recovering 264*16467b97STreehugger Robot not parsing. Once in recovery mode, no errors are generated. 265*16467b97STreehugger Robot To get out of recovery mode, the parser must successfully match 266*16467b97STreehugger Robot a token (after a resync). So it will go: 267*16467b97STreehugger Robot 268*16467b97STreehugger Robot 1. error occurs 269*16467b97STreehugger Robot 2. enter recovery mode, report error 270*16467b97STreehugger Robot 3. consume until token found in resynch set 271*16467b97STreehugger Robot 4. try to resume parsing 272*16467b97STreehugger Robot 5. next match() will reset errorRecovery mode 273*16467b97STreehugger Robot 274*16467b97STreehugger Robot If you override, make sure to update syntaxErrors if you care about 275*16467b97STreehugger Robot that. 276*16467b97STreehugger Robot 277*16467b97STreehugger Robot """ 278*16467b97STreehugger Robot 279*16467b97STreehugger Robot # if we've already reported an error and have not matched a token 280*16467b97STreehugger Robot # yet successfully, don't report any errors. 
281*16467b97STreehugger Robot if self._state.errorRecovery: 282*16467b97STreehugger Robot return 283*16467b97STreehugger Robot 284*16467b97STreehugger Robot self._state.syntaxErrors += 1 # don't count spurious 285*16467b97STreehugger Robot self._state.errorRecovery = True 286*16467b97STreehugger Robot 287*16467b97STreehugger Robot self.displayRecognitionError(self.tokenNames, e) 288*16467b97STreehugger Robot 289*16467b97STreehugger Robot 290*16467b97STreehugger Robot def displayRecognitionError(self, tokenNames, e): 291*16467b97STreehugger Robot hdr = self.getErrorHeader(e) 292*16467b97STreehugger Robot msg = self.getErrorMessage(e, tokenNames) 293*16467b97STreehugger Robot self.emitErrorMessage(hdr+" "+msg) 294*16467b97STreehugger Robot 295*16467b97STreehugger Robot 296*16467b97STreehugger Robot def getErrorMessage(self, e, tokenNames): 297*16467b97STreehugger Robot """ 298*16467b97STreehugger Robot What error message should be generated for the various 299*16467b97STreehugger Robot exception types? 300*16467b97STreehugger Robot 301*16467b97STreehugger Robot Not very object-oriented code, but I like having all error message 302*16467b97STreehugger Robot generation within one method rather than spread among all of the 303*16467b97STreehugger Robot exception classes. This also makes it much easier for the exception 304*16467b97STreehugger Robot handling because the exception classes do not have to have pointers back 305*16467b97STreehugger Robot to this object to access utility routines and so on. Also, changing 306*16467b97STreehugger Robot the message for an exception type would be difficult because you 307*16467b97STreehugger Robot would have to subclassing exception, but then somehow get ANTLR 308*16467b97STreehugger Robot to make those kinds of exception objects instead of the default. 309*16467b97STreehugger Robot This looks weird, but trust me--it makes the most sense in terms 310*16467b97STreehugger Robot of flexibility. 
311*16467b97STreehugger Robot 312*16467b97STreehugger Robot For grammar debugging, you will want to override this to add 313*16467b97STreehugger Robot more information such as the stack frame with 314*16467b97STreehugger Robot getRuleInvocationStack(e, this.getClass().getName()) and, 315*16467b97STreehugger Robot for no viable alts, the decision description and state etc... 316*16467b97STreehugger Robot 317*16467b97STreehugger Robot Override this to change the message generated for one or more 318*16467b97STreehugger Robot exception types. 319*16467b97STreehugger Robot """ 320*16467b97STreehugger Robot 321*16467b97STreehugger Robot if isinstance(e, UnwantedTokenException): 322*16467b97STreehugger Robot tokenName = "<unknown>" 323*16467b97STreehugger Robot if e.expecting == EOF: 324*16467b97STreehugger Robot tokenName = "EOF" 325*16467b97STreehugger Robot 326*16467b97STreehugger Robot else: 327*16467b97STreehugger Robot tokenName = self.tokenNames[e.expecting] 328*16467b97STreehugger Robot 329*16467b97STreehugger Robot msg = "extraneous input %s expecting %s" % ( 330*16467b97STreehugger Robot self.getTokenErrorDisplay(e.getUnexpectedToken()), 331*16467b97STreehugger Robot tokenName 332*16467b97STreehugger Robot ) 333*16467b97STreehugger Robot 334*16467b97STreehugger Robot elif isinstance(e, MissingTokenException): 335*16467b97STreehugger Robot tokenName = "<unknown>" 336*16467b97STreehugger Robot if e.expecting == EOF: 337*16467b97STreehugger Robot tokenName = "EOF" 338*16467b97STreehugger Robot 339*16467b97STreehugger Robot else: 340*16467b97STreehugger Robot tokenName = self.tokenNames[e.expecting] 341*16467b97STreehugger Robot 342*16467b97STreehugger Robot msg = "missing %s at %s" % ( 343*16467b97STreehugger Robot tokenName, self.getTokenErrorDisplay(e.token) 344*16467b97STreehugger Robot ) 345*16467b97STreehugger Robot 346*16467b97STreehugger Robot elif isinstance(e, MismatchedTokenException): 347*16467b97STreehugger Robot tokenName = "<unknown>" 
348*16467b97STreehugger Robot if e.expecting == EOF: 349*16467b97STreehugger Robot tokenName = "EOF" 350*16467b97STreehugger Robot else: 351*16467b97STreehugger Robot tokenName = self.tokenNames[e.expecting] 352*16467b97STreehugger Robot 353*16467b97STreehugger Robot msg = "mismatched input " \ 354*16467b97STreehugger Robot + self.getTokenErrorDisplay(e.token) \ 355*16467b97STreehugger Robot + " expecting " \ 356*16467b97STreehugger Robot + tokenName 357*16467b97STreehugger Robot 358*16467b97STreehugger Robot elif isinstance(e, MismatchedTreeNodeException): 359*16467b97STreehugger Robot tokenName = "<unknown>" 360*16467b97STreehugger Robot if e.expecting == EOF: 361*16467b97STreehugger Robot tokenName = "EOF" 362*16467b97STreehugger Robot else: 363*16467b97STreehugger Robot tokenName = self.tokenNames[e.expecting] 364*16467b97STreehugger Robot 365*16467b97STreehugger Robot msg = "mismatched tree node: %s expecting %s" \ 366*16467b97STreehugger Robot % (e.node, tokenName) 367*16467b97STreehugger Robot 368*16467b97STreehugger Robot elif isinstance(e, NoViableAltException): 369*16467b97STreehugger Robot msg = "no viable alternative at input " \ 370*16467b97STreehugger Robot + self.getTokenErrorDisplay(e.token) 371*16467b97STreehugger Robot 372*16467b97STreehugger Robot elif isinstance(e, EarlyExitException): 373*16467b97STreehugger Robot msg = "required (...)+ loop did not match anything at input " \ 374*16467b97STreehugger Robot + self.getTokenErrorDisplay(e.token) 375*16467b97STreehugger Robot 376*16467b97STreehugger Robot elif isinstance(e, MismatchedSetException): 377*16467b97STreehugger Robot msg = "mismatched input " \ 378*16467b97STreehugger Robot + self.getTokenErrorDisplay(e.token) \ 379*16467b97STreehugger Robot + " expecting set " \ 380*16467b97STreehugger Robot + repr(e.expecting) 381*16467b97STreehugger Robot 382*16467b97STreehugger Robot elif isinstance(e, MismatchedNotSetException): 383*16467b97STreehugger Robot msg = "mismatched input " \ 
384*16467b97STreehugger Robot + self.getTokenErrorDisplay(e.token) \ 385*16467b97STreehugger Robot + " expecting set " \ 386*16467b97STreehugger Robot + repr(e.expecting) 387*16467b97STreehugger Robot 388*16467b97STreehugger Robot elif isinstance(e, FailedPredicateException): 389*16467b97STreehugger Robot msg = "rule " \ 390*16467b97STreehugger Robot + e.ruleName \ 391*16467b97STreehugger Robot + " failed predicate: {" \ 392*16467b97STreehugger Robot + e.predicateText \ 393*16467b97STreehugger Robot + "}?" 394*16467b97STreehugger Robot 395*16467b97STreehugger Robot else: 396*16467b97STreehugger Robot msg = str(e) 397*16467b97STreehugger Robot 398*16467b97STreehugger Robot return msg 399*16467b97STreehugger Robot 400*16467b97STreehugger Robot 401*16467b97STreehugger Robot def getNumberOfSyntaxErrors(self): 402*16467b97STreehugger Robot """ 403*16467b97STreehugger Robot Get number of recognition errors (lexer, parser, tree parser). Each 404*16467b97STreehugger Robot recognizer tracks its own number. So parser and lexer each have 405*16467b97STreehugger Robot separate count. Does not count the spurious errors found between 406*16467b97STreehugger Robot an error and next valid token match 407*16467b97STreehugger Robot 408*16467b97STreehugger Robot See also reportError() 409*16467b97STreehugger Robot """ 410*16467b97STreehugger Robot return self._state.syntaxErrors 411*16467b97STreehugger Robot 412*16467b97STreehugger Robot 413*16467b97STreehugger Robot def getErrorHeader(self, e): 414*16467b97STreehugger Robot """ 415*16467b97STreehugger Robot What is the error header, normally line/character position information? 
416*16467b97STreehugger Robot """ 417*16467b97STreehugger Robot 418*16467b97STreehugger Robot source_name = self.getSourceName() 419*16467b97STreehugger Robot if source_name is not None: 420*16467b97STreehugger Robot return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine) 421*16467b97STreehugger Robot return "line %d:%d" % (e.line, e.charPositionInLine) 422*16467b97STreehugger Robot 423*16467b97STreehugger Robot 424*16467b97STreehugger Robot def getTokenErrorDisplay(self, t): 425*16467b97STreehugger Robot """ 426*16467b97STreehugger Robot How should a token be displayed in an error message? The default 427*16467b97STreehugger Robot is to display just the text, but during development you might 428*16467b97STreehugger Robot want to have a lot of information spit out. Override in that case 429*16467b97STreehugger Robot to use t.toString() (which, for CommonToken, dumps everything about 430*16467b97STreehugger Robot the token). This is better than forcing you to override a method in 431*16467b97STreehugger Robot your token objects because you don't have to go modify your lexer 432*16467b97STreehugger Robot so that it creates a new Java type. 
433*16467b97STreehugger Robot """ 434*16467b97STreehugger Robot 435*16467b97STreehugger Robot s = t.text 436*16467b97STreehugger Robot if s is None: 437*16467b97STreehugger Robot if t.type == EOF: 438*16467b97STreehugger Robot s = "<EOF>" 439*16467b97STreehugger Robot else: 440*16467b97STreehugger Robot s = "<"+t.type+">" 441*16467b97STreehugger Robot 442*16467b97STreehugger Robot return repr(s) 443*16467b97STreehugger Robot 444*16467b97STreehugger Robot 445*16467b97STreehugger Robot def emitErrorMessage(self, msg): 446*16467b97STreehugger Robot """Override this method to change where error messages go""" 447*16467b97STreehugger Robot sys.stderr.write(msg + '\n') 448*16467b97STreehugger Robot 449*16467b97STreehugger Robot 450*16467b97STreehugger Robot def recover(self, input, re): 451*16467b97STreehugger Robot """ 452*16467b97STreehugger Robot Recover from an error found on the input stream. This is 453*16467b97STreehugger Robot for NoViableAlt and mismatched symbol exceptions. If you enable 454*16467b97STreehugger Robot single token insertion and deletion, this will usually not 455*16467b97STreehugger Robot handle mismatched symbol exceptions but there could be a mismatched 456*16467b97STreehugger Robot token that the match() routine could not recover from. 457*16467b97STreehugger Robot """ 458*16467b97STreehugger Robot 459*16467b97STreehugger Robot # PROBLEM? what if input stream is not the same as last time 460*16467b97STreehugger Robot # perhaps make lastErrorIndex a member of input 461*16467b97STreehugger Robot if self._state.lastErrorIndex == input.index(): 462*16467b97STreehugger Robot # uh oh, another error at same token index; must be a case 463*16467b97STreehugger Robot # where LT(1) is in the recovery token set so nothing is 464*16467b97STreehugger Robot # consumed; consume a single token so at least to prevent 465*16467b97STreehugger Robot # an infinite loop; this is a failsafe. 
466*16467b97STreehugger Robot input.consume() 467*16467b97STreehugger Robot 468*16467b97STreehugger Robot self._state.lastErrorIndex = input.index() 469*16467b97STreehugger Robot followSet = self.computeErrorRecoverySet() 470*16467b97STreehugger Robot 471*16467b97STreehugger Robot self.beginResync() 472*16467b97STreehugger Robot self.consumeUntil(input, followSet) 473*16467b97STreehugger Robot self.endResync() 474*16467b97STreehugger Robot 475*16467b97STreehugger Robot 476*16467b97STreehugger Robot def beginResync(self): 477*16467b97STreehugger Robot """ 478*16467b97STreehugger Robot A hook to listen in on the token consumption during error recovery. 479*16467b97STreehugger Robot The DebugParser subclasses this to fire events to the listenter. 480*16467b97STreehugger Robot """ 481*16467b97STreehugger Robot 482*16467b97STreehugger Robot pass 483*16467b97STreehugger Robot 484*16467b97STreehugger Robot 485*16467b97STreehugger Robot def endResync(self): 486*16467b97STreehugger Robot """ 487*16467b97STreehugger Robot A hook to listen in on the token consumption during error recovery. 488*16467b97STreehugger Robot The DebugParser subclasses this to fire events to the listenter. 489*16467b97STreehugger Robot """ 490*16467b97STreehugger Robot 491*16467b97STreehugger Robot pass 492*16467b97STreehugger Robot 493*16467b97STreehugger Robot 494*16467b97STreehugger Robot def computeErrorRecoverySet(self): 495*16467b97STreehugger Robot """ 496*16467b97STreehugger Robot Compute the error recovery set for the current rule. During 497*16467b97STreehugger Robot rule invocation, the parser pushes the set of tokens that can 498*16467b97STreehugger Robot follow that rule reference on the stack; this amounts to 499*16467b97STreehugger Robot computing FIRST of what follows the rule reference in the 500*16467b97STreehugger Robot enclosing rule. 
This local follow set only includes tokens 501*16467b97STreehugger Robot from within the rule; i.e., the FIRST computation done by 502*16467b97STreehugger Robot ANTLR stops at the end of a rule. 503*16467b97STreehugger Robot 504*16467b97STreehugger Robot EXAMPLE 505*16467b97STreehugger Robot 506*16467b97STreehugger Robot When you find a "no viable alt exception", the input is not 507*16467b97STreehugger Robot consistent with any of the alternatives for rule r. The best 508*16467b97STreehugger Robot thing to do is to consume tokens until you see something that 509*16467b97STreehugger Robot can legally follow a call to r *or* any rule that called r. 510*16467b97STreehugger Robot You don't want the exact set of viable next tokens because the 511*16467b97STreehugger Robot input might just be missing a token--you might consume the 512*16467b97STreehugger Robot rest of the input looking for one of the missing tokens. 513*16467b97STreehugger Robot 514*16467b97STreehugger Robot Consider grammar: 515*16467b97STreehugger Robot 516*16467b97STreehugger Robot a : '[' b ']' 517*16467b97STreehugger Robot | '(' b ')' 518*16467b97STreehugger Robot ; 519*16467b97STreehugger Robot b : c '^' INT ; 520*16467b97STreehugger Robot c : ID 521*16467b97STreehugger Robot | INT 522*16467b97STreehugger Robot ; 523*16467b97STreehugger Robot 524*16467b97STreehugger Robot At each rule invocation, the set of tokens that could follow 525*16467b97STreehugger Robot that rule is pushed on a stack. 
Here are the various "local" 526*16467b97STreehugger Robot follow sets: 527*16467b97STreehugger Robot 528*16467b97STreehugger Robot FOLLOW(b1_in_a) = FIRST(']') = ']' 529*16467b97STreehugger Robot FOLLOW(b2_in_a) = FIRST(')') = ')' 530*16467b97STreehugger Robot FOLLOW(c_in_b) = FIRST('^') = '^' 531*16467b97STreehugger Robot 532*16467b97STreehugger Robot Upon erroneous input "[]", the call chain is 533*16467b97STreehugger Robot 534*16467b97STreehugger Robot a -> b -> c 535*16467b97STreehugger Robot 536*16467b97STreehugger Robot and, hence, the follow context stack is: 537*16467b97STreehugger Robot 538*16467b97STreehugger Robot depth local follow set after call to rule 539*16467b97STreehugger Robot 0 \<EOF> a (from main()) 540*16467b97STreehugger Robot 1 ']' b 541*16467b97STreehugger Robot 3 '^' c 542*16467b97STreehugger Robot 543*16467b97STreehugger Robot Notice that ')' is not included, because b would have to have 544*16467b97STreehugger Robot been called from a different context in rule a for ')' to be 545*16467b97STreehugger Robot included. 546*16467b97STreehugger Robot 547*16467b97STreehugger Robot For error recovery, we cannot consider FOLLOW(c) 548*16467b97STreehugger Robot (context-sensitive or otherwise). We need the combined set of 549*16467b97STreehugger Robot all context-sensitive FOLLOW sets--the set of all tokens that 550*16467b97STreehugger Robot could follow any reference in the call chain. We need to 551*16467b97STreehugger Robot resync to one of those tokens. Note that FOLLOW(c)='^' and if 552*16467b97STreehugger Robot we resync'd to that token, we'd consume until EOF. We need to 553*16467b97STreehugger Robot sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 554*16467b97STreehugger Robot In this case, for input "[]", LA(1) is in this set so we would 555*16467b97STreehugger Robot not consume anything and after printing an error rule c would 556*16467b97STreehugger Robot return normally. It would not find the required '^' though. 
557*16467b97STreehugger Robot At this point, it gets a mismatched token error and throws an 558*16467b97STreehugger Robot exception (since LA(1) is not in the viable following token 559*16467b97STreehugger Robot set). The rule exception handler tries to recover, but finds 560*16467b97STreehugger Robot the same recovery set and doesn't consume anything. Rule b 561*16467b97STreehugger Robot exits normally returning to rule a. Now it finds the ']' (and 562*16467b97STreehugger Robot with the successful match exits errorRecovery mode). 563*16467b97STreehugger Robot 564*16467b97STreehugger Robot So, you cna see that the parser walks up call chain looking 565*16467b97STreehugger Robot for the token that was a member of the recovery set. 566*16467b97STreehugger Robot 567*16467b97STreehugger Robot Errors are not generated in errorRecovery mode. 568*16467b97STreehugger Robot 569*16467b97STreehugger Robot ANTLR's error recovery mechanism is based upon original ideas: 570*16467b97STreehugger Robot 571*16467b97STreehugger Robot "Algorithms + Data Structures = Programs" by Niklaus Wirth 572*16467b97STreehugger Robot 573*16467b97STreehugger Robot and 574*16467b97STreehugger Robot 575*16467b97STreehugger Robot "A note on error recovery in recursive descent parsers": 576*16467b97STreehugger Robot http://portal.acm.org/citation.cfm?id=947902.947905 577*16467b97STreehugger Robot 578*16467b97STreehugger Robot Later, Josef Grosch had some good ideas: 579*16467b97STreehugger Robot 580*16467b97STreehugger Robot "Efficient and Comfortable Error Recovery in Recursive Descent 581*16467b97STreehugger Robot Parsers": 582*16467b97STreehugger Robot ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 583*16467b97STreehugger Robot 584*16467b97STreehugger Robot Like Grosch I implemented local FOLLOW sets that are combined 585*16467b97STreehugger Robot at run-time upon error to avoid overhead during parsing. 
586*16467b97STreehugger Robot """ 587*16467b97STreehugger Robot 588*16467b97STreehugger Robot return self.combineFollows(False) 589*16467b97STreehugger Robot 590*16467b97STreehugger Robot 591*16467b97STreehugger Robot def computeContextSensitiveRuleFOLLOW(self): 592*16467b97STreehugger Robot """ 593*16467b97STreehugger Robot Compute the context-sensitive FOLLOW set for current rule. 594*16467b97STreehugger Robot This is set of token types that can follow a specific rule 595*16467b97STreehugger Robot reference given a specific call chain. You get the set of 596*16467b97STreehugger Robot viable tokens that can possibly come next (lookahead depth 1) 597*16467b97STreehugger Robot given the current call chain. Contrast this with the 598*16467b97STreehugger Robot definition of plain FOLLOW for rule r: 599*16467b97STreehugger Robot 600*16467b97STreehugger Robot FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 601*16467b97STreehugger Robot 602*16467b97STreehugger Robot where x in T* and alpha, beta in V*; T is set of terminals and 603*16467b97STreehugger Robot V is the set of terminals and nonterminals. In other words, 604*16467b97STreehugger Robot FOLLOW(r) is the set of all tokens that can possibly follow 605*16467b97STreehugger Robot references to r in *any* sentential form (context). At 606*16467b97STreehugger Robot runtime, however, we know precisely which context applies as 607*16467b97STreehugger Robot we have the call chain. We may compute the exact (rather 608*16467b97STreehugger Robot than covering superset) set of following tokens. 609*16467b97STreehugger Robot 610*16467b97STreehugger Robot For example, consider grammar: 611*16467b97STreehugger Robot 612*16467b97STreehugger Robot stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 613*16467b97STreehugger Robot | "return" expr '.' 
614*16467b97STreehugger Robot ; 615*16467b97STreehugger Robot expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 616*16467b97STreehugger Robot atom : INT // FOLLOW(atom)=={'+',')',';','.'} 617*16467b97STreehugger Robot | '(' expr ')' 618*16467b97STreehugger Robot ; 619*16467b97STreehugger Robot 620*16467b97STreehugger Robot The FOLLOW sets are all inclusive whereas context-sensitive 621*16467b97STreehugger Robot FOLLOW sets are precisely what could follow a rule reference. 622*16467b97STreehugger Robot For input input "i=(3);", here is the derivation: 623*16467b97STreehugger Robot 624*16467b97STreehugger Robot stat => ID '=' expr ';' 625*16467b97STreehugger Robot => ID '=' atom ('+' atom)* ';' 626*16467b97STreehugger Robot => ID '=' '(' expr ')' ('+' atom)* ';' 627*16467b97STreehugger Robot => ID '=' '(' atom ')' ('+' atom)* ';' 628*16467b97STreehugger Robot => ID '=' '(' INT ')' ('+' atom)* ';' 629*16467b97STreehugger Robot => ID '=' '(' INT ')' ';' 630*16467b97STreehugger Robot 631*16467b97STreehugger Robot At the "3" token, you'd have a call chain of 632*16467b97STreehugger Robot 633*16467b97STreehugger Robot stat -> expr -> atom -> expr -> atom 634*16467b97STreehugger Robot 635*16467b97STreehugger Robot What can follow that specific nested ref to atom? Exactly ')' 636*16467b97STreehugger Robot as you can see by looking at the derivation of this specific 637*16467b97STreehugger Robot input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 638*16467b97STreehugger Robot 639*16467b97STreehugger Robot You want the exact viable token set when recovering from a 640*16467b97STreehugger Robot token mismatch. Upon token mismatch, if LA(1) is member of 641*16467b97STreehugger Robot the viable next token set, then you know there is most likely 642*16467b97STreehugger Robot a missing token in the input stream. "Insert" one by just not 643*16467b97STreehugger Robot throwing an exception. 
644*16467b97STreehugger Robot """ 645*16467b97STreehugger Robot 646*16467b97STreehugger Robot return self.combineFollows(True) 647*16467b97STreehugger Robot 648*16467b97STreehugger Robot 649*16467b97STreehugger Robot def combineFollows(self, exact): 650*16467b97STreehugger Robot followSet = set() 651*16467b97STreehugger Robot for idx, localFollowSet in reversed(list(enumerate(self._state.following))): 652*16467b97STreehugger Robot followSet |= localFollowSet 653*16467b97STreehugger Robot if exact: 654*16467b97STreehugger Robot # can we see end of rule? 655*16467b97STreehugger Robot if EOR_TOKEN_TYPE in localFollowSet: 656*16467b97STreehugger Robot # Only leave EOR in set if at top (start rule); this lets 657*16467b97STreehugger Robot # us know if have to include follow(start rule); i.e., EOF 658*16467b97STreehugger Robot if idx > 0: 659*16467b97STreehugger Robot followSet.remove(EOR_TOKEN_TYPE) 660*16467b97STreehugger Robot 661*16467b97STreehugger Robot else: 662*16467b97STreehugger Robot # can't see end of rule, quit 663*16467b97STreehugger Robot break 664*16467b97STreehugger Robot 665*16467b97STreehugger Robot return followSet 666*16467b97STreehugger Robot 667*16467b97STreehugger Robot 668*16467b97STreehugger Robot def recoverFromMismatchedToken(self, input, ttype, follow): 669*16467b97STreehugger Robot """Attempt to recover from a single missing or extra token. 670*16467b97STreehugger Robot 671*16467b97STreehugger Robot EXTRA TOKEN 672*16467b97STreehugger Robot 673*16467b97STreehugger Robot LA(1) is not what we are looking for. If LA(2) has the right token, 674*16467b97STreehugger Robot however, then assume LA(1) is some extra spurious token. Delete it 675*16467b97STreehugger Robot and LA(2) as if we were doing a normal match(), which advances the 676*16467b97STreehugger Robot input. 
677*16467b97STreehugger Robot 678*16467b97STreehugger Robot MISSING TOKEN 679*16467b97STreehugger Robot 680*16467b97STreehugger Robot If current token is consistent with what could come after 681*16467b97STreehugger Robot ttype then it is ok to 'insert' the missing token, else throw 682*16467b97STreehugger Robot exception For example, Input 'i=(3;' is clearly missing the 683*16467b97STreehugger Robot ')'. When the parser returns from the nested call to expr, it 684*16467b97STreehugger Robot will have call chain: 685*16467b97STreehugger Robot 686*16467b97STreehugger Robot stat -> expr -> atom 687*16467b97STreehugger Robot 688*16467b97STreehugger Robot and it will be trying to match the ')' at this point in the 689*16467b97STreehugger Robot derivation: 690*16467b97STreehugger Robot 691*16467b97STreehugger Robot => ID '=' '(' INT ')' ('+' atom)* ';' 692*16467b97STreehugger Robot ^ 693*16467b97STreehugger Robot match() will see that ';' doesn't match ')' and report a 694*16467b97STreehugger Robot mismatched token error. To recover, it sees that LA(1)==';' 695*16467b97STreehugger Robot is in the set of tokens that can follow the ')' token 696*16467b97STreehugger Robot reference in rule atom. It can assume that you forgot the ')'. 
697*16467b97STreehugger Robot """ 698*16467b97STreehugger Robot 699*16467b97STreehugger Robot e = None 700*16467b97STreehugger Robot 701*16467b97STreehugger Robot # if next token is what we are looking for then "delete" this token 702*16467b97STreehugger Robot if self.mismatchIsUnwantedToken(input, ttype): 703*16467b97STreehugger Robot e = UnwantedTokenException(ttype, input) 704*16467b97STreehugger Robot 705*16467b97STreehugger Robot self.beginResync() 706*16467b97STreehugger Robot input.consume() # simply delete extra token 707*16467b97STreehugger Robot self.endResync() 708*16467b97STreehugger Robot 709*16467b97STreehugger Robot # report after consuming so AW sees the token in the exception 710*16467b97STreehugger Robot self.reportError(e) 711*16467b97STreehugger Robot 712*16467b97STreehugger Robot # we want to return the token we're actually matching 713*16467b97STreehugger Robot matchedSymbol = self.getCurrentInputSymbol(input) 714*16467b97STreehugger Robot 715*16467b97STreehugger Robot # move past ttype token as if all were ok 716*16467b97STreehugger Robot input.consume() 717*16467b97STreehugger Robot return matchedSymbol 718*16467b97STreehugger Robot 719*16467b97STreehugger Robot # can't recover with single token deletion, try insertion 720*16467b97STreehugger Robot if self.mismatchIsMissingToken(input, follow): 721*16467b97STreehugger Robot inserted = self.getMissingSymbol(input, e, ttype, follow) 722*16467b97STreehugger Robot e = MissingTokenException(ttype, input, inserted) 723*16467b97STreehugger Robot 724*16467b97STreehugger Robot # report after inserting so AW sees the token in the exception 725*16467b97STreehugger Robot self.reportError(e) 726*16467b97STreehugger Robot return inserted 727*16467b97STreehugger Robot 728*16467b97STreehugger Robot # even that didn't work; must throw the exception 729*16467b97STreehugger Robot e = MismatchedTokenException(ttype, input) 730*16467b97STreehugger Robot raise e 731*16467b97STreehugger Robot 
732*16467b97STreehugger Robot 733*16467b97STreehugger Robot def recoverFromMismatchedSet(self, input, e, follow): 734*16467b97STreehugger Robot """Not currently used""" 735*16467b97STreehugger Robot 736*16467b97STreehugger Robot if self.mismatchIsMissingToken(input, follow): 737*16467b97STreehugger Robot self.reportError(e) 738*16467b97STreehugger Robot # we don't know how to conjure up a token for sets yet 739*16467b97STreehugger Robot return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow) 740*16467b97STreehugger Robot 741*16467b97STreehugger Robot # TODO do single token deletion like above for Token mismatch 742*16467b97STreehugger Robot raise e 743*16467b97STreehugger Robot 744*16467b97STreehugger Robot 745*16467b97STreehugger Robot def getCurrentInputSymbol(self, input): 746*16467b97STreehugger Robot """ 747*16467b97STreehugger Robot Match needs to return the current input symbol, which gets put 748*16467b97STreehugger Robot into the label for the associated token ref; e.g., x=ID. Token 749*16467b97STreehugger Robot and tree parsers need to return different objects. Rather than test 750*16467b97STreehugger Robot for input stream type or change the IntStream interface, I use 751*16467b97STreehugger Robot a simple method to ask the recognizer to tell me what the current 752*16467b97STreehugger Robot input symbol is. 753*16467b97STreehugger Robot 754*16467b97STreehugger Robot This is ignored for lexers. 755*16467b97STreehugger Robot """ 756*16467b97STreehugger Robot 757*16467b97STreehugger Robot return None 758*16467b97STreehugger Robot 759*16467b97STreehugger Robot 760*16467b97STreehugger Robot def getMissingSymbol(self, input, e, expectedTokenType, follow): 761*16467b97STreehugger Robot """Conjure up a missing token during error recovery. 762*16467b97STreehugger Robot 763*16467b97STreehugger Robot The recognizer attempts to recover from single missing 764*16467b97STreehugger Robot symbols. But, actions might refer to that missing symbol. 
765*16467b97STreehugger Robot For example, x=ID {f($x);}. The action clearly assumes 766*16467b97STreehugger Robot that there has been an identifier matched previously and that 767*16467b97STreehugger Robot $x points at that token. If that token is missing, but 768*16467b97STreehugger Robot the next token in the stream is what we want we assume that 769*16467b97STreehugger Robot this token is missing and we keep going. Because we 770*16467b97STreehugger Robot have to return some token to replace the missing token, 771*16467b97STreehugger Robot we have to conjure one up. This method gives the user control 772*16467b97STreehugger Robot over the tokens returned for missing tokens. Mostly, 773*16467b97STreehugger Robot you will want to create something special for identifier 774*16467b97STreehugger Robot tokens. For literals such as '{' and ',', the default 775*16467b97STreehugger Robot action in the parser or tree parser works. It simply creates 776*16467b97STreehugger Robot a CommonToken of the appropriate type. The text will be the token. 777*16467b97STreehugger Robot If you change what tokens must be created by the lexer, 778*16467b97STreehugger Robot override this method to create the appropriate tokens. 779*16467b97STreehugger Robot """ 780*16467b97STreehugger Robot 781*16467b97STreehugger Robot return None 782*16467b97STreehugger Robot 783*16467b97STreehugger Robot 784*16467b97STreehugger Robot## def recoverFromMissingElement(self, input, e, follow): 785*16467b97STreehugger Robot## """ 786*16467b97STreehugger Robot## This code is factored out from mismatched token and mismatched set 787*16467b97STreehugger Robot## recovery. It handles "single token insertion" error recovery for 788*16467b97STreehugger Robot## both. No tokens are consumed to recover from insertions. Return 789*16467b97STreehugger Robot## true if recovery was possible else return false. 
790*16467b97STreehugger Robot## """ 791*16467b97STreehugger Robot 792*16467b97STreehugger Robot## if self.mismatchIsMissingToken(input, follow): 793*16467b97STreehugger Robot## self.reportError(e) 794*16467b97STreehugger Robot## return True 795*16467b97STreehugger Robot 796*16467b97STreehugger Robot## # nothing to do; throw exception 797*16467b97STreehugger Robot## return False 798*16467b97STreehugger Robot 799*16467b97STreehugger Robot 800*16467b97STreehugger Robot def consumeUntil(self, input, tokenTypes): 801*16467b97STreehugger Robot """ 802*16467b97STreehugger Robot Consume tokens until one matches the given token or token set 803*16467b97STreehugger Robot 804*16467b97STreehugger Robot tokenTypes can be a single token type or a set of token types 805*16467b97STreehugger Robot 806*16467b97STreehugger Robot """ 807*16467b97STreehugger Robot 808*16467b97STreehugger Robot if not isinstance(tokenTypes, (set, frozenset)): 809*16467b97STreehugger Robot tokenTypes = frozenset([tokenTypes]) 810*16467b97STreehugger Robot 811*16467b97STreehugger Robot ttype = input.LA(1) 812*16467b97STreehugger Robot while ttype != EOF and ttype not in tokenTypes: 813*16467b97STreehugger Robot input.consume() 814*16467b97STreehugger Robot ttype = input.LA(1) 815*16467b97STreehugger Robot 816*16467b97STreehugger Robot 817*16467b97STreehugger Robot def getRuleInvocationStack(self): 818*16467b97STreehugger Robot """ 819*16467b97STreehugger Robot Return List<String> of the rules in your parser instance 820*16467b97STreehugger Robot leading up to a call to this method. You could override if 821*16467b97STreehugger Robot you want more details such as the file/line info of where 822*16467b97STreehugger Robot in the parser java code a rule is invoked. 823*16467b97STreehugger Robot 824*16467b97STreehugger Robot This is very useful for error messages and for context-sensitive 825*16467b97STreehugger Robot error recovery. 
826*16467b97STreehugger Robot 827*16467b97STreehugger Robot You must be careful, if you subclass a generated recognizers. 828*16467b97STreehugger Robot The default implementation will only search the module of self 829*16467b97STreehugger Robot for rules, but the subclass will not contain any rules. 830*16467b97STreehugger Robot You probably want to override this method to look like 831*16467b97STreehugger Robot 832*16467b97STreehugger Robot def getRuleInvocationStack(self): 833*16467b97STreehugger Robot return self._getRuleInvocationStack(<class>.__module__) 834*16467b97STreehugger Robot 835*16467b97STreehugger Robot where <class> is the class of the generated recognizer, e.g. 836*16467b97STreehugger Robot the superclass of self. 837*16467b97STreehugger Robot """ 838*16467b97STreehugger Robot 839*16467b97STreehugger Robot return self._getRuleInvocationStack(self.__module__) 840*16467b97STreehugger Robot 841*16467b97STreehugger Robot 842*16467b97STreehugger Robot def _getRuleInvocationStack(cls, module): 843*16467b97STreehugger Robot """ 844*16467b97STreehugger Robot A more general version of getRuleInvocationStack where you can 845*16467b97STreehugger Robot pass in, for example, a RecognitionException to get it's rule 846*16467b97STreehugger Robot stack trace. This routine is shared with all recognizers, hence, 847*16467b97STreehugger Robot static. 848*16467b97STreehugger Robot 849*16467b97STreehugger Robot TODO: move to a utility class or something; weird having lexer call 850*16467b97STreehugger Robot this 851*16467b97STreehugger Robot """ 852*16467b97STreehugger Robot 853*16467b97STreehugger Robot # mmmhhh,... perhaps look at the first argument 854*16467b97STreehugger Robot # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of 855*16467b97STreehugger Robot # requested recognizer... 
856*16467b97STreehugger Robot 857*16467b97STreehugger Robot rules = [] 858*16467b97STreehugger Robot for frame in reversed(inspect.stack()): 859*16467b97STreehugger Robot code = frame[0].f_code 860*16467b97STreehugger Robot codeMod = inspect.getmodule(code) 861*16467b97STreehugger Robot if codeMod is None: 862*16467b97STreehugger Robot continue 863*16467b97STreehugger Robot 864*16467b97STreehugger Robot # skip frames not in requested module 865*16467b97STreehugger Robot if codeMod.__name__ != module: 866*16467b97STreehugger Robot continue 867*16467b97STreehugger Robot 868*16467b97STreehugger Robot # skip some unwanted names 869*16467b97STreehugger Robot if code.co_name in ('nextToken', '<module>'): 870*16467b97STreehugger Robot continue 871*16467b97STreehugger Robot 872*16467b97STreehugger Robot rules.append(code.co_name) 873*16467b97STreehugger Robot 874*16467b97STreehugger Robot return rules 875*16467b97STreehugger Robot 876*16467b97STreehugger Robot _getRuleInvocationStack = classmethod(_getRuleInvocationStack) 877*16467b97STreehugger Robot 878*16467b97STreehugger Robot 879*16467b97STreehugger Robot def getBacktrackingLevel(self): 880*16467b97STreehugger Robot return self._state.backtracking 881*16467b97STreehugger Robot 882*16467b97STreehugger Robot def setBacktrackingLevel(self, n): 883*16467b97STreehugger Robot self._state.backtracking = n 884*16467b97STreehugger Robot 885*16467b97STreehugger Robot 886*16467b97STreehugger Robot def getGrammarFileName(self): 887*16467b97STreehugger Robot """For debugging and other purposes, might want the grammar name. 888*16467b97STreehugger Robot 889*16467b97STreehugger Robot Have ANTLR generate an implementation for this method. 
890*16467b97STreehugger Robot """ 891*16467b97STreehugger Robot 892*16467b97STreehugger Robot return self.grammarFileName 893*16467b97STreehugger Robot 894*16467b97STreehugger Robot 895*16467b97STreehugger Robot def getSourceName(self): 896*16467b97STreehugger Robot raise NotImplementedError 897*16467b97STreehugger Robot 898*16467b97STreehugger Robot 899*16467b97STreehugger Robot def toStrings(self, tokens): 900*16467b97STreehugger Robot """A convenience method for use most often with template rewrites. 901*16467b97STreehugger Robot 902*16467b97STreehugger Robot Convert a List<Token> to List<String> 903*16467b97STreehugger Robot """ 904*16467b97STreehugger Robot 905*16467b97STreehugger Robot if tokens is None: 906*16467b97STreehugger Robot return None 907*16467b97STreehugger Robot 908*16467b97STreehugger Robot return [token.text for token in tokens] 909*16467b97STreehugger Robot 910*16467b97STreehugger Robot 911*16467b97STreehugger Robot def getRuleMemoization(self, ruleIndex, ruleStartIndex): 912*16467b97STreehugger Robot """ 913*16467b97STreehugger Robot Given a rule number and a start token index number, return 914*16467b97STreehugger Robot MEMO_RULE_UNKNOWN if the rule has not parsed input starting from 915*16467b97STreehugger Robot start index. If this rule has parsed input starting from the 916*16467b97STreehugger Robot start index before, then return where the rule stopped parsing. 917*16467b97STreehugger Robot It returns the index of the last token matched by the rule. 
918*16467b97STreehugger Robot """ 919*16467b97STreehugger Robot 920*16467b97STreehugger Robot if ruleIndex not in self._state.ruleMemo: 921*16467b97STreehugger Robot self._state.ruleMemo[ruleIndex] = {} 922*16467b97STreehugger Robot 923*16467b97STreehugger Robot return self._state.ruleMemo[ruleIndex].get( 924*16467b97STreehugger Robot ruleStartIndex, self.MEMO_RULE_UNKNOWN 925*16467b97STreehugger Robot ) 926*16467b97STreehugger Robot 927*16467b97STreehugger Robot 928*16467b97STreehugger Robot def alreadyParsedRule(self, input, ruleIndex): 929*16467b97STreehugger Robot """ 930*16467b97STreehugger Robot Has this rule already parsed input at the current index in the 931*16467b97STreehugger Robot input stream? Return the stop token index or MEMO_RULE_UNKNOWN. 932*16467b97STreehugger Robot If we attempted but failed to parse properly before, return 933*16467b97STreehugger Robot MEMO_RULE_FAILED. 934*16467b97STreehugger Robot 935*16467b97STreehugger Robot This method has a side-effect: if we have seen this input for 936*16467b97STreehugger Robot this rule and successfully parsed before, then seek ahead to 937*16467b97STreehugger Robot 1 past the stop token matched for this rule last time. 
938*16467b97STreehugger Robot """ 939*16467b97STreehugger Robot 940*16467b97STreehugger Robot stopIndex = self.getRuleMemoization(ruleIndex, input.index()) 941*16467b97STreehugger Robot if stopIndex == self.MEMO_RULE_UNKNOWN: 942*16467b97STreehugger Robot return False 943*16467b97STreehugger Robot 944*16467b97STreehugger Robot if stopIndex == self.MEMO_RULE_FAILED: 945*16467b97STreehugger Robot raise BacktrackingFailed 946*16467b97STreehugger Robot 947*16467b97STreehugger Robot else: 948*16467b97STreehugger Robot input.seek(stopIndex + 1) 949*16467b97STreehugger Robot 950*16467b97STreehugger Robot return True 951*16467b97STreehugger Robot 952*16467b97STreehugger Robot 953*16467b97STreehugger Robot def memoize(self, input, ruleIndex, ruleStartIndex, success): 954*16467b97STreehugger Robot """ 955*16467b97STreehugger Robot Record whether or not this rule parsed the input at this position 956*16467b97STreehugger Robot successfully. 957*16467b97STreehugger Robot """ 958*16467b97STreehugger Robot 959*16467b97STreehugger Robot if success: 960*16467b97STreehugger Robot stopTokenIndex = input.index() - 1 961*16467b97STreehugger Robot else: 962*16467b97STreehugger Robot stopTokenIndex = self.MEMO_RULE_FAILED 963*16467b97STreehugger Robot 964*16467b97STreehugger Robot if ruleIndex in self._state.ruleMemo: 965*16467b97STreehugger Robot self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex 966*16467b97STreehugger Robot 967*16467b97STreehugger Robot 968*16467b97STreehugger Robot def traceIn(self, ruleName, ruleIndex, inputSymbol): 969*16467b97STreehugger Robot sys.stdout.write("enter %s %s" % (ruleName, inputSymbol)) 970*16467b97STreehugger Robot 971*16467b97STreehugger Robot if self._state.backtracking > 0: 972*16467b97STreehugger Robot sys.stdout.write(" backtracking=%s" % self._state.backtracking) 973*16467b97STreehugger Robot 974*16467b97STreehugger Robot sys.stdout.write('\n') 975*16467b97STreehugger Robot 976*16467b97STreehugger Robot 977*16467b97STreehugger 
Robot def traceOut(self, ruleName, ruleIndex, inputSymbol): 978*16467b97STreehugger Robot sys.stdout.write("exit %s %s" % (ruleName, inputSymbol)) 979*16467b97STreehugger Robot 980*16467b97STreehugger Robot if self._state.backtracking > 0: 981*16467b97STreehugger Robot sys.stdout.write(" backtracking=%s" % self._state.backtracking) 982*16467b97STreehugger Robot 983*16467b97STreehugger Robot # mmmm... we use BacktrackingFailed exceptions now. So how could we 984*16467b97STreehugger Robot # get that information here? 985*16467b97STreehugger Robot #if self._state.failed: 986*16467b97STreehugger Robot # sys.stdout.write(" failed") 987*16467b97STreehugger Robot #else: 988*16467b97STreehugger Robot # sys.stdout.write(" succeeded") 989*16467b97STreehugger Robot 990*16467b97STreehugger Robot sys.stdout.write('\n') 991*16467b97STreehugger Robot 992*16467b97STreehugger Robot 993*16467b97STreehugger Robotclass TokenSource(object): 994*16467b97STreehugger Robot """ 995*16467b97STreehugger Robot @brief Abstract baseclass for token producers. 996*16467b97STreehugger Robot 997*16467b97STreehugger Robot A source of tokens must provide a sequence of tokens via nextToken() 998*16467b97STreehugger Robot and also must reveal it's source of characters; CommonToken's text is 999*16467b97STreehugger Robot computed from a CharStream; it only store indices into the char stream. 1000*16467b97STreehugger Robot 1001*16467b97STreehugger Robot Errors from the lexer are never passed to the parser. Either you want 1002*16467b97STreehugger Robot to keep going or you do not upon token recognition error. If you do not 1003*16467b97STreehugger Robot want to continue lexing then you do not want to continue parsing. Just 1004*16467b97STreehugger Robot throw an exception not under RecognitionException and Java will naturally 1005*16467b97STreehugger Robot toss you all the way out of the recognizers. 
If you want to continue 1006*16467b97STreehugger Robot lexing then you should not throw an exception to the parser--it has already 1007*16467b97STreehugger Robot requested a token. Keep lexing until you get a valid one. Just report 1008*16467b97STreehugger Robot errors and keep going, looking for a valid token. 1009*16467b97STreehugger Robot """ 1010*16467b97STreehugger Robot 1011*16467b97STreehugger Robot def nextToken(self): 1012*16467b97STreehugger Robot """Return a Token object from your input stream (usually a CharStream). 1013*16467b97STreehugger Robot 1014*16467b97STreehugger Robot Do not fail/return upon lexing error; keep chewing on the characters 1015*16467b97STreehugger Robot until you get a good one; errors are not passed through to the parser. 1016*16467b97STreehugger Robot """ 1017*16467b97STreehugger Robot 1018*16467b97STreehugger Robot raise NotImplementedError 1019*16467b97STreehugger Robot 1020*16467b97STreehugger Robot 1021*16467b97STreehugger Robot def __iter__(self): 1022*16467b97STreehugger Robot """The TokenSource is an interator. 1023*16467b97STreehugger Robot 1024*16467b97STreehugger Robot The iteration will not include the final EOF token, see also the note 1025*16467b97STreehugger Robot for the next() method. 1026*16467b97STreehugger Robot 1027*16467b97STreehugger Robot """ 1028*16467b97STreehugger Robot 1029*16467b97STreehugger Robot return self 1030*16467b97STreehugger Robot 1031*16467b97STreehugger Robot 1032*16467b97STreehugger Robot def next(self): 1033*16467b97STreehugger Robot """Return next token or raise StopIteration. 1034*16467b97STreehugger Robot 1035*16467b97STreehugger Robot Note that this will raise StopIteration when hitting the EOF token, 1036*16467b97STreehugger Robot so EOF will not be part of the iteration. 
1037*16467b97STreehugger Robot 1038*16467b97STreehugger Robot """ 1039*16467b97STreehugger Robot 1040*16467b97STreehugger Robot token = self.nextToken() 1041*16467b97STreehugger Robot if token is None or token.type == EOF: 1042*16467b97STreehugger Robot raise StopIteration 1043*16467b97STreehugger Robot return token 1044*16467b97STreehugger Robot 1045*16467b97STreehugger Robot 1046*16467b97STreehugger Robotclass Lexer(BaseRecognizer, TokenSource): 1047*16467b97STreehugger Robot """ 1048*16467b97STreehugger Robot @brief Baseclass for generated lexer classes. 1049*16467b97STreehugger Robot 1050*16467b97STreehugger Robot A lexer is recognizer that draws input symbols from a character stream. 1051*16467b97STreehugger Robot lexer grammars result in a subclass of this object. A Lexer object 1052*16467b97STreehugger Robot uses simplified match() and error recovery mechanisms in the interest 1053*16467b97STreehugger Robot of speed. 1054*16467b97STreehugger Robot """ 1055*16467b97STreehugger Robot 1056*16467b97STreehugger Robot def __init__(self, input, state=None): 1057*16467b97STreehugger Robot BaseRecognizer.__init__(self, state) 1058*16467b97STreehugger Robot TokenSource.__init__(self) 1059*16467b97STreehugger Robot 1060*16467b97STreehugger Robot # Where is the lexer drawing characters from? 
1061*16467b97STreehugger Robot self.input = input 1062*16467b97STreehugger Robot 1063*16467b97STreehugger Robot 1064*16467b97STreehugger Robot def reset(self): 1065*16467b97STreehugger Robot BaseRecognizer.reset(self) # reset all recognizer state variables 1066*16467b97STreehugger Robot 1067*16467b97STreehugger Robot if self.input is not None: 1068*16467b97STreehugger Robot # rewind the input 1069*16467b97STreehugger Robot self.input.seek(0) 1070*16467b97STreehugger Robot 1071*16467b97STreehugger Robot if self._state is None: 1072*16467b97STreehugger Robot # no shared state work to do 1073*16467b97STreehugger Robot return 1074*16467b97STreehugger Robot 1075*16467b97STreehugger Robot # wack Lexer state variables 1076*16467b97STreehugger Robot self._state.token = None 1077*16467b97STreehugger Robot self._state.type = INVALID_TOKEN_TYPE 1078*16467b97STreehugger Robot self._state.channel = DEFAULT_CHANNEL 1079*16467b97STreehugger Robot self._state.tokenStartCharIndex = -1 1080*16467b97STreehugger Robot self._state.tokenStartLine = -1 1081*16467b97STreehugger Robot self._state.tokenStartCharPositionInLine = -1 1082*16467b97STreehugger Robot self._state.text = None 1083*16467b97STreehugger Robot 1084*16467b97STreehugger Robot 1085*16467b97STreehugger Robot def makeEOFToken(self): 1086*16467b97STreehugger Robot eof = CommonToken( 1087*16467b97STreehugger Robot type=EOF, channel=DEFAULT_CHANNEL, 1088*16467b97STreehugger Robot input=self.input, 1089*16467b97STreehugger Robot start=self.input.index(), stop=self.input.index()) 1090*16467b97STreehugger Robot eof.line = self.input.line 1091*16467b97STreehugger Robot eof.charPositionInLine = self.input.charPositionInLine 1092*16467b97STreehugger Robot return eof 1093*16467b97STreehugger Robot 1094*16467b97STreehugger Robot def nextToken(self): 1095*16467b97STreehugger Robot """ 1096*16467b97STreehugger Robot Return a token from this source; i.e., match a token on the char 1097*16467b97STreehugger Robot stream. 
1098*16467b97STreehugger Robot """ 1099*16467b97STreehugger Robot 1100*16467b97STreehugger Robot while 1: 1101*16467b97STreehugger Robot self._state.token = None 1102*16467b97STreehugger Robot self._state.channel = DEFAULT_CHANNEL 1103*16467b97STreehugger Robot self._state.tokenStartCharIndex = self.input.index() 1104*16467b97STreehugger Robot self._state.tokenStartCharPositionInLine = self.input.charPositionInLine 1105*16467b97STreehugger Robot self._state.tokenStartLine = self.input.line 1106*16467b97STreehugger Robot self._state.text = None 1107*16467b97STreehugger Robot if self.input.LA(1) == EOF: 1108*16467b97STreehugger Robot return self.makeEOFToken() 1109*16467b97STreehugger Robot 1110*16467b97STreehugger Robot try: 1111*16467b97STreehugger Robot self.mTokens() 1112*16467b97STreehugger Robot 1113*16467b97STreehugger Robot if self._state.token is None: 1114*16467b97STreehugger Robot self.emit() 1115*16467b97STreehugger Robot 1116*16467b97STreehugger Robot elif self._state.token == SKIP_TOKEN: 1117*16467b97STreehugger Robot continue 1118*16467b97STreehugger Robot 1119*16467b97STreehugger Robot return self._state.token 1120*16467b97STreehugger Robot 1121*16467b97STreehugger Robot except NoViableAltException, re: 1122*16467b97STreehugger Robot self.reportError(re) 1123*16467b97STreehugger Robot self.recover(re) # throw out current char and try again 1124*16467b97STreehugger Robot 1125*16467b97STreehugger Robot except RecognitionException, re: 1126*16467b97STreehugger Robot self.reportError(re) 1127*16467b97STreehugger Robot # match() routine has already called recover() 1128*16467b97STreehugger Robot 1129*16467b97STreehugger Robot 1130*16467b97STreehugger Robot def skip(self): 1131*16467b97STreehugger Robot """ 1132*16467b97STreehugger Robot Instruct the lexer to skip creating a token for current lexer rule 1133*16467b97STreehugger Robot and look for another token. 
nextToken() knows to keep looking when 1134*16467b97STreehugger Robot a lexer rule finishes with token set to SKIP_TOKEN. Recall that 1135*16467b97STreehugger Robot if token==null at end of any token rule, it creates one for you 1136*16467b97STreehugger Robot and emits it. 1137*16467b97STreehugger Robot """ 1138*16467b97STreehugger Robot 1139*16467b97STreehugger Robot self._state.token = SKIP_TOKEN 1140*16467b97STreehugger Robot 1141*16467b97STreehugger Robot 1142*16467b97STreehugger Robot def mTokens(self): 1143*16467b97STreehugger Robot """This is the lexer entry point that sets instance var 'token'""" 1144*16467b97STreehugger Robot 1145*16467b97STreehugger Robot # abstract method 1146*16467b97STreehugger Robot raise NotImplementedError 1147*16467b97STreehugger Robot 1148*16467b97STreehugger Robot 1149*16467b97STreehugger Robot def setCharStream(self, input): 1150*16467b97STreehugger Robot """Set the char stream and reset the lexer""" 1151*16467b97STreehugger Robot self.input = None 1152*16467b97STreehugger Robot self.reset() 1153*16467b97STreehugger Robot self.input = input 1154*16467b97STreehugger Robot 1155*16467b97STreehugger Robot 1156*16467b97STreehugger Robot def getSourceName(self): 1157*16467b97STreehugger Robot return self.input.getSourceName() 1158*16467b97STreehugger Robot 1159*16467b97STreehugger Robot 1160*16467b97STreehugger Robot def emit(self, token=None): 1161*16467b97STreehugger Robot """ 1162*16467b97STreehugger Robot The standard method called to automatically emit a token at the 1163*16467b97STreehugger Robot outermost lexical rule. The token object should point into the 1164*16467b97STreehugger Robot char buffer start..stop. If there is a text override in 'text', 1165*16467b97STreehugger Robot use that to set the token's text. Override this method to emit 1166*16467b97STreehugger Robot custom Token objects. 
    def emit(self, token=None):
        """
        The standard method called to automatically emit a token at the
        outermost lexical rule.  The token object should point into the
        char buffer start..stop.  If there is a text override in 'text',
        use that to set the token's text.  Override this method to emit
        custom Token objects.

        If you are building trees, then you should also override
        Parser or TreeParser.getMissingSymbol().
        """

        if token is None:
            # no explicit token given: build a default CommonToken
            # covering the current rule's start..current-1 span
            token = CommonToken(
                input=self.input,
                type=self._state.type,
                channel=self._state.channel,
                start=self._state.tokenStartCharIndex,
                stop=self.getCharIndex()-1
                )
            token.line = self._state.tokenStartLine
            token.text = self._state.text
            token.charPositionInLine = self._state.tokenStartCharPositionInLine

        self._state.token = token

        return token


    def match(self, s):
        """Match s against the input.

        s is either a string (each character is matched in sequence) or
        a single character code.  On mismatch: raises BacktrackingFailed
        when inside a syntactic predicate, otherwise attempts recovery
        and raises MismatchedTokenException.
        """
        if isinstance(s, basestring):
            for c in s:
                if self.input.LA(1) != ord(c):
                    if self._state.backtracking > 0:
                        # predicate evaluation: fail silently
                        raise BacktrackingFailed

                    mte = MismatchedTokenException(c, self.input)
                    self.recover(mte)
                    raise mte

                self.input.consume()

        else:
            if self.input.LA(1) != s:
                if self._state.backtracking > 0:
                    raise BacktrackingFailed

                mte = MismatchedTokenException(unichr(s), self.input)
                self.recover(mte) # don't really recover; just consume in lexer
                raise mte

            self.input.consume()
BacktrackingFailed 1206*16467b97STreehugger Robot 1207*16467b97STreehugger Robot mte = MismatchedTokenException(unichr(s), self.input) 1208*16467b97STreehugger Robot self.recover(mte) # don't really recover; just consume in lexer 1209*16467b97STreehugger Robot raise mte 1210*16467b97STreehugger Robot 1211*16467b97STreehugger Robot self.input.consume() 1212*16467b97STreehugger Robot 1213*16467b97STreehugger Robot 1214*16467b97STreehugger Robot def matchAny(self): 1215*16467b97STreehugger Robot self.input.consume() 1216*16467b97STreehugger Robot 1217*16467b97STreehugger Robot 1218*16467b97STreehugger Robot def matchRange(self, a, b): 1219*16467b97STreehugger Robot if self.input.LA(1) < a or self.input.LA(1) > b: 1220*16467b97STreehugger Robot if self._state.backtracking > 0: 1221*16467b97STreehugger Robot raise BacktrackingFailed 1222*16467b97STreehugger Robot 1223*16467b97STreehugger Robot mre = MismatchedRangeException(unichr(a), unichr(b), self.input) 1224*16467b97STreehugger Robot self.recover(mre) 1225*16467b97STreehugger Robot raise mre 1226*16467b97STreehugger Robot 1227*16467b97STreehugger Robot self.input.consume() 1228*16467b97STreehugger Robot 1229*16467b97STreehugger Robot 1230*16467b97STreehugger Robot def getLine(self): 1231*16467b97STreehugger Robot return self.input.line 1232*16467b97STreehugger Robot 1233*16467b97STreehugger Robot 1234*16467b97STreehugger Robot def getCharPositionInLine(self): 1235*16467b97STreehugger Robot return self.input.charPositionInLine 1236*16467b97STreehugger Robot 1237*16467b97STreehugger Robot 1238*16467b97STreehugger Robot def getCharIndex(self): 1239*16467b97STreehugger Robot """What is the index of the current character of lookahead?""" 1240*16467b97STreehugger Robot 1241*16467b97STreehugger Robot return self.input.index() 1242*16467b97STreehugger Robot 1243*16467b97STreehugger Robot 1244*16467b97STreehugger Robot def getText(self): 1245*16467b97STreehugger Robot """ 1246*16467b97STreehugger Robot Return the text 
matched so far for the current token or any 1247*16467b97STreehugger Robot text override. 1248*16467b97STreehugger Robot """ 1249*16467b97STreehugger Robot if self._state.text is not None: 1250*16467b97STreehugger Robot return self._state.text 1251*16467b97STreehugger Robot 1252*16467b97STreehugger Robot return self.input.substring( 1253*16467b97STreehugger Robot self._state.tokenStartCharIndex, 1254*16467b97STreehugger Robot self.getCharIndex()-1 1255*16467b97STreehugger Robot ) 1256*16467b97STreehugger Robot 1257*16467b97STreehugger Robot 1258*16467b97STreehugger Robot def setText(self, text): 1259*16467b97STreehugger Robot """ 1260*16467b97STreehugger Robot Set the complete text of this token; it wipes any previous 1261*16467b97STreehugger Robot changes to the text. 1262*16467b97STreehugger Robot """ 1263*16467b97STreehugger Robot self._state.text = text 1264*16467b97STreehugger Robot 1265*16467b97STreehugger Robot 1266*16467b97STreehugger Robot text = property(getText, setText) 1267*16467b97STreehugger Robot 1268*16467b97STreehugger Robot 1269*16467b97STreehugger Robot def reportError(self, e): 1270*16467b97STreehugger Robot ## TODO: not thought about recovery in lexer yet. 1271*16467b97STreehugger Robot 1272*16467b97STreehugger Robot ## # if we've already reported an error and have not matched a token 1273*16467b97STreehugger Robot ## # yet successfully, don't report any errors. 
1274*16467b97STreehugger Robot ## if self.errorRecovery: 1275*16467b97STreehugger Robot ## #System.err.print("[SPURIOUS] "); 1276*16467b97STreehugger Robot ## return; 1277*16467b97STreehugger Robot ## 1278*16467b97STreehugger Robot ## self.errorRecovery = True 1279*16467b97STreehugger Robot 1280*16467b97STreehugger Robot self.displayRecognitionError(self.tokenNames, e) 1281*16467b97STreehugger Robot 1282*16467b97STreehugger Robot 1283*16467b97STreehugger Robot def getErrorMessage(self, e, tokenNames): 1284*16467b97STreehugger Robot msg = None 1285*16467b97STreehugger Robot 1286*16467b97STreehugger Robot if isinstance(e, MismatchedTokenException): 1287*16467b97STreehugger Robot msg = "mismatched character " \ 1288*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) \ 1289*16467b97STreehugger Robot + " expecting " \ 1290*16467b97STreehugger Robot + self.getCharErrorDisplay(e.expecting) 1291*16467b97STreehugger Robot 1292*16467b97STreehugger Robot elif isinstance(e, NoViableAltException): 1293*16467b97STreehugger Robot msg = "no viable alternative at character " \ 1294*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) 1295*16467b97STreehugger Robot 1296*16467b97STreehugger Robot elif isinstance(e, EarlyExitException): 1297*16467b97STreehugger Robot msg = "required (...)+ loop did not match anything at character " \ 1298*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) 1299*16467b97STreehugger Robot 1300*16467b97STreehugger Robot elif isinstance(e, MismatchedNotSetException): 1301*16467b97STreehugger Robot msg = "mismatched character " \ 1302*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) \ 1303*16467b97STreehugger Robot + " expecting set " \ 1304*16467b97STreehugger Robot + repr(e.expecting) 1305*16467b97STreehugger Robot 1306*16467b97STreehugger Robot elif isinstance(e, MismatchedSetException): 1307*16467b97STreehugger Robot msg = "mismatched character " \ 1308*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) \ 
1309*16467b97STreehugger Robot + " expecting set " \ 1310*16467b97STreehugger Robot + repr(e.expecting) 1311*16467b97STreehugger Robot 1312*16467b97STreehugger Robot elif isinstance(e, MismatchedRangeException): 1313*16467b97STreehugger Robot msg = "mismatched character " \ 1314*16467b97STreehugger Robot + self.getCharErrorDisplay(e.c) \ 1315*16467b97STreehugger Robot + " expecting set " \ 1316*16467b97STreehugger Robot + self.getCharErrorDisplay(e.a) \ 1317*16467b97STreehugger Robot + ".." \ 1318*16467b97STreehugger Robot + self.getCharErrorDisplay(e.b) 1319*16467b97STreehugger Robot 1320*16467b97STreehugger Robot else: 1321*16467b97STreehugger Robot msg = BaseRecognizer.getErrorMessage(self, e, tokenNames) 1322*16467b97STreehugger Robot 1323*16467b97STreehugger Robot return msg 1324*16467b97STreehugger Robot 1325*16467b97STreehugger Robot 1326*16467b97STreehugger Robot def getCharErrorDisplay(self, c): 1327*16467b97STreehugger Robot if c == EOF: 1328*16467b97STreehugger Robot c = '<EOF>' 1329*16467b97STreehugger Robot return repr(c) 1330*16467b97STreehugger Robot 1331*16467b97STreehugger Robot 1332*16467b97STreehugger Robot def recover(self, re): 1333*16467b97STreehugger Robot """ 1334*16467b97STreehugger Robot Lexers can normally match any char in it's vocabulary after matching 1335*16467b97STreehugger Robot a token, so do the easy thing and just kill a character and hope 1336*16467b97STreehugger Robot it all works out. You can instead use the rule invocation stack 1337*16467b97STreehugger Robot to do sophisticated error recovery if you are in a fragment rule. 
1338*16467b97STreehugger Robot """ 1339*16467b97STreehugger Robot 1340*16467b97STreehugger Robot self.input.consume() 1341*16467b97STreehugger Robot 1342*16467b97STreehugger Robot 1343*16467b97STreehugger Robot def traceIn(self, ruleName, ruleIndex): 1344*16467b97STreehugger Robot inputSymbol = "%s line=%d:%s" % (self.input.LT(1), 1345*16467b97STreehugger Robot self.getLine(), 1346*16467b97STreehugger Robot self.getCharPositionInLine() 1347*16467b97STreehugger Robot ) 1348*16467b97STreehugger Robot 1349*16467b97STreehugger Robot BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol) 1350*16467b97STreehugger Robot 1351*16467b97STreehugger Robot 1352*16467b97STreehugger Robot def traceOut(self, ruleName, ruleIndex): 1353*16467b97STreehugger Robot inputSymbol = "%s line=%d:%s" % (self.input.LT(1), 1354*16467b97STreehugger Robot self.getLine(), 1355*16467b97STreehugger Robot self.getCharPositionInLine() 1356*16467b97STreehugger Robot ) 1357*16467b97STreehugger Robot 1358*16467b97STreehugger Robot BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol) 1359*16467b97STreehugger Robot 1360*16467b97STreehugger Robot 1361*16467b97STreehugger Robot 1362*16467b97STreehugger Robotclass Parser(BaseRecognizer): 1363*16467b97STreehugger Robot """ 1364*16467b97STreehugger Robot @brief Baseclass for generated parser classes. 
class Parser(BaseRecognizer):
    """
    @brief Baseclass for generated parser classes.
    """

    def __init__(self, lexer, state=None):
        """Wrap a token source (lexer) with optional shared state."""
        BaseRecognizer.__init__(self, state)

        self.input = lexer


    def reset(self):
        """Reset all recognizer state and rewind the token stream."""
        BaseRecognizer.reset(self) # reset all recognizer state variables
        if self.input is not None:
            self.input.seek(0) # rewind the input


    def getCurrentInputSymbol(self, input):
        """The current lookahead token."""
        return input.LT(1)


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a token for the missing symbol so parsing can go on.

        The fabricated token borrows line/position info from the current
        token (or the previous one, when sitting at EOF).
        """
        if expectedTokenType == EOF:
            tokenText = "<missing EOF>"
        else:
            tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"
        t = CommonToken(type=expectedTokenType, text=tokenText)
        current = input.LT(1)
        # Guard the LT(1) result before touching .type: the original only
        # None-checked the LT(-1) result below, so a stream returning
        # None for LT(1) would have raised AttributeError here.
        if current is not None and current.type == EOF:
            current = input.LT(-1)

        if current is not None:
            t.line = current.line
            t.charPositionInLine = current.charPositionInLine
        t.channel = DEFAULT_CHANNEL
        return t


    def setTokenStream(self, input):
        """Set the token stream and reset the parser"""

        # drop the old stream first so reset() does not rewind it
        self.input = None
        self.reset()
        self.input = input


    def getTokenStream(self):
        """Return the current token stream."""
        return self.input


    def getSourceName(self):
        """Delegate the source-name query to the token stream."""
        return self.input.getSourceName()


    def traceIn(self, ruleName, ruleIndex):
        """Trace rule entry, showing the lookahead token."""
        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))


    def traceOut(self, ruleName, ruleIndex):
        """Trace rule exit, showing the lookahead token."""
        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
class RuleReturnScope(object):
    """
    Rules can return start/stop info as well as possible trees and
    templates.  This base class answers None for everything; subclasses
    override the accessors for the values they actually carry.
    """

    def getStart(self):
        """Return the start token or tree."""
        return None

    def getStop(self):
        """Return the stop token or tree."""
        return None

    def getTree(self):
        """Has a value potentially if output=AST."""
        return None

    def getTemplate(self):
        """Has a value potentially if output=template."""
        return None


class ParserRuleReturnScope(RuleReturnScope):
    """
    Rules that return more than a single value must return an object
    containing all the values.  Besides the properties defined in
    RuleLabelScope.predefinedRulePropertiesScope there may be
    user-defined return values.  This class simply defines the minimum
    properties that are always defined and methods to access the others
    that might be available depending on output option such as template
    and tree.

    Note text is not an actual property of the return value, it is
    computed from start and stop using the input stream's toString()
    method.  A ctor taking the input stream was considered and rejected:
    the .text property would be undefined anyway if a rule matched
    tokens from multiple input streams.

    Plain attributes (not getters) are used for simple aggregates like
    this; the getter methods exist only to satisfy the superclass
    interface.
    """

    def __init__(self):
        self.start = None
        self.stop = None
        self.tree = None # only used when output=AST

    def getStart(self):
        return self.start

    def getStop(self):
        return self.stop

    def getTree(self):
        return self.tree