"""ANTLR3 runtime package"""

# begin[licence]
#
# [The "BSD licence"]
# Copyright (c) 2005-2008 Terence Parr
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# end[licence]

import sys
import inspect

from antlr3 import compatible_api_versions
from antlr3.constants import DEFAULT_CHANNEL, HIDDEN_CHANNEL, EOF, \
     EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
from antlr3.exceptions import RecognitionException, MismatchedTokenException, \
     MismatchedRangeException, MismatchedTreeNodeException, \
     NoViableAltException, EarlyExitException, MismatchedSetException, \
     MismatchedNotSetException, FailedPredicateException, \
     BacktrackingFailed, UnwantedTokenException, MissingTokenException
from antlr3.tokens import CommonToken, SKIP_TOKEN
from antlr3.compat import set, frozenset, reversed


class RecognizerSharedState(object):
    """
    The set of fields needed by an abstract recognizer to recognize input
    and recover from errors etc...  As a separate state object, it can be
    shared among multiple grammars; e.g., when one grammar imports another.

    These fields are publicly visible, but the actual state pointer per
    parser is protected.
    """

    def __init__(self):
        # Track the set of token types that can follow any rule invocation.
        # Stack grows upwards.
        self.following = []

        # This is true when we see an error and before having successfully
        # matched a token.  Prevents generation of more than one error message
        # per error.
        self.errorRecovery = False

        # The index into the input stream where the last error occurred.
        # This is used to prevent infinite loops where an error is found
        # but no token is consumed during recovery...another error is found,
        # ad nauseam.  This is a failsafe mechanism to guarantee that at least
        # one token/tree node is consumed for two errors.
        self.lastErrorIndex = -1

        # If 0, no backtracking is going on.  Safe to exec actions etc...
        # If >0 then it's the level of backtracking.
        self.backtracking = 0

        # An array[size num rules] of Map<Integer,Integer> that tracks
        # the stop token index for each rule.  ruleMemo[ruleIndex] is
        # the memoization table for ruleIndex.  For key ruleStartIndex, you
        # get back the stop token for associated rule or MEMO_RULE_FAILED.
        #
        # This is only used if rule memoization is on (which it is by default).
        self.ruleMemo = None

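        # An illustrative (hypothetical) sketch of the nesting described by
        # the comment above -- not code from this runtime, just an example,
        # assuming rule #3 was attempted at token index 7:
        #
        #   ruleMemo = [None, None, None, {7: 12}, ...]
        #   # rule 3, started at token 7, stopped successfully at token 12;
        #   # a failed attempt would store BaseRecognizer.MEMO_RULE_FAILED (-2)
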
        ## Did the recognizer encounter a syntax error?  Track how many.
        self.syntaxErrors = 0


        # LEXER FIELDS (must be in same state object to avoid casting
        # constantly in generated code and Lexer object) :(


        ## The goal of all lexer rules/methods is to create a token object.
        # This is an instance variable as multiple rules may collaborate to
        # create a single token.  nextToken will return this object after
        # matching lexer rule(s).  If you subclass to allow multiple token
        # emissions, then set this to the last token to be matched or
        # something nonnull so that the auto token emit mechanism will not
        # emit another token.
        self.token = None

        ## What character index in the stream did the current token start at?
        # Needed, for example, to get the text for current token.  Set at
        # the start of nextToken.
        self.tokenStartCharIndex = -1

        ## The line on which the first character of the token resides
        self.tokenStartLine = None

        ## The character position of first character within the line
        self.tokenStartCharPositionInLine = None

        ## The channel number for the current token
        self.channel = None

        ## The token type for the current token
        self.type = None

        ## You can set the text for the current token to override what is in
        # the input char buffer.  Use setText() or set this instance var.
        self.text = None


class BaseRecognizer(object):
    """
    @brief Common recognizer functionality.

    A generic recognizer that can handle recognizers generated from
    lexer, parser, and tree grammars.  This is all the parsing
    support code essentially; most of it is error recovery stuff and
    backtracking.
    """

    MEMO_RULE_FAILED = -2
    MEMO_RULE_UNKNOWN = -1

    # copies from Token object for convenience in actions
    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL

    # for convenience in actions
    HIDDEN = HIDDEN_CHANNEL

    # overridden by generated subclasses
    tokenNames = None

    # The api_version attribute was introduced in 3.3. If it is not
    # overwritten in the generated recognizer, we assume a default of v0.
    api_version = 0

    def __init__(self, state=None):
        # Input stream of the recognizer. Must be initialized by a subclass.
        self.input = None

        ## The state of a lexer, parser, or tree parser is collected into a
        # state object so that it can be shared.  This sharing is needed to
        # have one grammar import others and share the same error variables
        # and other state variables.  It's a kind of explicit multiple
        # inheritance via delegation of methods and shared state.
        if state is None:
            state = RecognizerSharedState()
        self._state = state

        if self.api_version not in compatible_api_versions:
            raise RuntimeError(
                ("ANTLR version mismatch: "
                 "The recognizer has been generated with API V%s, "
                 "which this runtime does not support.")
                % self.api_version)

    # this one only exists to shut up pylint :(
    def setInput(self, input):
        self.input = input


    def reset(self):
        """
        Reset the parser's state; subclasses must rewind the input stream.
        """

        # wack everything related to error recovery
        if self._state is None:
            # no shared state work to do
            return

        self._state.following = []
        self._state.errorRecovery = False
        self._state.lastErrorIndex = -1
        self._state.syntaxErrors = 0
        # wack everything related to backtracking and memoization
        self._state.backtracking = 0
        if self._state.ruleMemo is not None:
            self._state.ruleMemo = {}


    def match(self, input, ttype, follow):
        """
        Match the current input symbol against ttype.  Attempt
        single token insertion or deletion error recovery.  If
        that fails, throw MismatchedTokenException.

        To turn off single token insertion or deletion error
        recovery, override recoverFromMismatchedToken() and have it
        throw an exception. See TreeParser.recoverFromMismatchedToken().
        This way any error in a rule will cause an exception and
        immediate exit from the rule.  The rule would recover by
        resynchronizing to the set of symbols that can follow the rule ref.
        """

        matchedSymbol = self.getCurrentInputSymbol(input)
        if self.input.LA(1) == ttype:
            self.input.consume()
            self._state.errorRecovery = False
            return matchedSymbol

        if self._state.backtracking > 0:
            # FIXME: need to return matchedSymbol here as well. damn!!
            raise BacktrackingFailed

        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
        return matchedSymbol

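    ## A hypothetical sketch (not part of this runtime) of how generated
    ## parser code typically drives match(); the token types ID and SEMI and
    ## the FOLLOW_ID_in_decl set are made-up names for illustration only.
    ##
    ##     FOLLOW_ID_in_decl = frozenset([SEMI])
    ##
    ##     def decl(self):          # generated rule method
    ##         tok = self.match(self.input, ID, FOLLOW_ID_in_decl)
    ##         # on a mismatch, match() attempts single-token insertion or
    ##         # deletion recovery before raising MismatchedTokenException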

    def matchAny(self, input):
        """Match the wildcard: in a symbol"""

        self._state.errorRecovery = False
        self.input.consume()


    def mismatchIsUnwantedToken(self, input, ttype):
        return input.LA(2) == ttype


    def mismatchIsMissingToken(self, input, follow):
        if follow is None:
            # we have no information about the follow; we can only consume
            # a single token and hope for the best
            return False

        # compute what can follow this grammar element reference
        if EOR_TOKEN_TYPE in follow:
            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
            follow = follow | viableTokensFollowingThisRule

            if len(self._state.following) > 0:
                # remove EOR if we're not the start symbol
                follow = follow - set([EOR_TOKEN_TYPE])

        # if the current token is consistent with what could come after the
        # set, then we know we're missing a token; error recovery is free to
        # "insert" the missing token
        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
            return True

        return False


    def reportError(self, e):
        """Report a recognition problem.

        This method sets errorRecovery to indicate that the parser is
        recovering, not parsing.  Once in recovery mode, no errors are
        generated.  To get out of recovery mode, the parser must successfully
        match a token (after a resync).  So it will go:

        1. error occurs
        2. enter recovery mode, report error
        3. consume until token found in resynch set
        4. try to resume parsing
        5. next match() will reset errorRecovery mode

        If you override, make sure to update syntaxErrors if you care about
        that.

        """

        # if we've already reported an error and have not matched a token
        # yet successfully, don't report any errors.
        if self._state.errorRecovery:
            return

        self._state.syntaxErrors += 1 # don't count spurious
        self._state.errorRecovery = True

        self.displayRecognitionError(self.tokenNames, e)


    def displayRecognitionError(self, tokenNames, e):
        hdr = self.getErrorHeader(e)
        msg = self.getErrorMessage(e, tokenNames)
        self.emitErrorMessage(hdr + " " + msg)


    def getErrorMessage(self, e, tokenNames):
        """
        What error message should be generated for the various
        exception types?

        Not very object-oriented code, but I like having all error message
        generation within one method rather than spread among all of the
        exception classes. This also makes it much easier for the exception
        handling because the exception classes do not have to have pointers back
        to this object to access utility routines and so on. Also, changing
        the message for an exception type would be difficult because you
        would have to subclass the exception, but then somehow get ANTLR
        to make those kinds of exception objects instead of the default.
        This looks weird, but trust me--it makes the most sense in terms
        of flexibility.

        For grammar debugging, you will want to override this to add
        more information such as the stack frame with
        getRuleInvocationStack(e, this.getClass().getName()) and,
        for no viable alts, the decision description and state etc...

        Override this to change the message generated for one or more
        exception types.
        """

        if isinstance(e, UnwantedTokenException):
            tokenName = "<unknown>"
            if e.expecting == EOF:
                tokenName = "EOF"

            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "extraneous input %s expecting %s" % (
                self.getTokenErrorDisplay(e.getUnexpectedToken()),
                tokenName
                )

        elif isinstance(e, MissingTokenException):
            tokenName = "<unknown>"
            if e.expecting == EOF:
                tokenName = "EOF"

            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "missing %s at %s" % (
                tokenName, self.getTokenErrorDisplay(e.token)
                )

        elif isinstance(e, MismatchedTokenException):
            tokenName = "<unknown>"
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched input " \
                  + self.getTokenErrorDisplay(e.token) \
                  + " expecting " \
                  + tokenName

        elif isinstance(e, MismatchedTreeNodeException):
            tokenName = "<unknown>"
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched tree node: %s expecting %s" \
                  % (e.node, tokenName)

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at input " \
                  + self.getTokenErrorDisplay(e.token)

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at input " \
                  + self.getTokenErrorDisplay(e.token)

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched input " \
                  + self.getTokenErrorDisplay(e.token) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched input " \
                  + self.getTokenErrorDisplay(e.token) \
                  + " expecting set " \
                  + repr(e.expecting)

        elif isinstance(e, FailedPredicateException):
            msg = "rule " \
                  + e.ruleName \
                  + " failed predicate: {" \
                  + e.predicateText \
                  + "}?"

        else:
            msg = str(e)

        return msg

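    ## A hypothetical override sketch (not part of this runtime) showing the
    ## kind of customization the docstring above suggests; MyParser is a
    ## made-up generated parser class used purely for illustration.
    ##
    ##     class MyParser(Parser):
    ##         def getErrorMessage(self, e, tokenNames):
    ##             # append the rule invocation stack to the stock message
    ##             stack = self.getRuleInvocationStack()
    ##             msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
    ##             return "%s (rule stack: %r)" % (msg, stack)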

    def getNumberOfSyntaxErrors(self):
        """
        Get the number of recognition errors (lexer, parser, tree parser).
        Each recognizer tracks its own number, so parser and lexer each have
        a separate count.  Does not count the spurious errors found between
        an error and the next valid token match.

        See also reportError()
        """
        return self._state.syntaxErrors


    def getErrorHeader(self, e):
        """
        What is the error header, normally line/character position information?
        """

        source_name = self.getSourceName()
        if source_name is not None:
            return "%s line %d:%d" % (source_name, e.line, e.charPositionInLine)
        return "line %d:%d" % (e.line, e.charPositionInLine)


    def getTokenErrorDisplay(self, t):
        """
        How should a token be displayed in an error message? The default
        is to display just the text, but during development you might
        want to have a lot of information spit out.  Override in that case
        to use t.toString() (which, for CommonToken, dumps everything about
        the token). This is better than forcing you to override a method in
        your token objects because you don't have to go modify your lexer
        so that it creates a new token type.
        """

        s = t.text
        if s is None:
            if t.type == EOF:
                s = "<EOF>"
            else:
                s = "<%s>" % t.type

        return repr(s)

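    ## A hypothetical override sketch (illustration only, not part of this
    ## runtime) for verbose token display during grammar debugging:
    ##
    ##     class MyParser(Parser):
    ##         def getTokenErrorDisplay(self, t):
    ##             # dump type/line/column as well as the text
    ##             return "%r (type=%s, line=%s:%s)" % (
    ##                 t.text, t.type, t.line, t.charPositionInLine)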

    def emitErrorMessage(self, msg):
        """Override this method to change where error messages go"""
        sys.stderr.write(msg + '\n')

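    ## A hypothetical override sketch (not part of this runtime) that routes
    ## error messages to the standard logging module instead of stderr:
    ##
    ##     import logging
    ##
    ##     class MyParser(Parser):
    ##         def emitErrorMessage(self, msg):
    ##             logging.getLogger("myparser").error(msg)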

    def recover(self, input, re):
        """
        Recover from an error found on the input stream.  This is
        for NoViableAlt and mismatched symbol exceptions.  If you enable
        single token insertion and deletion, this will usually not
        handle mismatched symbol exceptions but there could be a mismatched
        token that the match() routine could not recover from.
        """

        # PROBLEM? what if input stream is not the same as last time
        # perhaps make lastErrorIndex a member of input
        if self._state.lastErrorIndex == input.index():
            # uh oh, another error at same token index; must be a case
            # where LT(1) is in the recovery token set so nothing is
            # consumed; consume a single token at least to prevent
            # an infinite loop; this is a failsafe.
            input.consume()

        self._state.lastErrorIndex = input.index()
        followSet = self.computeErrorRecoverySet()

        self.beginResync()
        self.consumeUntil(input, followSet)
        self.endResync()


    def beginResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def endResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def computeErrorRecoverySet(self):
        """
        Compute the error recovery set for the current rule.  During
        rule invocation, the parser pushes the set of tokens that can
        follow that rule reference on the stack; this amounts to
        computing FIRST of what follows the rule reference in the
        enclosing rule. This local follow set only includes tokens
        from within the rule; i.e., the FIRST computation done by
        ANTLR stops at the end of a rule.

        EXAMPLE

        When you find a "no viable alt exception", the input is not
        consistent with any of the alternatives for rule r.  The best
        thing to do is to consume tokens until you see something that
        can legally follow a call to r *or* any rule that called r.
        You don't want the exact set of viable next tokens because the
        input might just be missing a token--you might consume the
        rest of the input looking for one of the missing tokens.

        Consider grammar:

        a : '[' b ']'
          | '(' b ')'
          ;
        b : c '^' INT ;
        c : ID
          | INT
          ;

        At each rule invocation, the set of tokens that could follow
        that rule is pushed on a stack.  Here are the various "local"
        follow sets:

        FOLLOW(b1_in_a) = FIRST(']') = ']'
        FOLLOW(b2_in_a) = FIRST(')') = ')'
        FOLLOW(c_in_b) = FIRST('^') = '^'

        Upon erroneous input "[]", the call chain is

        a -> b -> c

        and, hence, the follow context stack is:

        depth  local follow set     after call to rule
          0         \<EOF>                    a (from main())
          1          ']'                     b
          2          '^'                     c

        Notice that ')' is not included, because b would have to have
        been called from a different context in rule a for ')' to be
        included.

        For error recovery, we cannot consider FOLLOW(c)
        (context-sensitive or otherwise).  We need the combined set of
        all context-sensitive FOLLOW sets--the set of all tokens that
        could follow any reference in the call chain.  We need to
        resync to one of those tokens.  Note that FOLLOW(c)='^' and if
        we resync'd to that token, we'd consume until EOF.  We need to
        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
        In this case, for input "[]", LA(1) is in this set so we would
        not consume anything and after printing an error, rule c would
        return normally.  It would not find the required '^' though.
        At this point, it gets a mismatched token error and throws an
        exception (since LA(1) is not in the viable following token
        set).  The rule exception handler tries to recover, but finds
        the same recovery set and doesn't consume anything.  Rule b
        exits normally returning to rule a.  Now it finds the ']' (and
        with the successful match exits errorRecovery mode).

        So, you can see that the parser walks up the call chain looking
        for the token that was a member of the recovery set.

        Errors are not generated in errorRecovery mode.

        ANTLR's error recovery mechanism is based upon original ideas:

        "Algorithms + Data Structures = Programs" by Niklaus Wirth

        and

        "A note on error recovery in recursive descent parsers":
        http://portal.acm.org/citation.cfm?id=947902.947905

        Later, Josef Grosch had some good ideas:

        "Efficient and Comfortable Error Recovery in Recursive Descent
        Parsers":
        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip

        Like Grosch I implemented local FOLLOW sets that are combined
        at run-time upon error to avoid overhead during parsing.
        """

        return self.combineFollows(False)


    def computeContextSensitiveRuleFOLLOW(self):
        """
        Compute the context-sensitive FOLLOW set for the current rule.
        This is the set of token types that can follow a specific rule
        reference given a specific call chain.  You get the set of
        viable tokens that can possibly come next (lookahead depth 1)
        given the current call chain.  Contrast this with the
        definition of plain FOLLOW for rule r:

         FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}

        where x in T* and alpha, beta in V*; T is set of terminals and
        V is the set of terminals and nonterminals.  In other words,
        FOLLOW(r) is the set of all tokens that can possibly follow
        references to r in *any* sentential form (context).  At
        runtime, however, we know precisely which context applies as
        we have the call chain.  We may compute the exact (rather
        than covering superset) set of following tokens.

        For example, consider grammar:

        stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
             | "return" expr '.'
             ;
        expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
        atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
             | '(' expr ')'
             ;

        The FOLLOW sets are all inclusive whereas context-sensitive
        FOLLOW sets are precisely what could follow a rule reference.
        For input "i=(3);", here is the derivation:

        stat => ID '=' expr ';'
             => ID '=' atom ('+' atom)* ';'
             => ID '=' '(' expr ')' ('+' atom)* ';'
             => ID '=' '(' atom ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ';'

        At the "3" token, you'd have a call chain of

          stat -> expr -> atom -> expr -> atom

        What can follow that specific nested ref to atom?  Exactly ')'
        as you can see by looking at the derivation of this specific
        input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.

        You want the exact viable token set when recovering from a
        token mismatch.  Upon token mismatch, if LA(1) is a member of
        the viable next token set, then you know there is most likely
        a missing token in the input stream.  "Insert" one by just not
        throwing an exception.
        """

        return self.combineFollows(True)


    def combineFollows(self, exact):
        followSet = set()
        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
            followSet |= localFollowSet
            if exact:
                # can we see end of rule?
                if EOR_TOKEN_TYPE in localFollowSet:
                    # Only leave EOR in set if at top (start rule); this lets
                    # us know if we have to include follow(start rule), i.e. EOF
                    if idx > 0:
                        followSet.remove(EOR_TOKEN_TYPE)

                else:
                    # can't see end of rule, quit
                    break

        return followSet

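    ## A mechanical sketch (illustration only, with made-up token types
    ## RBRACK and CARET) of how combineFollows() walks the follow stack:
    ##
    ##     # suppose self._state.following is currently
    ##     #   [frozenset([RBRACK]), frozenset([CARET, EOR_TOKEN_TYPE])]
    ##     #
    ##     # combineFollows(False) unions every level:
    ##     #   {RBRACK, CARET, EOR_TOKEN_TYPE}     (error recovery set)
    ##     # combineFollows(True) walks from the top, dropping EOR at inner
    ##     # levels and stopping once a level cannot see end-of-rule:
    ##     #   {RBRACK, CARET}                     (exact FOLLOW set)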

    def recoverFromMismatchedToken(self, input, ttype, follow):
        """Attempt to recover from a single missing or extra token.

        EXTRA TOKEN

        LA(1) is not what we are looking for.  If LA(2) has the right token,
        however, then assume LA(1) is some extra spurious token.  Delete it
        and LA(2) as if we were doing a normal match(), which advances the
        input.

        MISSING TOKEN

        If the current token is consistent with what could come after
        ttype, then it is ok to 'insert' the missing token, else throw an
        exception.  For example, the input 'i=(3;' is clearly missing the
        ')'.  When the parser returns from the nested call to expr, it
        will have the call chain:

          stat -> expr -> atom

        and it will be trying to match the ')' at this point in the
        derivation:

             => ID '=' '(' INT ')' ('+' atom)* ';'
                                ^
        match() will see that ';' doesn't match ')' and report a
        mismatched token error.  To recover, it sees that LA(1)==';'
        is in the set of tokens that can follow the ')' token
        reference in rule atom.  It can assume that you forgot the ')'.
        """

        e = None

        # if next token is what we are looking for then "delete" this token
        if self.mismatchIsUnwantedToken(input, ttype):
            e = UnwantedTokenException(ttype, input)

            self.beginResync()
            input.consume() # simply delete extra token
            self.endResync()

            # report after consuming so AW sees the token in the exception
            self.reportError(e)

            # we want to return the token we're actually matching
            matchedSymbol = self.getCurrentInputSymbol(input)

            # move past ttype token as if all were ok
            input.consume()
            return matchedSymbol

        # can't recover with single token deletion, try insertion
        if self.mismatchIsMissingToken(input, follow):
            inserted = self.getMissingSymbol(input, e, ttype, follow)
            e = MissingTokenException(ttype, input, inserted)

            # report after inserting so AW sees the token in the exception
            self.reportError(e)
            return inserted

        # even that didn't work; must throw the exception
        e = MismatchedTokenException(ttype, input)
        raise e


    def recoverFromMismatchedSet(self, input, e, follow):
        """Not currently used"""

        if self.mismatchIsMissingToken(input, follow):
            self.reportError(e)
            # we don't know how to conjure up a token for sets yet
            return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)

        # TODO do single token deletion like above for Token mismatch
        raise e


    def getCurrentInputSymbol(self, input):
        """
        Match needs to return the current input symbol, which gets put
        into the label for the associated token ref; e.g., x=ID.  Token
        and tree parsers need to return different objects. Rather than test
        for input stream type or change the IntStream interface, I use
        a simple method to ask the recognizer to tell me what the current
        input symbol is.

        This is ignored for lexers.
        """

        return None


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a missing token during error recovery.

        The recognizer attempts to recover from single missing
        symbols. But, actions might refer to that missing symbol.
        For example, x=ID {f($x);}. The action clearly assumes
        that there has been an identifier matched previously and that
        $x points at that token. If that token is missing, but
        the next token in the stream is what we want, we assume that
        this token is missing and we keep going. Because we
        have to return some token to replace the missing token,
        we have to conjure one up. This method gives the user control
        over the tokens returned for missing tokens. Mostly,
        you will want to create something special for identifier
        tokens. For literals such as '{' and ',', the default
        action in the parser or tree parser works. It simply creates
        a CommonToken of the appropriate type. The text will be the token.
        If you change what tokens must be created by the lexer,
        override this method to create the appropriate tokens.
        """

        return None

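    ## A hypothetical override sketch (illustration only) along the lines the
    ## docstring suggests, conjuring up a CommonToken; ID is a made-up
    ## generated token-type constant:
    ##
    ##     class MyParser(Parser):
    ##         def getMissingSymbol(self, input, e, expectedTokenType, follow):
    ##             if expectedTokenType == ID:
    ##                 return CommonToken(type=ID, text="<missing ID>")
    ##             return CommonToken(type=expectedTokenType,
    ##                                text="<missing token>")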

##     def recoverFromMissingElement(self, input, e, follow):
##         """
##         This code is factored out from mismatched token and mismatched set
##         recovery.  It handles "single token insertion" error recovery for
##         both.  No tokens are consumed to recover from insertions.  Return
##         true if recovery was possible else return false.
##         """

##         if self.mismatchIsMissingToken(input, follow):
##             self.reportError(e)
##             return True

##         # nothing to do; throw exception
##         return False


    def consumeUntil(self, input, tokenTypes):
        """
        Consume tokens until one matches the given token or token set

        tokenTypes can be a single token type or a set of token types

        """

        if not isinstance(tokenTypes, (set, frozenset)):
            tokenTypes = frozenset([tokenTypes])

        ttype = input.LA(1)
        while ttype != EOF and ttype not in tokenTypes:
            input.consume()
            ttype = input.LA(1)


    def getRuleInvocationStack(self):
        """
        Return List<String> of the rules in your parser instance
        leading up to a call to this method.  You could override if
        you want more details such as the file/line info of where
        in the parser code a rule is invoked.

        This is very useful for error messages and for context-sensitive
        error recovery.

        You must be careful if you subclass a generated recognizer.
        The default implementation will only search the module of self
        for rules, but the subclass will not contain any rules.
        You probably want to override this method to look like

        def getRuleInvocationStack(self):
            return self._getRuleInvocationStack(<class>.__module__)

        where <class> is the class of the generated recognizer, e.g.
        the superclass of self.
        """

        return self._getRuleInvocationStack(self.__module__)

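    ## A concrete version of the override pattern the docstring shows, using
    ## a hypothetical generated parser class TParser (illustration only):
    ##
    ##     class MyTParser(TParser):
    ##         def getRuleInvocationStack(self):
    ##             # search the module that actually defines the rule methods
    ##             return self._getRuleInvocationStack(TParser.__module__)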

    def _getRuleInvocationStack(cls, module):
        """
        A more general version of getRuleInvocationStack where you can
        pass in, for example, a RecognitionException to get its rule
        stack trace.  This routine is shared with all recognizers, hence,
        static.

        TODO: move to a utility class or something; weird having lexer call
        this
        """

        # mmmhhh,... perhaps look at the first argument
        # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
        # requested recognizer...

        rules = []
        for frame in reversed(inspect.stack()):
            code = frame[0].f_code
            codeMod = inspect.getmodule(code)
            if codeMod is None:
                continue

            # skip frames not in requested module
            if codeMod.__name__ != module:
                continue

            # skip some unwanted names
            if code.co_name in ('nextToken', '<module>'):
                continue

            rules.append(code.co_name)

        return rules

    _getRuleInvocationStack = classmethod(_getRuleInvocationStack)

878*16467b97STreehugger Robot
879*16467b97STreehugger Robot    def getBacktrackingLevel(self):
880*16467b97STreehugger Robot        return self._state.backtracking
881*16467b97STreehugger Robot
882*16467b97STreehugger Robot    def setBacktrackingLevel(self, n):
883*16467b97STreehugger Robot        self._state.backtracking = n
884*16467b97STreehugger Robot
885*16467b97STreehugger Robot
886*16467b97STreehugger Robot    def getGrammarFileName(self):
887*16467b97STreehugger Robot        """For debugging and other purposes, you might want the grammar name.
888*16467b97STreehugger Robot
889*16467b97STreehugger Robot        Have ANTLR generate an implementation for this method.
890*16467b97STreehugger Robot        """
891*16467b97STreehugger Robot
892*16467b97STreehugger Robot        return self.grammarFileName
893*16467b97STreehugger Robot
894*16467b97STreehugger Robot
895*16467b97STreehugger Robot    def getSourceName(self):
896*16467b97STreehugger Robot        raise NotImplementedError
897*16467b97STreehugger Robot
898*16467b97STreehugger Robot
899*16467b97STreehugger Robot    def toStrings(self, tokens):
900*16467b97STreehugger Robot        """A convenience method for use most often with template rewrites.
901*16467b97STreehugger Robot
902*16467b97STreehugger Robot        Convert a list of Tokens to a list of their text strings.
903*16467b97STreehugger Robot        """
904*16467b97STreehugger Robot
905*16467b97STreehugger Robot        if tokens is None:
906*16467b97STreehugger Robot            return None
907*16467b97STreehugger Robot
908*16467b97STreehugger Robot        return [token.text for token in tokens]
909*16467b97STreehugger Robot
910*16467b97STreehugger Robot
911*16467b97STreehugger Robot    def getRuleMemoization(self, ruleIndex, ruleStartIndex):
912*16467b97STreehugger Robot        """
913*16467b97STreehugger Robot        Given a rule number and a start token index number, return
914*16467b97STreehugger Robot        MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
915*16467b97STreehugger Robot        start index.  If this rule has parsed input starting from the
916*16467b97STreehugger Robot        start index before, then return where the rule stopped parsing.
917*16467b97STreehugger Robot        It returns the index of the last token matched by the rule.
918*16467b97STreehugger Robot        """
919*16467b97STreehugger Robot
920*16467b97STreehugger Robot        if ruleIndex not in self._state.ruleMemo:
921*16467b97STreehugger Robot            self._state.ruleMemo[ruleIndex] = {}
922*16467b97STreehugger Robot
923*16467b97STreehugger Robot        return self._state.ruleMemo[ruleIndex].get(
924*16467b97STreehugger Robot            ruleStartIndex, self.MEMO_RULE_UNKNOWN
925*16467b97STreehugger Robot            )
926*16467b97STreehugger Robot
927*16467b97STreehugger Robot
928*16467b97STreehugger Robot    def alreadyParsedRule(self, input, ruleIndex):
929*16467b97STreehugger Robot        """
930*16467b97STreehugger Robot        Has this rule already parsed input at the current index in the
931*16467b97STreehugger Robot        input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
932*16467b97STreehugger Robot        If we attempted but failed to parse properly before, return
933*16467b97STreehugger Robot        MEMO_RULE_FAILED.
934*16467b97STreehugger Robot
935*16467b97STreehugger Robot        This method has a side-effect: if we have seen this input for
936*16467b97STreehugger Robot        this rule and successfully parsed before, then seek ahead to
937*16467b97STreehugger Robot        1 past the stop token matched for this rule last time.
938*16467b97STreehugger Robot        """
939*16467b97STreehugger Robot
940*16467b97STreehugger Robot        stopIndex = self.getRuleMemoization(ruleIndex, input.index())
941*16467b97STreehugger Robot        if stopIndex == self.MEMO_RULE_UNKNOWN:
942*16467b97STreehugger Robot            return False
943*16467b97STreehugger Robot
944*16467b97STreehugger Robot        if stopIndex == self.MEMO_RULE_FAILED:
945*16467b97STreehugger Robot            raise BacktrackingFailed
946*16467b97STreehugger Robot
947*16467b97STreehugger Robot        else:
948*16467b97STreehugger Robot            input.seek(stopIndex + 1)
949*16467b97STreehugger Robot
950*16467b97STreehugger Robot        return True
951*16467b97STreehugger Robot
952*16467b97STreehugger Robot
953*16467b97STreehugger Robot    def memoize(self, input, ruleIndex, ruleStartIndex, success):
954*16467b97STreehugger Robot        """
955*16467b97STreehugger Robot        Record whether or not this rule parsed the input at this position
956*16467b97STreehugger Robot        successfully.
957*16467b97STreehugger Robot        """
958*16467b97STreehugger Robot
959*16467b97STreehugger Robot        if success:
960*16467b97STreehugger Robot            stopTokenIndex = input.index() - 1
961*16467b97STreehugger Robot        else:
962*16467b97STreehugger Robot            stopTokenIndex = self.MEMO_RULE_FAILED
963*16467b97STreehugger Robot
964*16467b97STreehugger Robot        if ruleIndex in self._state.ruleMemo:
965*16467b97STreehugger Robot            self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex
966*16467b97STreehugger Robot
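    # Illustrative sketch (not the exact code ANTLR generates) of how a
    # memoizing rule could drive the hooks above while backtracking;
    # ruleIndex and the rule body are hypothetical:
    #
    #   def my_rule(self):
    #       if (self._state.backtracking > 0
    #           and self.alreadyParsedRule(self.input, ruleIndex)):
    #           return
    #       start = self.input.index()
    #       success = True
    #       try:
    #           pass  # ... match the rule body here ...
    #       except BacktrackingFailed:
    #           success = False
    #           raise
    #       finally:
    #           if self._state.backtracking > 0:
    #               self.memoize(self.input, ruleIndex, start, success)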
967*16467b97STreehugger Robot
968*16467b97STreehugger Robot    def traceIn(self, ruleName, ruleIndex, inputSymbol):
969*16467b97STreehugger Robot        sys.stdout.write("enter %s %s" % (ruleName, inputSymbol))
970*16467b97STreehugger Robot
971*16467b97STreehugger Robot        if self._state.backtracking > 0:
972*16467b97STreehugger Robot            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
973*16467b97STreehugger Robot
974*16467b97STreehugger Robot        sys.stdout.write('\n')
975*16467b97STreehugger Robot
976*16467b97STreehugger Robot
977*16467b97STreehugger Robot    def traceOut(self, ruleName, ruleIndex, inputSymbol):
978*16467b97STreehugger Robot        sys.stdout.write("exit %s %s" % (ruleName, inputSymbol))
979*16467b97STreehugger Robot
980*16467b97STreehugger Robot        if self._state.backtracking > 0:
981*16467b97STreehugger Robot            sys.stdout.write(" backtracking=%s" % self._state.backtracking)
982*16467b97STreehugger Robot
983*16467b97STreehugger Robot        # mmmm... we use BacktrackingFailed exceptions now. So how could we
984*16467b97STreehugger Robot        # get that information here?
985*16467b97STreehugger Robot        #if self._state.failed:
986*16467b97STreehugger Robot        #    sys.stdout.write(" failed")
987*16467b97STreehugger Robot        #else:
988*16467b97STreehugger Robot        #    sys.stdout.write(" succeeded")
989*16467b97STreehugger Robot
990*16467b97STreehugger Robot        sys.stdout.write('\n')
991*16467b97STreehugger Robot
992*16467b97STreehugger Robot
993*16467b97STreehugger Robotclass TokenSource(object):
994*16467b97STreehugger Robot    """
995*16467b97STreehugger Robot    @brief Abstract baseclass for token producers.
996*16467b97STreehugger Robot
997*16467b97STreehugger Robot    A source of tokens must provide a sequence of tokens via nextToken()
998*16467b97STreehugger Robot    and also must reveal its source of characters; CommonToken's text is
999*16467b97STreehugger Robot    computed from a CharStream; it only stores indices into the char stream.
1000*16467b97STreehugger Robot
1001*16467b97STreehugger Robot    Errors from the lexer are never passed to the parser.  Either you want
1002*16467b97STreehugger Robot    to keep going or you do not upon token recognition error.  If you do not
1003*16467b97STreehugger Robot    want to continue lexing then you do not want to continue parsing.  Just
1004*16467b97STreehugger Robot    raise an exception that is not derived from RecognitionException and it
1005*16467b97STreehugger Robot    will propagate all the way out of the recognizers.  If you want to continue
1006*16467b97STreehugger Robot    lexing then you should not raise an exception to the parser--it has already
1007*16467b97STreehugger Robot    requested a token.  Keep lexing until you get a valid one.  Just report
1008*16467b97STreehugger Robot    errors and keep going, looking for a valid token.
1009*16467b97STreehugger Robot    """
1010*16467b97STreehugger Robot
1011*16467b97STreehugger Robot    def nextToken(self):
1012*16467b97STreehugger Robot        """Return a Token object from your input stream (usually a CharStream).
1013*16467b97STreehugger Robot
1014*16467b97STreehugger Robot        Do not fail/return upon lexing error; keep chewing on the characters
1015*16467b97STreehugger Robot        until you get a good one; errors are not passed through to the parser.
1016*16467b97STreehugger Robot        """
1017*16467b97STreehugger Robot
1018*16467b97STreehugger Robot        raise NotImplementedError
1019*16467b97STreehugger Robot
1020*16467b97STreehugger Robot
1021*16467b97STreehugger Robot    def __iter__(self):
1022*16467b97STreehugger Robot        """The TokenSource is an iterator.
1023*16467b97STreehugger Robot
1024*16467b97STreehugger Robot        The iteration will not include the final EOF token, see also the note
1025*16467b97STreehugger Robot        for the next() method.
1026*16467b97STreehugger Robot
1027*16467b97STreehugger Robot        """
1028*16467b97STreehugger Robot
1029*16467b97STreehugger Robot        return self
1030*16467b97STreehugger Robot
1031*16467b97STreehugger Robot
1032*16467b97STreehugger Robot    def next(self):
1033*16467b97STreehugger Robot        """Return next token or raise StopIteration.
1034*16467b97STreehugger Robot
1035*16467b97STreehugger Robot        Note that this will raise StopIteration when hitting the EOF token,
1036*16467b97STreehugger Robot        so EOF will not be part of the iteration.
1037*16467b97STreehugger Robot
1038*16467b97STreehugger Robot        """
1039*16467b97STreehugger Robot
1040*16467b97STreehugger Robot        token = self.nextToken()
1041*16467b97STreehugger Robot        if token is None or token.type == EOF:
1042*16467b97STreehugger Robot            raise StopIteration
1043*16467b97STreehugger Robot        return token
1044*16467b97STreehugger Robot
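    # Usage sketch: because TokenSource implements the iterator protocol
    # above, a concrete lexer can be consumed directly in a for loop; the
    # final EOF token is not yielded.  TLexer is a hypothetical generated
    # lexer; ANTLRStringStream comes from the antlr3 package:
    #
    #   from antlr3 import ANTLRStringStream
    #   lexer = TLexer(ANTLRStringStream("some input"))
    #   for token in lexer:
    #       print token.type, repr(token.text)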
1045*16467b97STreehugger Robot
1046*16467b97STreehugger Robotclass Lexer(BaseRecognizer, TokenSource):
1047*16467b97STreehugger Robot    """
1048*16467b97STreehugger Robot    @brief Baseclass for generated lexer classes.
1049*16467b97STreehugger Robot
1050*16467b97STreehugger Robot    A lexer is a recognizer that draws input symbols from a character stream.
1051*16467b97STreehugger Robot    Lexer grammars result in a subclass of this class. A Lexer object
1052*16467b97STreehugger Robot    uses simplified match() and error recovery mechanisms in the interest
1053*16467b97STreehugger Robot    of speed.
1054*16467b97STreehugger Robot    """
1055*16467b97STreehugger Robot
1056*16467b97STreehugger Robot    def __init__(self, input, state=None):
1057*16467b97STreehugger Robot        BaseRecognizer.__init__(self, state)
1058*16467b97STreehugger Robot        TokenSource.__init__(self)
1059*16467b97STreehugger Robot
1060*16467b97STreehugger Robot        # Where is the lexer drawing characters from?
1061*16467b97STreehugger Robot        self.input = input
1062*16467b97STreehugger Robot
1063*16467b97STreehugger Robot
1064*16467b97STreehugger Robot    def reset(self):
1065*16467b97STreehugger Robot        BaseRecognizer.reset(self) # reset all recognizer state variables
1066*16467b97STreehugger Robot
1067*16467b97STreehugger Robot        if self.input is not None:
1068*16467b97STreehugger Robot            # rewind the input
1069*16467b97STreehugger Robot            self.input.seek(0)
1070*16467b97STreehugger Robot
1071*16467b97STreehugger Robot        if self._state is None:
1072*16467b97STreehugger Robot            # no shared state work to do
1073*16467b97STreehugger Robot            return
1074*16467b97STreehugger Robot
1075*16467b97STreehugger Robot        # wipe out the Lexer state variables
1076*16467b97STreehugger Robot        self._state.token = None
1077*16467b97STreehugger Robot        self._state.type = INVALID_TOKEN_TYPE
1078*16467b97STreehugger Robot        self._state.channel = DEFAULT_CHANNEL
1079*16467b97STreehugger Robot        self._state.tokenStartCharIndex = -1
1080*16467b97STreehugger Robot        self._state.tokenStartLine = -1
1081*16467b97STreehugger Robot        self._state.tokenStartCharPositionInLine = -1
1082*16467b97STreehugger Robot        self._state.text = None
1083*16467b97STreehugger Robot
1084*16467b97STreehugger Robot
1085*16467b97STreehugger Robot    def makeEOFToken(self):
1086*16467b97STreehugger Robot        eof = CommonToken(
1087*16467b97STreehugger Robot            type=EOF, channel=DEFAULT_CHANNEL,
1088*16467b97STreehugger Robot            input=self.input,
1089*16467b97STreehugger Robot            start=self.input.index(), stop=self.input.index())
1090*16467b97STreehugger Robot        eof.line = self.input.line
1091*16467b97STreehugger Robot        eof.charPositionInLine = self.input.charPositionInLine
1092*16467b97STreehugger Robot        return eof
1093*16467b97STreehugger Robot
1094*16467b97STreehugger Robot    def nextToken(self):
1095*16467b97STreehugger Robot        """
1096*16467b97STreehugger Robot        Return a token from this source; i.e., match a token on the char
1097*16467b97STreehugger Robot        stream.
1098*16467b97STreehugger Robot        """
1099*16467b97STreehugger Robot
1100*16467b97STreehugger Robot        while 1:
1101*16467b97STreehugger Robot            self._state.token = None
1102*16467b97STreehugger Robot            self._state.channel = DEFAULT_CHANNEL
1103*16467b97STreehugger Robot            self._state.tokenStartCharIndex = self.input.index()
1104*16467b97STreehugger Robot            self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
1105*16467b97STreehugger Robot            self._state.tokenStartLine = self.input.line
1106*16467b97STreehugger Robot            self._state.text = None
1107*16467b97STreehugger Robot            if self.input.LA(1) == EOF:
1108*16467b97STreehugger Robot                return self.makeEOFToken()
1109*16467b97STreehugger Robot
1110*16467b97STreehugger Robot            try:
1111*16467b97STreehugger Robot                self.mTokens()
1112*16467b97STreehugger Robot
1113*16467b97STreehugger Robot                if self._state.token is None:
1114*16467b97STreehugger Robot                    self.emit()
1115*16467b97STreehugger Robot
1116*16467b97STreehugger Robot                elif self._state.token == SKIP_TOKEN:
1117*16467b97STreehugger Robot                    continue
1118*16467b97STreehugger Robot
1119*16467b97STreehugger Robot                return self._state.token
1120*16467b97STreehugger Robot
1121*16467b97STreehugger Robot            except NoViableAltException, re:
1122*16467b97STreehugger Robot                self.reportError(re)
1123*16467b97STreehugger Robot                self.recover(re) # throw out current char and try again
1124*16467b97STreehugger Robot
1125*16467b97STreehugger Robot            except RecognitionException, re:
1126*16467b97STreehugger Robot                self.reportError(re)
1127*16467b97STreehugger Robot                # match() routine has already called recover()
1128*16467b97STreehugger Robot
1129*16467b97STreehugger Robot
1130*16467b97STreehugger Robot    def skip(self):
1131*16467b97STreehugger Robot        """
1132*16467b97STreehugger Robot        Instruct the lexer to skip creating a token for the current lexer rule
1133*16467b97STreehugger Robot        and look for another token.  nextToken() knows to keep looking when
1134*16467b97STreehugger Robot        a lexer rule finishes with the token set to SKIP_TOKEN.  Recall that
1135*16467b97STreehugger Robot        if the token is None at the end of any token rule, nextToken() creates
1136*16467b97STreehugger Robot        one for you and emits it.
1137*16467b97STreehugger Robot        """
1138*16467b97STreehugger Robot
1139*16467b97STreehugger Robot        self._state.token = SKIP_TOKEN
1140*16467b97STreehugger Robot
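    # Illustrative sketch: a whitespace rule in a lexer grammar would
    # typically call skip() from its action so no token reaches the parser
    # (grammar snippet, not runtime code):
    #
    #   WS : (' ' | '\t' | '\r' | '\n')+ { self.skip() } ;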
1141*16467b97STreehugger Robot
1142*16467b97STreehugger Robot    def mTokens(self):
1143*16467b97STreehugger Robot        """This is the lexer entry point that sets instance var 'token'"""
1144*16467b97STreehugger Robot
1145*16467b97STreehugger Robot        # abstract method
1146*16467b97STreehugger Robot        raise NotImplementedError
1147*16467b97STreehugger Robot
1148*16467b97STreehugger Robot
1149*16467b97STreehugger Robot    def setCharStream(self, input):
1150*16467b97STreehugger Robot        """Set the char stream and reset the lexer"""
1151*16467b97STreehugger Robot        self.input = None
1152*16467b97STreehugger Robot        self.reset()
1153*16467b97STreehugger Robot        self.input = input
1154*16467b97STreehugger Robot
1155*16467b97STreehugger Robot
1156*16467b97STreehugger Robot    def getSourceName(self):
1157*16467b97STreehugger Robot        return self.input.getSourceName()
1158*16467b97STreehugger Robot
1159*16467b97STreehugger Robot
1160*16467b97STreehugger Robot    def emit(self, token=None):
1161*16467b97STreehugger Robot        """
1162*16467b97STreehugger Robot        The standard method called to automatically emit a token at the
1163*16467b97STreehugger Robot        outermost lexical rule.  The token object should point into the
1164*16467b97STreehugger Robot        char buffer start..stop.  If there is a text override in 'text',
1165*16467b97STreehugger Robot        use that to set the token's text.  Override this method to emit
1166*16467b97STreehugger Robot        custom Token objects.
1167*16467b97STreehugger Robot
1168*16467b97STreehugger Robot        If you are building trees, then you should also override
1169*16467b97STreehugger Robot        Parser or TreeParser.getMissingSymbol().
1170*16467b97STreehugger Robot        """
1171*16467b97STreehugger Robot
1172*16467b97STreehugger Robot        if token is None:
1173*16467b97STreehugger Robot            token = CommonToken(
1174*16467b97STreehugger Robot                input=self.input,
1175*16467b97STreehugger Robot                type=self._state.type,
1176*16467b97STreehugger Robot                channel=self._state.channel,
1177*16467b97STreehugger Robot                start=self._state.tokenStartCharIndex,
1178*16467b97STreehugger Robot                stop=self.getCharIndex()-1
1179*16467b97STreehugger Robot                )
1180*16467b97STreehugger Robot            token.line = self._state.tokenStartLine
1181*16467b97STreehugger Robot            token.text = self._state.text
1182*16467b97STreehugger Robot            token.charPositionInLine = self._state.tokenStartCharPositionInLine
1183*16467b97STreehugger Robot
1184*16467b97STreehugger Robot        self._state.token = token
1185*16467b97STreehugger Robot
1186*16467b97STreehugger Robot        return token
1187*16467b97STreehugger Robot
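    # Sketch of overriding emit() to produce custom token objects, as the
    # docstring above suggests.  MyLexer, TLexer and MyToken are hypothetical;
    # MyToken is assumed to accept the same keyword arguments as CommonToken:
    #
    #   class MyLexer(TLexer):
    #       def emit(self, token=None):
    #           if token is None:
    #               token = MyToken(
    #                   input=self.input,
    #                   type=self._state.type,
    #                   channel=self._state.channel,
    #                   start=self._state.tokenStartCharIndex,
    #                   stop=self.getCharIndex() - 1)
    #               token.line = self._state.tokenStartLine
    #               token.text = self._state.text
    #               token.charPositionInLine = \
    #                   self._state.tokenStartCharPositionInLine
    #           self._state.token = token
    #           return token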
1188*16467b97STreehugger Robot
1189*16467b97STreehugger Robot    def match(self, s):
1190*16467b97STreehugger Robot        if isinstance(s, basestring):
1191*16467b97STreehugger Robot            for c in s:
1192*16467b97STreehugger Robot                if self.input.LA(1) != ord(c):
1193*16467b97STreehugger Robot                    if self._state.backtracking > 0:
1194*16467b97STreehugger Robot                        raise BacktrackingFailed
1195*16467b97STreehugger Robot
1196*16467b97STreehugger Robot                    mte = MismatchedTokenException(c, self.input)
1197*16467b97STreehugger Robot                    self.recover(mte)
1198*16467b97STreehugger Robot                    raise mte
1199*16467b97STreehugger Robot
1200*16467b97STreehugger Robot                self.input.consume()
1201*16467b97STreehugger Robot
1202*16467b97STreehugger Robot        else:
1203*16467b97STreehugger Robot            if self.input.LA(1) != s:
1204*16467b97STreehugger Robot                if self._state.backtracking > 0:
1205*16467b97STreehugger Robot                    raise BacktrackingFailed
1206*16467b97STreehugger Robot
1207*16467b97STreehugger Robot                mte = MismatchedTokenException(unichr(s), self.input)
1208*16467b97STreehugger Robot                self.recover(mte) # don't really recover; just consume in lexer
1209*16467b97STreehugger Robot                raise mte
1210*16467b97STreehugger Robot
1211*16467b97STreehugger Robot            self.input.consume()
1212*16467b97STreehugger Robot
1213*16467b97STreehugger Robot
1214*16467b97STreehugger Robot    def matchAny(self):
1215*16467b97STreehugger Robot        self.input.consume()
1216*16467b97STreehugger Robot
1217*16467b97STreehugger Robot
1218*16467b97STreehugger Robot    def matchRange(self, a, b):
1219*16467b97STreehugger Robot        if self.input.LA(1) < a or self.input.LA(1) > b:
1220*16467b97STreehugger Robot            if self._state.backtracking > 0:
1221*16467b97STreehugger Robot                raise BacktrackingFailed
1222*16467b97STreehugger Robot
1223*16467b97STreehugger Robot            mre = MismatchedRangeException(unichr(a), unichr(b), self.input)
1224*16467b97STreehugger Robot            self.recover(mre)
1225*16467b97STreehugger Robot            raise mre
1226*16467b97STreehugger Robot
1227*16467b97STreehugger Robot        self.input.consume()
1228*16467b97STreehugger Robot
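    # Illustrative sketch (not generated code) of how a lexer rule drives the
    # matching primitives above; ID is a hypothetical grammar-defined token
    # type for a lower-case identifier:
    #
    #   def mID(self):
    #       self._state.type = ID
    #       self._state.channel = DEFAULT_CHANNEL
    #       self.matchRange(ord('a'), ord('z'))
    #       while ord('a') <= self.input.LA(1) <= ord('z'):
    #           self.matchRange(ord('a'), ord('z'))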
1229*16467b97STreehugger Robot
1230*16467b97STreehugger Robot    def getLine(self):
1231*16467b97STreehugger Robot        return self.input.line
1232*16467b97STreehugger Robot
1233*16467b97STreehugger Robot
1234*16467b97STreehugger Robot    def getCharPositionInLine(self):
1235*16467b97STreehugger Robot        return self.input.charPositionInLine
1236*16467b97STreehugger Robot
1237*16467b97STreehugger Robot
1238*16467b97STreehugger Robot    def getCharIndex(self):
1239*16467b97STreehugger Robot        """What is the index of the current character of lookahead?"""
1240*16467b97STreehugger Robot
1241*16467b97STreehugger Robot        return self.input.index()
1242*16467b97STreehugger Robot
1243*16467b97STreehugger Robot
1244*16467b97STreehugger Robot    def getText(self):
1245*16467b97STreehugger Robot        """
1246*16467b97STreehugger Robot        Return the text matched so far for the current token or any
1247*16467b97STreehugger Robot        text override.
1248*16467b97STreehugger Robot        """
1249*16467b97STreehugger Robot        if self._state.text is not None:
1250*16467b97STreehugger Robot            return self._state.text
1251*16467b97STreehugger Robot
1252*16467b97STreehugger Robot        return self.input.substring(
1253*16467b97STreehugger Robot            self._state.tokenStartCharIndex,
1254*16467b97STreehugger Robot            self.getCharIndex()-1
1255*16467b97STreehugger Robot            )
1256*16467b97STreehugger Robot
1257*16467b97STreehugger Robot
1258*16467b97STreehugger Robot    def setText(self, text):
1259*16467b97STreehugger Robot        """
1260*16467b97STreehugger Robot        Set the complete text of this token; it wipes any previous
1261*16467b97STreehugger Robot        changes to the text.
1262*16467b97STreehugger Robot        """
1263*16467b97STreehugger Robot        self._state.text = text
1264*16467b97STreehugger Robot
1265*16467b97STreehugger Robot
1266*16467b97STreehugger Robot    text = property(getText, setText)
1267*16467b97STreehugger Robot
1268*16467b97STreehugger Robot
1269*16467b97STreehugger Robot    def reportError(self, e):
1270*16467b97STreehugger Robot        ## TODO: not thought about recovery in lexer yet.
1271*16467b97STreehugger Robot
1272*16467b97STreehugger Robot        ## # if we've already reported an error and have not matched a token
1273*16467b97STreehugger Robot        ## # yet successfully, don't report any errors.
1274*16467b97STreehugger Robot        ## if self.errorRecovery:
1275*16467b97STreehugger Robot        ##     #System.err.print("[SPURIOUS] ");
1276*16467b97STreehugger Robot        ##     return;
1277*16467b97STreehugger Robot        ##
1278*16467b97STreehugger Robot        ## self.errorRecovery = True
1279*16467b97STreehugger Robot
1280*16467b97STreehugger Robot        self.displayRecognitionError(self.tokenNames, e)
1281*16467b97STreehugger Robot
1282*16467b97STreehugger Robot
1283*16467b97STreehugger Robot    def getErrorMessage(self, e, tokenNames):
1284*16467b97STreehugger Robot        msg = None
1285*16467b97STreehugger Robot
1286*16467b97STreehugger Robot        if isinstance(e, MismatchedTokenException):
1287*16467b97STreehugger Robot            msg = "mismatched character " \
1288*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c) \
1289*16467b97STreehugger Robot                  + " expecting " \
1290*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.expecting)
1291*16467b97STreehugger Robot
1292*16467b97STreehugger Robot        elif isinstance(e, NoViableAltException):
1293*16467b97STreehugger Robot            msg = "no viable alternative at character " \
1294*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c)
1295*16467b97STreehugger Robot
1296*16467b97STreehugger Robot        elif isinstance(e, EarlyExitException):
1297*16467b97STreehugger Robot            msg = "required (...)+ loop did not match anything at character " \
1298*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c)
1299*16467b97STreehugger Robot
1300*16467b97STreehugger Robot        elif isinstance(e, MismatchedNotSetException):
1301*16467b97STreehugger Robot            msg = "mismatched character " \
1302*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c) \
1303*16467b97STreehugger Robot                  + " expecting set " \
1304*16467b97STreehugger Robot                  + repr(e.expecting)
1305*16467b97STreehugger Robot
1306*16467b97STreehugger Robot        elif isinstance(e, MismatchedSetException):
1307*16467b97STreehugger Robot            msg = "mismatched character " \
1308*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c) \
1309*16467b97STreehugger Robot                  + " expecting set " \
1310*16467b97STreehugger Robot                  + repr(e.expecting)
1311*16467b97STreehugger Robot
1312*16467b97STreehugger Robot        elif isinstance(e, MismatchedRangeException):
1313*16467b97STreehugger Robot            msg = "mismatched character " \
1314*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.c) \
1315*16467b97STreehugger Robot                  + " expecting set " \
1316*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.a) \
1317*16467b97STreehugger Robot                  + ".." \
1318*16467b97STreehugger Robot                  + self.getCharErrorDisplay(e.b)
1319*16467b97STreehugger Robot
1320*16467b97STreehugger Robot        else:
1321*16467b97STreehugger Robot            msg = BaseRecognizer.getErrorMessage(self, e, tokenNames)
1322*16467b97STreehugger Robot
1323*16467b97STreehugger Robot        return msg
1324*16467b97STreehugger Robot
1325*16467b97STreehugger Robot
1326*16467b97STreehugger Robot    def getCharErrorDisplay(self, c):
1327*16467b97STreehugger Robot        if c == EOF:
1328*16467b97STreehugger Robot            c = '<EOF>'
1329*16467b97STreehugger Robot        return repr(c)
1330*16467b97STreehugger Robot
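    # Sketch of a common customisation: collect lexer errors instead of
    # printing them, by overriding displayRecognitionError() (which
    # reportError() above calls).  TLexer is a hypothetical generated lexer;
    # getErrorHeader() is inherited from BaseRecognizer:
    #
    #   class QuietLexer(TLexer):
    #       def __init__(self, *args, **kwargs):
    #           TLexer.__init__(self, *args, **kwargs)
    #           self.error_log = []
    #
    #       def displayRecognitionError(self, tokenNames, e):
    #           self.error_log.append(
    #               "%s %s" % (self.getErrorHeader(e),
    #                          self.getErrorMessage(e, tokenNames)))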
1331*16467b97STreehugger Robot
1332*16467b97STreehugger Robot    def recover(self, re):
1333*16467b97STreehugger Robot        """
1334*16467b97STreehugger Robot        Lexers can normally match any char in their vocabulary after matching
1335*16467b97STreehugger Robot        a token, so do the easy thing and just kill a character and hope
1336*16467b97STreehugger Robot        it all works out.  You can instead use the rule invocation stack
1337*16467b97STreehugger Robot        to do sophisticated error recovery if you are in a fragment rule.
1338*16467b97STreehugger Robot        """
1339*16467b97STreehugger Robot
1340*16467b97STreehugger Robot        self.input.consume()
1341*16467b97STreehugger Robot
1342*16467b97STreehugger Robot
1343*16467b97STreehugger Robot    def traceIn(self, ruleName, ruleIndex):
1344*16467b97STreehugger Robot        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
1345*16467b97STreehugger Robot                                         self.getLine(),
1346*16467b97STreehugger Robot                                         self.getCharPositionInLine()
1347*16467b97STreehugger Robot                                         )
1348*16467b97STreehugger Robot
1349*16467b97STreehugger Robot        BaseRecognizer.traceIn(self, ruleName, ruleIndex, inputSymbol)
1350*16467b97STreehugger Robot
1351*16467b97STreehugger Robot
1352*16467b97STreehugger Robot    def traceOut(self, ruleName, ruleIndex):
1353*16467b97STreehugger Robot        inputSymbol = "%s line=%d:%s" % (self.input.LT(1),
1354*16467b97STreehugger Robot                                         self.getLine(),
1355*16467b97STreehugger Robot                                         self.getCharPositionInLine()
1356*16467b97STreehugger Robot                                         )
1357*16467b97STreehugger Robot
1358*16467b97STreehugger Robot        BaseRecognizer.traceOut(self, ruleName, ruleIndex, inputSymbol)
1359*16467b97STreehugger Robot
1360*16467b97STreehugger Robot
1361*16467b97STreehugger Robot
1362*16467b97STreehugger Robotclass Parser(BaseRecognizer):
1363*16467b97STreehugger Robot    """
1364*16467b97STreehugger Robot    @brief Baseclass for generated parser classes.
1365*16467b97STreehugger Robot    """
1366*16467b97STreehugger Robot
1367*16467b97STreehugger Robot    def __init__(self, lexer, state=None):
1368*16467b97STreehugger Robot        BaseRecognizer.__init__(self, state)
1369*16467b97STreehugger Robot
1370*16467b97STreehugger Robot        self.input = lexer
1371*16467b97STreehugger Robot
1372*16467b97STreehugger Robot
1373*16467b97STreehugger Robot    def reset(self):
1374*16467b97STreehugger Robot        BaseRecognizer.reset(self) # reset all recognizer state variables
1375*16467b97STreehugger Robot        if self.input is not None:
1376*16467b97STreehugger Robot            self.input.seek(0) # rewind the input
1377*16467b97STreehugger Robot
1378*16467b97STreehugger Robot
1379*16467b97STreehugger Robot    def getCurrentInputSymbol(self, input):
1380*16467b97STreehugger Robot        return input.LT(1)
1381*16467b97STreehugger Robot
1382*16467b97STreehugger Robot
1383*16467b97STreehugger Robot    def getMissingSymbol(self, input, e, expectedTokenType, follow):
1384*16467b97STreehugger Robot        if expectedTokenType == EOF:
1385*16467b97STreehugger Robot            tokenText = "<missing EOF>"
1386*16467b97STreehugger Robot        else:
1387*16467b97STreehugger Robot            tokenText = "<missing " + self.tokenNames[expectedTokenType] + ">"
1388*16467b97STreehugger Robot        t = CommonToken(type=expectedTokenType, text=tokenText)
1389*16467b97STreehugger Robot        current = input.LT(1)
1390*16467b97STreehugger Robot        if current.type == EOF:
1391*16467b97STreehugger Robot            current = input.LT(-1)
1392*16467b97STreehugger Robot
1393*16467b97STreehugger Robot        if current is not None:
1394*16467b97STreehugger Robot            t.line = current.line
1395*16467b97STreehugger Robot            t.charPositionInLine = current.charPositionInLine
1396*16467b97STreehugger Robot        t.channel = DEFAULT_CHANNEL
1397*16467b97STreehugger Robot        return t
1398*16467b97STreehugger Robot
1399*16467b97STreehugger Robot
1400*16467b97STreehugger Robot    def setTokenStream(self, input):
1401*16467b97STreehugger Robot        """Set the token stream and reset the parser"""
1402*16467b97STreehugger Robot
1403*16467b97STreehugger Robot        self.input = None
1404*16467b97STreehugger Robot        self.reset()
1405*16467b97STreehugger Robot        self.input = input
1406*16467b97STreehugger Robot
1407*16467b97STreehugger Robot
1408*16467b97STreehugger Robot    def getTokenStream(self):
1409*16467b97STreehugger Robot        return self.input
1410*16467b97STreehugger Robot
1411*16467b97STreehugger Robot
1412*16467b97STreehugger Robot    def getSourceName(self):
1413*16467b97STreehugger Robot        return self.input.getSourceName()
1414*16467b97STreehugger Robot
1415*16467b97STreehugger Robot
1416*16467b97STreehugger Robot    def traceIn(self, ruleName, ruleIndex):
1417*16467b97STreehugger Robot        BaseRecognizer.traceIn(self, ruleName, ruleIndex, self.input.LT(1))
1418*16467b97STreehugger Robot
1419*16467b97STreehugger Robot
1420*16467b97STreehugger Robot    def traceOut(self, ruleName, ruleIndex):
1421*16467b97STreehugger Robot        BaseRecognizer.traceOut(self, ruleName, ruleIndex, self.input.LT(1))
1422*16467b97STreehugger Robot
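    # Sketch of the usual wiring for a generated lexer/parser pair; TLexer,
    # TParser and the start rule "prog" are hypothetical, the stream classes
    # are the standard antlr3 ones:
    #
    #   from antlr3 import ANTLRStringStream, CommonTokenStream
    #   char_stream = ANTLRStringStream("1 + 2\n")
    #   tokens = CommonTokenStream(TLexer(char_stream))
    #   parser = TParser(tokens)
    #   result = parser.prog()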
1423*16467b97STreehugger Robot
1424*16467b97STreehugger Robotclass RuleReturnScope(object):
1425*16467b97STreehugger Robot    """
1426*16467b97STreehugger Robot    Rules can return start/stop info as well as possible trees and templates.
1427*16467b97STreehugger Robot    """
1428*16467b97STreehugger Robot
1429*16467b97STreehugger Robot    def getStart(self):
1430*16467b97STreehugger Robot        """Return the start token or tree."""
1431*16467b97STreehugger Robot        return None
1432*16467b97STreehugger Robot
1433*16467b97STreehugger Robot
1434*16467b97STreehugger Robot    def getStop(self):
1435*16467b97STreehugger Robot        """Return the stop token or tree."""
1436*16467b97STreehugger Robot        return None
1437*16467b97STreehugger Robot
1438*16467b97STreehugger Robot
1439*16467b97STreehugger Robot    def getTree(self):
1440*16467b97STreehugger Robot        """Has a value potentially if output=AST."""
1441*16467b97STreehugger Robot        return None
1442*16467b97STreehugger Robot
1443*16467b97STreehugger Robot
1444*16467b97STreehugger Robot    def getTemplate(self):
1445*16467b97STreehugger Robot        """Has a value potentially if output=template."""
1446*16467b97STreehugger Robot        return None
1447*16467b97STreehugger Robot
1448*16467b97STreehugger Robot
1449*16467b97STreehugger Robotclass ParserRuleReturnScope(RuleReturnScope):
1450*16467b97STreehugger Robot    """
1451*16467b97STreehugger Robot    Rules that return more than a single value must return an object
1452*16467b97STreehugger Robot    containing all the values.  Besides the properties defined in
1453*16467b97STreehugger Robot    RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
1454*16467b97STreehugger Robot    return values.  This class simply defines the minimum properties that
1455*16467b97STreehugger Robot    are always defined and methods to access the others that might be
1456*16467b97STreehugger Robot    available depending on output option such as template and tree.
1457*16467b97STreehugger Robot
1458*16467b97STreehugger Robot    Note that text is not an actual property of the return value; it is computed
1459*16467b97STreehugger Robot    from start and stop using the input stream's toString() method.  I
1460*16467b97STreehugger Robot    could add a ctor to this so that we can pass in and store the input
1461*16467b97STreehugger Robot    stream, but I'm not sure we want to do that.  It would seem to be undefined
1462*16467b97STreehugger Robot    to get the .text property anyway if the rule matches tokens from multiple
1463*16467b97STreehugger Robot    input streams.
1464*16467b97STreehugger Robot
1465*16467b97STreehugger Robot    I do not use getters for fields of objects that are used simply to
1466*16467b97STreehugger Robot    group values such as this aggregate.  The getters/setters are there to
1467*16467b97STreehugger Robot    satisfy the superclass interface.
1468*16467b97STreehugger Robot    """
1469*16467b97STreehugger Robot
1470*16467b97STreehugger Robot    def __init__(self):
1471*16467b97STreehugger Robot        self.start = None
1472*16467b97STreehugger Robot        self.stop = None
1473*16467b97STreehugger Robot        self.tree = None  # only used when output=AST
1474*16467b97STreehugger Robot
1475*16467b97STreehugger Robot
1476*16467b97STreehugger Robot    def getStart(self):
1477*16467b97STreehugger Robot        return self.start
1478*16467b97STreehugger Robot
1479*16467b97STreehugger Robot
1480*16467b97STreehugger Robot    def getStop(self):
1481*16467b97STreehugger Robot        return self.stop
1482*16467b97STreehugger Robot
1483*16467b97STreehugger Robot
1484*16467b97STreehugger Robot    def getTree(self):
1485*16467b97STreehugger Robot        return self.tree
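

    # Sketch: a generated rule returns a ParserRuleReturnScope-like object,
    # so callers can read its fields directly; "prog" and the tree output
    # option are hypothetical for this example:
    #
    #   ret = parser.prog()
    #   first_token = ret.start        # same as ret.getStart()
    #   last_token = ret.stop
    #   ast = ret.tree                 # only populated when output=AST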
1486