xref: /aosp_15_r20/external/antlr/runtime/Ruby/lib/antlr3/recognizers.rb (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot#!/usr/bin/ruby
2*16467b97STreehugger Robot# encoding: utf-8
3*16467b97STreehugger Robot
4*16467b97STreehugger Robot=begin LICENSE
5*16467b97STreehugger Robot
6*16467b97STreehugger Robot[The "BSD licence"]
7*16467b97STreehugger RobotCopyright (c) 2009-2010 Kyle Yetter
8*16467b97STreehugger RobotAll rights reserved.
9*16467b97STreehugger Robot
10*16467b97STreehugger RobotRedistribution and use in source and binary forms, with or without
11*16467b97STreehugger Robotmodification, are permitted provided that the following conditions
12*16467b97STreehugger Robotare met:
13*16467b97STreehugger Robot
14*16467b97STreehugger Robot 1. Redistributions of source code must retain the above copyright
15*16467b97STreehugger Robot    notice, this list of conditions and the following disclaimer.
16*16467b97STreehugger Robot 2. Redistributions in binary form must reproduce the above copyright
17*16467b97STreehugger Robot    notice, this list of conditions and the following disclaimer in the
18*16467b97STreehugger Robot    documentation and/or other materials provided with the distribution.
19*16467b97STreehugger Robot 3. The name of the author may not be used to endorse or promote products
20*16467b97STreehugger Robot    derived from this software without specific prior written permission.
21*16467b97STreehugger Robot
22*16467b97STreehugger RobotTHIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23*16467b97STreehugger RobotIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24*16467b97STreehugger RobotOF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25*16467b97STreehugger RobotIN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26*16467b97STreehugger RobotINCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27*16467b97STreehugger RobotNOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28*16467b97STreehugger RobotDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29*16467b97STreehugger RobotTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30*16467b97STreehugger Robot(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31*16467b97STreehugger RobotTHIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32*16467b97STreehugger Robot
33*16467b97STreehugger Robot=end
34*16467b97STreehugger Robot
35*16467b97STreehugger Robotmodule ANTLR3
36*16467b97STreehugger Robotunless const_defined?( :RecognizerSharedState )
37*16467b97STreehugger Robot
RecognizerSharedState = Struct.new(
  # error tracking / recovery
  :following, :error_recovery, :last_error_index,
  # prediction bookkeeping
  :backtracking, :rule_memory, :syntax_errors,
  # token under construction (used by lexer rules)
  :token, :token_start_position, :token_start_line, :token_start_column,
  :channel, :type, :text
)

=begin rdoc ANTLR3::RecognizerSharedState

A Struct-based class that bundles most of the data making up a recognizer's
state. The attributes live outside of the recognizer itself so that recognizer
delegation (which happens when a grammar imports other grammars) can work:
several recognizers can share one common state object.

== Structure Attributes

following::
  a stack that tracks follow sets for error recovery
error_recovery::
  a flag indicating whether or not the recognizer is in error recovery mode
last_error_index::
  the index in the input stream of the last error
backtracking::
  tracks the backtracking depth
rule_memory::
  if a grammar is compiled with the memoization option, this will be
  set to a hash mapping previously parsed rules to cached indices
syntax_errors::
  tracks the number of syntax errors seen so far
token::
  holds newly constructed tokens for lexer rules
token_start_position::
  the input stream index at which the token starts
token_start_line::
  the input stream line number at which the token starts
token_start_column::
  the input stream column at which the token starts
channel::
  the channel value of the target token
type::
  the type value of the target token
text::
  the text of the target token

=end

class RecognizerSharedState
  # Creates a state object whose members start out at their defaults:
  # an empty follow stack, no error recovery in progress, no error or
  # token position recorded (-1), zero counters, and nil everywhere else.
  def initialize
    super()
    self.following = []
    self.error_recovery = false
    self.last_error_index = -1
    self.backtracking = 0
    self.syntax_errors = 0
    self.token_start_position = -1
  end


  # Puts every state variable back to its initial default value.
  # The follow stack and the rule memory (when one is present) are
  # emptied in place rather than replaced. Returns nil.
  def reset!
    following.clear
    self.error_recovery = false
    self.last_error_index = -1
    self.backtracking = 0
    memo = rule_memory
    memo.clear if memo
    self.syntax_errors = 0
    self.token_start_position = -1
    self.token = self.token_start_line = self.token_start_column = nil
    self.channel = self.type = self.text = nil
  end
end
124*16467b97STreehugger Robot
125*16467b97STreehugger Robotend # unless const_defined?( :RecognizerSharedState )
126*16467b97STreehugger Robot
=begin rdoc ANTLR3::Scope
128*16467b97STreehugger Robot
129*16467b97STreehugger Robot= Scope
130*16467b97STreehugger Robot
131*16467b97STreehugger RobotScope is used to represent instances of ANTLR's various attribute scopes.
132*16467b97STreehugger RobotIt is identical to Ruby's built-in Struct class, but it takes string
133*16467b97STreehugger Robotattribute declarations from the ANTLR grammar as parameters, and overrides
134*16467b97STreehugger Robotthe #initialize method to set the default values if any are present in
135*16467b97STreehugger Robotthe scope declaration.
136*16467b97STreehugger Robot
137*16467b97STreehugger Robot  Block = Scope.new( "name", "depth = 0", "variables = {}" )
138*16467b97STreehugger Robot  Block.new                    # => #<struct Block name=nil, depth=0, variables={}>
139*16467b97STreehugger Robot  Block.new( "function" )      # => #<struct Block name="function", depth=0, variables={}>
140*16467b97STreehugger Robot  Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }>
141*16467b97STreehugger Robot
142*16467b97STreehugger Robot=end
143*16467b97STreehugger Robot
class Scope < ::Struct
  # Builds a new Struct-like class from ANTLR attribute declarations.
  #
  # Each declaration is a string (or anything responding to +to_s+) of the
  # form <tt>"name"</tt> or <tt>"name = default"</tt>, where +default+ is a
  # snippet of Ruby source that is evaluated every time an instance is
  # created without a value for that member. Whitespace around the
  # declaration, the name, and the default is ignored.
  #
  # An optional block is evaluated in the context of the new class, in the
  # same way as the block accepted by Struct.new.
  #
  # Returns the newly created class. Raises ArgumentError if a declaration
  # does not contain an attribute name.
  def self.new( *declarations, &body )
    names = []
    defaults = {}
    declarations.each do | decl |
      # strip first so that " name" or "depth = 0 " do not leak whitespace
      # into the member name or the generated default expression
      name, default = decl.to_s.strip.split( /\s*=\s*/, 2 )
      if name.nil? or name.empty?
        raise ArgumentError, "invalid scope attribute declaration: %p" % [ decl ]
      end
      name = name.to_sym
      names << name
      # a trailing "=" with no expression is treated as "no default"
      defaults[ name ] = default unless default.nil? or default.empty?
    end

    super( *names ) do

      # If no defaults, leave the initialize method the same as
      # the struct's default initialize for speed. Otherwise,
      # overwrite the initialize to populate with default values.
      unless defaults.empty?
        parameters = names.map do | name |
          "#{ name } = #{ defaults.fetch( name, 'nil' ) }"
        end.join( ', ' )
        # NOTE: the default expressions are evaluated as Ruby source; they
        # are expected to come from the grammar, not from untrusted input.
        class_eval( <<-END, __FILE__, __LINE__ + 1 )
          def initialize( #{ parameters } )
            super( #{ names.join( ', ' ) } )
          end
        END
      end

      body and class_eval( &body )
    end
  end
end
173*16467b97STreehugger Robot
174*16467b97STreehugger Robot=begin rdoc ANTLR3::Recognizer
175*16467b97STreehugger Robot
176*16467b97STreehugger Robot= Recognizer
177*16467b97STreehugger Robot
178*16467b97STreehugger RobotAs the base class of all ANTLR-generated recognizers, Recognizer provides
179*16467b97STreehugger Robotmuch of the shared functionality and structure used in the recognition process.
180*16467b97STreehugger RobotFor all effective purposes, the class and its immediate subclasses Lexer,
181*16467b97STreehugger RobotParser, and TreeParser are abstract classes. They can be instantiated, but
182*16467b97STreehugger Robotthey're pretty useless on their own. Instead, to make useful code, you write an
183*16467b97STreehugger RobotANTLR grammar and ANTLR will generate classes which inherit from one of the
recognizer base classes, providing the implementation of the grammar rules
themselves. The generated code relies on this group of classes to implement
the necessary tasks. Recognizer
186*16467b97STreehugger Robotdefines methods related to:
187*16467b97STreehugger Robot
188*16467b97STreehugger Robot* token and character matching
189*16467b97STreehugger Robot* prediction and recognition strategy
190*16467b97STreehugger Robot* recovering from errors
191*16467b97STreehugger Robot* reporting errors
192*16467b97STreehugger Robot* memoization
193*16467b97STreehugger Robot* simple rule tracing and debugging
194*16467b97STreehugger Robot
195*16467b97STreehugger Robot=end
196*16467b97STreehugger Robot
197*16467b97STreehugger Robotclass Recognizer
198*16467b97STreehugger Robot  include Constants
199*16467b97STreehugger Robot  include Error
200*16467b97STreehugger Robot  include TokenFactory
201*16467b97STreehugger Robot  extend ClassMacros
202*16467b97STreehugger Robot
203*16467b97STreehugger Robot  @rules = {}
204*16467b97STreehugger Robot
205*16467b97STreehugger Robot  # inherited class methods and hooks
206*16467b97STreehugger Robot  class << self
207*16467b97STreehugger Robot    attr_reader :grammar_file_name,
208*16467b97STreehugger Robot                :antlr_version,
209*16467b97STreehugger Robot                :antlr_version_string,
210*16467b97STreehugger Robot                :library_version_string,
211*16467b97STreehugger Robot                :grammar_home
212*16467b97STreehugger Robot
213*16467b97STreehugger Robot    attr_accessor :token_scheme, :default_rule
214*16467b97STreehugger Robot
215*16467b97STreehugger Robot    # generated recognizer code uses this method to stamp
216*16467b97STreehugger Robot    # the code with the name of the grammar file and
217*16467b97STreehugger Robot    # the current version of ANTLR being used to generate
218*16467b97STreehugger Robot    # the code
219*16467b97STreehugger Robot    def generated_using( grammar_file, antlr_version, library_version = nil )
220*16467b97STreehugger Robot      @grammar_file_name = grammar_file.freeze
221*16467b97STreehugger Robot      @antlr_version_string = antlr_version.freeze
222*16467b97STreehugger Robot      @library_version = Util.parse_version( library_version )
223*16467b97STreehugger Robot      if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/
224*16467b97STreehugger Robot        @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i }
225*16467b97STreehugger Robot        timestamp = $5.strip
226*16467b97STreehugger Robot        #@antlr_release_time = $5.empty? ? nil : Time.parse($5)
227*16467b97STreehugger Robot      else
228*16467b97STreehugger Robot        raise "bad version string: %p" % version_string
229*16467b97STreehugger Robot      end
230*16467b97STreehugger Robot    end
231*16467b97STreehugger Robot
232*16467b97STreehugger Robot    # this method is used to generate return-value structures for
233*16467b97STreehugger Robot    # rules with multiple return values. To avoid generating
    # a special class for every rule in AST parsers and such
235*16467b97STreehugger Robot    # (where most rules have the same default set of return values),
236*16467b97STreehugger Robot    # each recognizer gets a default return value structure
237*16467b97STreehugger Robot    # assigned to the constant +Return+. Rules which don't
238*16467b97STreehugger Robot    # require additional custom members will have a rule-return
239*16467b97STreehugger Robot    # name constant that just points to the generic return
240*16467b97STreehugger Robot    # value.
241*16467b97STreehugger Robot    def define_return_scope( *members )
242*16467b97STreehugger Robot      if members.empty? then generic_return_scope
243*16467b97STreehugger Robot      else
244*16467b97STreehugger Robot        members += return_scope_members
245*16467b97STreehugger Robot        Struct.new( *members )
246*16467b97STreehugger Robot      end
247*16467b97STreehugger Robot    end
248*16467b97STreehugger Robot
249*16467b97STreehugger Robot    # used as a hook to add additional default members
250*16467b97STreehugger Robot    # to default return value structures
251*16467b97STreehugger Robot    # For example, all AST-building parsers override
252*16467b97STreehugger Robot    # this method to add an extra +:tree+ field to
253*16467b97STreehugger Robot    # all rule return structures.
254*16467b97STreehugger Robot    def return_scope_members
255*16467b97STreehugger Robot      [ :start, :stop ]
256*16467b97STreehugger Robot    end
257*16467b97STreehugger Robot
258*16467b97STreehugger Robot    # sets up and returns the generic rule return
259*16467b97STreehugger Robot    # scope for a recognizer
260*16467b97STreehugger Robot    def generic_return_scope
261*16467b97STreehugger Robot      @generic_return_scope ||= begin
262*16467b97STreehugger Robot        struct = Struct.new( *return_scope_members )
263*16467b97STreehugger Robot        const_set( :Return, struct )
264*16467b97STreehugger Robot      end
265*16467b97STreehugger Robot    end
266*16467b97STreehugger Robot
267*16467b97STreehugger Robot    def imported_grammars
268*16467b97STreehugger Robot      @imported_grammars ||= Set.new
269*16467b97STreehugger Robot    end
270*16467b97STreehugger Robot
271*16467b97STreehugger Robot    def master_grammars
272*16467b97STreehugger Robot      @master_grammars ||= []
273*16467b97STreehugger Robot    end
274*16467b97STreehugger Robot
275*16467b97STreehugger Robot    def master
276*16467b97STreehugger Robot      master_grammars.last
277*16467b97STreehugger Robot    end
278*16467b97STreehugger Robot
279*16467b97STreehugger Robot    def masters( *grammar_names )
280*16467b97STreehugger Robot      for grammar in grammar_names
281*16467b97STreehugger Robot        unless master_grammars.include?( grammar )
282*16467b97STreehugger Robot          master_grammars << grammar
283*16467b97STreehugger Robot          attr_reader( Util.snake_case( grammar ) )
284*16467b97STreehugger Robot        end
285*16467b97STreehugger Robot      end
286*16467b97STreehugger Robot    end
287*16467b97STreehugger Robot    private :masters
288*16467b97STreehugger Robot
289*16467b97STreehugger Robot    def imports( *grammar_names )
290*16467b97STreehugger Robot      for grammar in grammar_names
291*16467b97STreehugger Robot        imported_grammars.add?( grammar.to_sym ) and
292*16467b97STreehugger Robot          attr_reader( Util.snake_case( grammar ) )
293*16467b97STreehugger Robot      end
294*16467b97STreehugger Robot      return imported_grammars
295*16467b97STreehugger Robot    end
296*16467b97STreehugger Robot    private :imports
297*16467b97STreehugger Robot
298*16467b97STreehugger Robot    def rules
299*16467b97STreehugger Robot      self::RULE_METHODS.dup rescue []
300*16467b97STreehugger Robot    end
301*16467b97STreehugger Robot
302*16467b97STreehugger Robot    def default_rule
303*16467b97STreehugger Robot      @default_rule ||= rules.first
304*16467b97STreehugger Robot    end
305*16467b97STreehugger Robot
306*16467b97STreehugger Robot    def debug?
307*16467b97STreehugger Robot      return false
308*16467b97STreehugger Robot    end
309*16467b97STreehugger Robot
310*16467b97STreehugger Robot    def profile?
311*16467b97STreehugger Robot      return false
312*16467b97STreehugger Robot    end
313*16467b97STreehugger Robot
314*16467b97STreehugger Robot    def Scope( *declarations, &body )
315*16467b97STreehugger Robot      Scope.new( *declarations, &body )
316*16467b97STreehugger Robot    end
317*16467b97STreehugger Robot
318*16467b97STreehugger Robot    def token_class
319*16467b97STreehugger Robot      @token_class ||= begin
320*16467b97STreehugger Robot        self::Token            rescue
321*16467b97STreehugger Robot        superclass.token_class rescue
322*16467b97STreehugger Robot        ANTLR3::CommonToken
323*16467b97STreehugger Robot      end
324*16467b97STreehugger Robot    end
325*16467b97STreehugger Robot    private :generated_using
326*16467b97STreehugger Robot  end
327*16467b97STreehugger Robot
328*16467b97STreehugger Robot  @grammar_file_name = nil
329*16467b97STreehugger Robot  @antlr_version = ANTLR3::ANTLR_VERSION
330*16467b97STreehugger Robot  @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING
331*16467b97STreehugger Robot
332*16467b97STreehugger Robot  def grammar_file_name
333*16467b97STreehugger Robot    self.class.grammar_file_name
334*16467b97STreehugger Robot  end
335*16467b97STreehugger Robot
336*16467b97STreehugger Robot  def antlr_version
337*16467b97STreehugger Robot    self.class.antlr_version
338*16467b97STreehugger Robot  end
339*16467b97STreehugger Robot
340*16467b97STreehugger Robot  def antlr_version_string
341*16467b97STreehugger Robot    self.class.antlr_version_string
342*16467b97STreehugger Robot  end
343*16467b97STreehugger Robot
344*16467b97STreehugger Robot  attr_accessor :input
345*16467b97STreehugger Robot  attr_reader :state
346*16467b97STreehugger Robot
347*16467b97STreehugger Robot  def each_delegate
348*16467b97STreehugger Robot    block_given? or return enum_for( __method__ )
349*16467b97STreehugger Robot    for grammar in self.class.imported_grammars
350*16467b97STreehugger Robot      del = __send__( Util.snake_case( grammar ) ) and
351*16467b97STreehugger Robot        yield( del )
352*16467b97STreehugger Robot    end
353*16467b97STreehugger Robot  end
354*16467b97STreehugger Robot
355*16467b97STreehugger Robot  # Create a new recognizer. The constructor simply ensures that
356*16467b97STreehugger Robot  # all recognizers are initialized with a shared state object.
357*16467b97STreehugger Robot  # See the main recognizer subclasses for more specific
358*16467b97STreehugger Robot  # information about creating recognizer objects like
359*16467b97STreehugger Robot  # lexers and parsers.
360*16467b97STreehugger Robot  def initialize( options = {} )
361*16467b97STreehugger Robot    @state  = options[ :state ] || RecognizerSharedState.new
362*16467b97STreehugger Robot    @error_output = options.fetch( :error_output, $stderr )
363*16467b97STreehugger Robot    defined?( @input ) or @input = nil
364*16467b97STreehugger Robot    initialize_dfas
365*16467b97STreehugger Robot  end
366*16467b97STreehugger Robot
367*16467b97STreehugger Robot  # Resets the recognizer's state data to initial values.
368*16467b97STreehugger Robot  # As a result, all error tracking and error recovery
369*16467b97STreehugger Robot  # data accumulated in the current state will be cleared.
370*16467b97STreehugger Robot  # It will also attempt to reset the input stream
371*16467b97STreehugger Robot  # via input.reset, but it ignores any errors received
  # from doing so. Thus the input stream is not guaranteed
373*16467b97STreehugger Robot  # to be rewound to its initial position
374*16467b97STreehugger Robot  def reset
375*16467b97STreehugger Robot    @state and @state.reset!
376*16467b97STreehugger Robot    @input and @input.reset rescue nil
377*16467b97STreehugger Robot  end
378*16467b97STreehugger Robot
  # Attempt to match the current input symbol against the token type
380*16467b97STreehugger Robot  # specified by +type+. If the symbol matches the type,
381*16467b97STreehugger Robot  # consume the current symbol and return its value. If
382*16467b97STreehugger Robot  # the symbol doesn't match, attempt to use the follow-set
383*16467b97STreehugger Robot  # data provided by +follow+ to recover from the mismatched
384*16467b97STreehugger Robot  # token.
385*16467b97STreehugger Robot  def match( type, follow )
386*16467b97STreehugger Robot    matched_symbol = current_symbol
387*16467b97STreehugger Robot    if @input.peek == type
388*16467b97STreehugger Robot      @input.consume
389*16467b97STreehugger Robot      @state.error_recovery = false
390*16467b97STreehugger Robot      return matched_symbol
391*16467b97STreehugger Robot    end
392*16467b97STreehugger Robot    raise( BacktrackingFailed ) if @state.backtracking > 0
393*16467b97STreehugger Robot    return recover_from_mismatched_token( type, follow )
394*16467b97STreehugger Robot  end
395*16467b97STreehugger Robot
396*16467b97STreehugger Robot  # match anything -- i.e. wildcard match. Simply consume
397*16467b97STreehugger Robot  # the current symbol from the input stream.
398*16467b97STreehugger Robot  def match_any
399*16467b97STreehugger Robot    @state.error_recovery = false
400*16467b97STreehugger Robot    @input.consume
401*16467b97STreehugger Robot  end
402*16467b97STreehugger Robot
403*16467b97STreehugger Robot  ##############################################################################################
404*16467b97STreehugger Robot  ###################################### Error Reporting #######################################
405*16467b97STreehugger Robot  ##############################################################################################
406*16467b97STreehugger Robot  ##############################################################################################
407*16467b97STreehugger Robot
408*16467b97STreehugger Robot  # When a recognition error occurs, this method is the main
409*16467b97STreehugger Robot  # hook for carrying out the error reporting process. The
410*16467b97STreehugger Robot  # default implementation calls +display_recognition_error+
411*16467b97STreehugger Robot  # to display the error info on $stderr.
412*16467b97STreehugger Robot  def report_error( e = $! )
413*16467b97STreehugger Robot    @state.error_recovery and return
414*16467b97STreehugger Robot    @state.syntax_errors += 1
415*16467b97STreehugger Robot    @state.error_recovery = true
416*16467b97STreehugger Robot    display_recognition_error( e )
417*16467b97STreehugger Robot  end
418*16467b97STreehugger Robot
419*16467b97STreehugger Robot  # error reporting hook for presenting the information
420*16467b97STreehugger Robot  # The default implementation builds appropriate error
421*16467b97STreehugger Robot  # message text using +error_header+ and +error_message+,
422*16467b97STreehugger Robot  # and calls +emit_error_message+ to write the error
423*16467b97STreehugger Robot  # message out to some source
424*16467b97STreehugger Robot  def display_recognition_error( e = $! )
425*16467b97STreehugger Robot    header = error_header( e )
426*16467b97STreehugger Robot    message = error_message( e )
427*16467b97STreehugger Robot    emit_error_message( "#{ header } #{ message }" )
428*16467b97STreehugger Robot  end
429*16467b97STreehugger Robot
430*16467b97STreehugger Robot  # used to construct an appropriate error message
431*16467b97STreehugger Robot  # based on the specific type of error and the
432*16467b97STreehugger Robot  # error's attributes
433*16467b97STreehugger Robot  def error_message( e = $! )
434*16467b97STreehugger Robot    case e
435*16467b97STreehugger Robot    when UnwantedToken
436*16467b97STreehugger Robot      token_name = token_name( e.expecting )
437*16467b97STreehugger Robot      "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }"
438*16467b97STreehugger Robot    when MissingToken
439*16467b97STreehugger Robot      token_name = token_name( e.expecting )
440*16467b97STreehugger Robot      "missing #{ token_name } at #{ token_error_display( e.symbol ) }"
441*16467b97STreehugger Robot    when MismatchedToken
442*16467b97STreehugger Robot      token_name = token_name( e.expecting )
443*16467b97STreehugger Robot      "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }"
444*16467b97STreehugger Robot    when MismatchedTreeNode
445*16467b97STreehugger Robot      token_name = token_name( e.expecting )
446*16467b97STreehugger Robot      "mismatched tree node: #{ e.symbol } expecting #{ token_name }"
447*16467b97STreehugger Robot    when NoViableAlternative
448*16467b97STreehugger Robot      "no viable alternative at input " << token_error_display( e.symbol )
449*16467b97STreehugger Robot    when MismatchedSet
450*16467b97STreehugger Robot      "mismatched input %s expecting set %s" %
451*16467b97STreehugger Robot        [ token_error_display( e.symbol ), e.expecting.inspect ]
452*16467b97STreehugger Robot    when MismatchedNotSet
453*16467b97STreehugger Robot      "mismatched input %s expecting set %s" %
454*16467b97STreehugger Robot        [ token_error_display( e.symbol ), e.expecting.inspect ]
455*16467b97STreehugger Robot    when FailedPredicate
456*16467b97STreehugger Robot      "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ]
457*16467b97STreehugger Robot    else e.message
458*16467b97STreehugger Robot    end
459*16467b97STreehugger Robot  end
460*16467b97STreehugger Robot
461*16467b97STreehugger Robot  #
462*16467b97STreehugger Robot  # used to add a tag to the error message that indicates
463*16467b97STreehugger Robot  # the location of the input stream when the error
464*16467b97STreehugger Robot  # occurred
465*16467b97STreehugger Robot  #
466*16467b97STreehugger Robot  def error_header( e = $! )
467*16467b97STreehugger Robot    e.location
468*16467b97STreehugger Robot  end
469*16467b97STreehugger Robot
470*16467b97STreehugger Robot  #
471*16467b97STreehugger Robot  # formats a token object appropriately for inspection
472*16467b97STreehugger Robot  # within an error message
473*16467b97STreehugger Robot  #
474*16467b97STreehugger Robot  def token_error_display( token )
475*16467b97STreehugger Robot    unless text = token.text || ( token.source_text rescue nil )
476*16467b97STreehugger Robot      text =
477*16467b97STreehugger Robot        case
478*16467b97STreehugger Robot        when token.type == EOF then '<EOF>'
479*16467b97STreehugger Robot        when name = token_name( token.type ) rescue nil then "<#{ name }>"
480*16467b97STreehugger Robot        when token.respond_to?( :name ) then "<#{ token.name }>"
481*16467b97STreehugger Robot        else "<#{ token.type }>"
482*16467b97STreehugger Robot        end
483*16467b97STreehugger Robot    end
484*16467b97STreehugger Robot    return text.inspect
485*16467b97STreehugger Robot  end
486*16467b97STreehugger Robot
487*16467b97STreehugger Robot  #
488*16467b97STreehugger Robot  # Write the error report data out to some source. By default,
489*16467b97STreehugger Robot  # the error message is written to $stderr
490*16467b97STreehugger Robot  #
491*16467b97STreehugger Robot  def emit_error_message( message )
492*16467b97STreehugger Robot    @error_output.puts( message ) if @error_output
493*16467b97STreehugger Robot  end
494*16467b97STreehugger Robot
495*16467b97STreehugger Robot  ##############################################################################################
496*16467b97STreehugger Robot  ###################################### Error Recovery ########################################
497*16467b97STreehugger Robot  ##############################################################################################
498*16467b97STreehugger Robot
499*16467b97STreehugger Robot  def recover( error = $! )
500*16467b97STreehugger Robot    @state.last_error_index == @input.index and @input.consume
501*16467b97STreehugger Robot    @state.last_error_index = @input.index
502*16467b97STreehugger Robot
503*16467b97STreehugger Robot    follow_set = compute_error_recovery_set
504*16467b97STreehugger Robot
505*16467b97STreehugger Robot    resync { consume_until( follow_set ) }
506*16467b97STreehugger Robot  end
507*16467b97STreehugger Robot
508*16467b97STreehugger Robot  def resync
509*16467b97STreehugger Robot    begin_resync
510*16467b97STreehugger Robot    return( yield )
511*16467b97STreehugger Robot  ensure
512*16467b97STreehugger Robot    end_resync
513*16467b97STreehugger Robot  end
514*16467b97STreehugger Robot
515*16467b97STreehugger Robot  # overridable hook method that is executed at the start of the
516*16467b97STreehugger Robot  # resyncing procedure in recover
517*16467b97STreehugger Robot  #
518*16467b97STreehugger Robot  # by default, it does nothing
519*16467b97STreehugger Robot  def begin_resync
520*16467b97STreehugger Robot    # do nothing
521*16467b97STreehugger Robot  end
522*16467b97STreehugger Robot
  # overridable hook method that is executed after the resyncing procedure
  # has completed
524*16467b97STreehugger Robot  #
525*16467b97STreehugger Robot  # by default, it does nothing
526*16467b97STreehugger Robot  def end_resync
527*16467b97STreehugger Robot    # do nothing
528*16467b97STreehugger Robot  end
529*16467b97STreehugger Robot
530*16467b97STreehugger Robot  # (The following explanation has been lifted directly from the
531*16467b97STreehugger Robot  #  source code documentation of the ANTLR Java runtime library)
532*16467b97STreehugger Robot  #
533*16467b97STreehugger Robot  # Compute the error recovery set for the current rule.  During
534*16467b97STreehugger Robot  # rule invocation, the parser pushes the set of tokens that can
535*16467b97STreehugger Robot  # follow that rule reference on the stack; this amounts to
536*16467b97STreehugger Robot  # computing FIRST of what follows the rule reference in the
537*16467b97STreehugger Robot  # enclosing rule. This local follow set only includes tokens
538*16467b97STreehugger Robot  # from within the rule; i.e., the FIRST computation done by
539*16467b97STreehugger Robot  # ANTLR stops at the end of a rule.
540*16467b97STreehugger Robot  #
541*16467b97STreehugger Robot  # EXAMPLE
542*16467b97STreehugger Robot  #
543*16467b97STreehugger Robot  # When you find a "no viable alt exception", the input is not
544*16467b97STreehugger Robot  # consistent with any of the alternatives for rule r.  The best
545*16467b97STreehugger Robot  # thing to do is to consume tokens until you see something that
546*16467b97STreehugger Robot  # can legally follow a call to r *or* any rule that called r.
547*16467b97STreehugger Robot  # You don't want the exact set of viable next tokens because the
548*16467b97STreehugger Robot  # input might just be missing a token--you might consume the
549*16467b97STreehugger Robot  # rest of the input looking for one of the missing tokens.
550*16467b97STreehugger Robot  #
551*16467b97STreehugger Robot  # Consider grammar:
552*16467b97STreehugger Robot  #
553*16467b97STreehugger Robot  #   a : '[' b ']'
554*16467b97STreehugger Robot  #     | '(' b ')'
555*16467b97STreehugger Robot  #     ;
556*16467b97STreehugger Robot  #   b : c '^' INT ;
557*16467b97STreehugger Robot  #   c : ID
558*16467b97STreehugger Robot  #     | INT
559*16467b97STreehugger Robot  #     ;
560*16467b97STreehugger Robot  #
561*16467b97STreehugger Robot  # At each rule invocation, the set of tokens that could follow
562*16467b97STreehugger Robot  # that rule is pushed on a stack.  Here are the various "local"
563*16467b97STreehugger Robot  # follow sets:
564*16467b97STreehugger Robot  #
565*16467b97STreehugger Robot  #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
566*16467b97STreehugger Robot  #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
567*16467b97STreehugger Robot  #   FOLLOW( c_in_b ) = FIRST( '^' ) = '^'
568*16467b97STreehugger Robot  #
569*16467b97STreehugger Robot  # Upon erroneous input "[]", the call chain is
570*16467b97STreehugger Robot  #
571*16467b97STreehugger Robot  #   a -> b -> c
572*16467b97STreehugger Robot  #
573*16467b97STreehugger Robot  # and, hence, the follow context stack is:
574*16467b97STreehugger Robot  #
575*16467b97STreehugger Robot  #   depth  local follow set     after call to rule
576*16467b97STreehugger Robot  #     0         \<EOF>                   a (from main( ) )
577*16467b97STreehugger Robot  #     1          ']'                     b
578*16467b97STreehugger Robot  #     3          '^'                     c
579*16467b97STreehugger Robot  #
580*16467b97STreehugger Robot  # Notice that <tt>')'</tt> is not included, because b would have to have
581*16467b97STreehugger Robot  # been called from a different context in rule a for ')' to be
582*16467b97STreehugger Robot  # included.
583*16467b97STreehugger Robot  #
584*16467b97STreehugger Robot  # For error recovery, we cannot consider FOLLOW(c)
585*16467b97STreehugger Robot  # (context-sensitive or otherwise).  We need the combined set of
586*16467b97STreehugger Robot  # all context-sensitive FOLLOW sets--the set of all tokens that
587*16467b97STreehugger Robot  # could follow any reference in the call chain.  We need to
588*16467b97STreehugger Robot  # resync to one of those tokens.  Note that FOLLOW(c)='^' and if
589*16467b97STreehugger Robot  # we resync'd to that token, we'd consume until EOF.  We need to
590*16467b97STreehugger Robot  # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
591*16467b97STreehugger Robot  # In this case, for input "[]", LA(1) is in this set so we would
592*16467b97STreehugger Robot  # not consume anything and after printing an error rule c would
593*16467b97STreehugger Robot  # return normally.  It would not find the required '^' though.
594*16467b97STreehugger Robot  # At this point, it gets a mismatched token error and throws an
595*16467b97STreehugger Robot  # exception (since LA(1) is not in the viable following token
596*16467b97STreehugger Robot  # set).  The rule exception handler tries to recover, but finds
597*16467b97STreehugger Robot  # the same recovery set and doesn't consume anything.  Rule b
598*16467b97STreehugger Robot  # exits normally returning to rule a.  Now it finds the ']' (and
599*16467b97STreehugger Robot  # with the successful match exits errorRecovery mode).
600*16467b97STreehugger Robot  #
  # So, you can see that the parser walks up call chain looking
602*16467b97STreehugger Robot  # for the token that was a member of the recovery set.
603*16467b97STreehugger Robot  #
604*16467b97STreehugger Robot  # Errors are not generated in errorRecovery mode.
605*16467b97STreehugger Robot  #
606*16467b97STreehugger Robot  # ANTLR's error recovery mechanism is based upon original ideas:
607*16467b97STreehugger Robot  #
608*16467b97STreehugger Robot  # "Algorithms + Data Structures = Programs" by Niklaus Wirth
609*16467b97STreehugger Robot  #
610*16467b97STreehugger Robot  # and
611*16467b97STreehugger Robot  #
612*16467b97STreehugger Robot  # "A note on error recovery in recursive descent parsers":
613*16467b97STreehugger Robot  # http://portal.acm.org/citation.cfm?id=947902.947905
614*16467b97STreehugger Robot  #
615*16467b97STreehugger Robot  # Later, Josef Grosch had some good ideas:
616*16467b97STreehugger Robot  #
617*16467b97STreehugger Robot  # "Efficient and Comfortable Error Recovery in Recursive Descent
618*16467b97STreehugger Robot  # Parsers":
619*16467b97STreehugger Robot  # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
620*16467b97STreehugger Robot  #
621*16467b97STreehugger Robot  # Like Grosch I implemented local FOLLOW sets that are combined
622*16467b97STreehugger Robot  # at run-time upon error to avoid overhead during parsing.
623*16467b97STreehugger Robot  def compute_error_recovery_set
624*16467b97STreehugger Robot    combine_follows( false )
625*16467b97STreehugger Robot  end
626*16467b97STreehugger Robot
627*16467b97STreehugger Robot  def recover_from_mismatched_token( type, follow )
628*16467b97STreehugger Robot    if mismatch_is_unwanted_token?( type )
629*16467b97STreehugger Robot      err = UnwantedToken( type )
630*16467b97STreehugger Robot      resync { @input.consume }
631*16467b97STreehugger Robot      report_error( err )
632*16467b97STreehugger Robot
633*16467b97STreehugger Robot      return @input.consume
634*16467b97STreehugger Robot    end
635*16467b97STreehugger Robot
636*16467b97STreehugger Robot    if mismatch_is_missing_token?( follow )
637*16467b97STreehugger Robot      inserted = missing_symbol( nil, type, follow )
638*16467b97STreehugger Robot      report_error( MissingToken( type, inserted ) )
639*16467b97STreehugger Robot      return inserted
640*16467b97STreehugger Robot    end
641*16467b97STreehugger Robot
642*16467b97STreehugger Robot    raise MismatchedToken( type )
643*16467b97STreehugger Robot  end
644*16467b97STreehugger Robot
645*16467b97STreehugger Robot  def recover_from_mismatched_set( e, follow )
646*16467b97STreehugger Robot    if mismatch_is_missing_token?( follow )
647*16467b97STreehugger Robot      report_error( e )
648*16467b97STreehugger Robot      return missing_symbol( e, INVALID_TOKEN_TYPE, follow )
649*16467b97STreehugger Robot    end
650*16467b97STreehugger Robot    raise e
651*16467b97STreehugger Robot  end
652*16467b97STreehugger Robot
653*16467b97STreehugger Robot  def recover_from_mismatched_element( e, follow )
654*16467b97STreehugger Robot    follow.nil? and return false
655*16467b97STreehugger Robot    if follow.include?( EOR_TOKEN_TYPE )
656*16467b97STreehugger Robot      viable_tokens = compute_context_sensitive_rule_follow
657*16467b97STreehugger Robot      follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ]
658*16467b97STreehugger Robot    end
659*16467b97STreehugger Robot    if follow.include?( @input.peek )
660*16467b97STreehugger Robot      report_error( e )
661*16467b97STreehugger Robot      return true
662*16467b97STreehugger Robot    end
663*16467b97STreehugger Robot    return false
664*16467b97STreehugger Robot  end
665*16467b97STreehugger Robot
666*16467b97STreehugger Robot  # Conjure up a missing token during error recovery.
667*16467b97STreehugger Robot  #
668*16467b97STreehugger Robot  # The recognizer attempts to recover from single missing
669*16467b97STreehugger Robot  # symbols. But, actions might refer to that missing symbol.
670*16467b97STreehugger Robot  # For example, x=ID {f($x);}. The action clearly assumes
671*16467b97STreehugger Robot  # that there has been an identifier matched previously and that
672*16467b97STreehugger Robot  # $x points at that token. If that token is missing, but
673*16467b97STreehugger Robot  # the next token in the stream is what we want we assume that
674*16467b97STreehugger Robot  # this token is missing and we keep going. Because we
675*16467b97STreehugger Robot  # have to return some token to replace the missing token,
676*16467b97STreehugger Robot  # we have to conjure one up. This method gives the user control
677*16467b97STreehugger Robot  # over the tokens returned for missing tokens. Mostly,
678*16467b97STreehugger Robot  # you will want to create something special for identifier
679*16467b97STreehugger Robot  # tokens. For literals such as '{' and ',', the default
680*16467b97STreehugger Robot  # action in the parser or tree parser works. It simply creates
681*16467b97STreehugger Robot  # a CommonToken of the appropriate type. The text will be the token.
682*16467b97STreehugger Robot  # If you change what tokens must be created by the lexer,
683*16467b97STreehugger Robot  # override this method to create the appropriate tokens.
684*16467b97STreehugger Robot  def missing_symbol( error, expected_token_type, follow )
685*16467b97STreehugger Robot    return nil
686*16467b97STreehugger Robot  end
687*16467b97STreehugger Robot
688*16467b97STreehugger Robot  def mismatch_is_unwanted_token?( type )
689*16467b97STreehugger Robot    @input.peek( 2 ) == type
690*16467b97STreehugger Robot  end
691*16467b97STreehugger Robot
692*16467b97STreehugger Robot  def mismatch_is_missing_token?( follow )
693*16467b97STreehugger Robot    follow.nil? and return false
694*16467b97STreehugger Robot    if follow.include?( EOR_TOKEN_TYPE )
695*16467b97STreehugger Robot      viable_tokens = compute_context_sensitive_rule_follow
696*16467b97STreehugger Robot      follow = follow | viable_tokens
697*16467b97STreehugger Robot
698*16467b97STreehugger Robot      follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty?
699*16467b97STreehugger Robot    end
700*16467b97STreehugger Robot    if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE )
701*16467b97STreehugger Robot      return true
702*16467b97STreehugger Robot    end
703*16467b97STreehugger Robot    return false
704*16467b97STreehugger Robot  end
705*16467b97STreehugger Robot
706*16467b97STreehugger Robot  def syntax_errors?
707*16467b97STreehugger Robot    ( error_count = @state.syntax_errors ) > 0 and return( error_count )
708*16467b97STreehugger Robot  end
709*16467b97STreehugger Robot
  # NOTE(review): the remark below reads like a leftover describing a
  # token-mismatch hook rather than the method that follows it -- confirm
  # and relocate.
  #
  # factor out what to do upon token mismatch so
  # tree parsers can behave differently.
  #
  # * override this method in your parser to do things
  #   like bailing out after the first error
  # * just raise the exception instead of
  #   calling the recovery method.
  #
718*16467b97STreehugger Robot  def number_of_syntax_errors
719*16467b97STreehugger Robot    @state.syntax_errors
720*16467b97STreehugger Robot  end
721*16467b97STreehugger Robot
722*16467b97STreehugger Robot  #
723*16467b97STreehugger Robot  # Compute the context-sensitive +FOLLOW+ set for current rule.
724*16467b97STreehugger Robot  # This is set of token types that can follow a specific rule
725*16467b97STreehugger Robot  # reference given a specific call chain.  You get the set of
726*16467b97STreehugger Robot  # viable tokens that can possibly come next (look depth 1)
727*16467b97STreehugger Robot  # given the current call chain.  Contrast this with the
728*16467b97STreehugger Robot  # definition of plain FOLLOW for rule r:
729*16467b97STreehugger Robot  #
730*16467b97STreehugger Robot  #    FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
731*16467b97STreehugger Robot  #
732*16467b97STreehugger Robot  # where x in T* and alpha, beta in V*; T is set of terminals and
733*16467b97STreehugger Robot  # V is the set of terminals and nonterminals.  In other words,
734*16467b97STreehugger Robot  # FOLLOW(r) is the set of all tokens that can possibly follow
735*16467b97STreehugger Robot  # references to r in *any* sentential form (context).  At
736*16467b97STreehugger Robot  # runtime, however, we know precisely which context applies as
737*16467b97STreehugger Robot  # we have the call chain.  We may compute the exact (rather
738*16467b97STreehugger Robot  # than covering superset) set of following tokens.
739*16467b97STreehugger Robot  #
740*16467b97STreehugger Robot  # For example, consider grammar:
741*16467b97STreehugger Robot  #
742*16467b97STreehugger Robot  #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
743*16467b97STreehugger Robot  #        | "return" expr '.'
744*16467b97STreehugger Robot  #        ;
745*16467b97STreehugger Robot  #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
746*16467b97STreehugger Robot  #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
747*16467b97STreehugger Robot  #        | '(' expr ')'
748*16467b97STreehugger Robot  #        ;
749*16467b97STreehugger Robot  #
750*16467b97STreehugger Robot  # The FOLLOW sets are all inclusive whereas context-sensitive
751*16467b97STreehugger Robot  # FOLLOW sets are precisely what could follow a rule reference.
  # For input "i=(3);", here is the derivation:
753*16467b97STreehugger Robot  #
754*16467b97STreehugger Robot  #   stat => ID '=' expr ';'
755*16467b97STreehugger Robot  #        => ID '=' atom ('+' atom)* ';'
756*16467b97STreehugger Robot  #        => ID '=' '(' expr ')' ('+' atom)* ';'
757*16467b97STreehugger Robot  #        => ID '=' '(' atom ')' ('+' atom)* ';'
758*16467b97STreehugger Robot  #        => ID '=' '(' INT ')' ('+' atom)* ';'
759*16467b97STreehugger Robot  #        => ID '=' '(' INT ')' ';'
760*16467b97STreehugger Robot  #
761*16467b97STreehugger Robot  # At the "3" token, you'd have a call chain of
762*16467b97STreehugger Robot  #
763*16467b97STreehugger Robot  #   stat -> expr -> atom -> expr -> atom
764*16467b97STreehugger Robot  #
765*16467b97STreehugger Robot  # What can follow that specific nested ref to atom?  Exactly ')'
766*16467b97STreehugger Robot  # as you can see by looking at the derivation of this specific
767*16467b97STreehugger Robot  # input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
768*16467b97STreehugger Robot  #
769*16467b97STreehugger Robot  # You want the exact viable token set when recovering from a
770*16467b97STreehugger Robot  # token mismatch.  Upon token mismatch, if LA(1) is member of
771*16467b97STreehugger Robot  # the viable next token set, then you know there is most likely
772*16467b97STreehugger Robot  # a missing token in the input stream.  "Insert" one by just not
773*16467b97STreehugger Robot  # throwing an exception.
774*16467b97STreehugger Robot  #
775*16467b97STreehugger Robot  def compute_context_sensitive_rule_follow
776*16467b97STreehugger Robot    combine_follows true
777*16467b97STreehugger Robot  end
778*16467b97STreehugger Robot
779*16467b97STreehugger Robot  def combine_follows( exact )
780*16467b97STreehugger Robot    follow_set = Set.new
781*16467b97STreehugger Robot    @state.following.each_with_index.reverse_each do |local_follow_set, index|
782*16467b97STreehugger Robot      follow_set |= local_follow_set
783*16467b97STreehugger Robot      if exact
784*16467b97STreehugger Robot        if local_follow_set.include?( EOR_TOKEN_TYPE )
785*16467b97STreehugger Robot          follow_set.delete( EOR_TOKEN_TYPE ) if index > 0
786*16467b97STreehugger Robot        else
787*16467b97STreehugger Robot          break
788*16467b97STreehugger Robot        end
789*16467b97STreehugger Robot      end
790*16467b97STreehugger Robot    end
791*16467b97STreehugger Robot    return follow_set
792*16467b97STreehugger Robot  end
793*16467b97STreehugger Robot
794*16467b97STreehugger Robot  #
795*16467b97STreehugger Robot  # Match needs to return the current input symbol, which gets put
796*16467b97STreehugger Robot  # into the label for the associated token ref; e.g., x=ID.  Token
797*16467b97STreehugger Robot  # and tree parsers need to return different objects. Rather than test
798*16467b97STreehugger Robot  # for input stream type or change the IntStream interface, I use
799*16467b97STreehugger Robot  # a simple method to ask the recognizer to tell me what the current
800*16467b97STreehugger Robot  # input symbol is.
801*16467b97STreehugger Robot  #
802*16467b97STreehugger Robot  # This is ignored for lexers.
803*16467b97STreehugger Robot  #
804*16467b97STreehugger Robot  def current_symbol
805*16467b97STreehugger Robot    @input.look
806*16467b97STreehugger Robot  end
807*16467b97STreehugger Robot
808*16467b97STreehugger Robot  #
809*16467b97STreehugger Robot  # Consume input symbols until one matches a type within types
810*16467b97STreehugger Robot  #
811*16467b97STreehugger Robot  # types can be a single symbol type or a set of symbol types
812*16467b97STreehugger Robot  #
813*16467b97STreehugger Robot  def consume_until( types )
814*16467b97STreehugger Robot    types.is_a?( Set ) or types = Set[ *types ]
815*16467b97STreehugger Robot    type = @input.peek
816*16467b97STreehugger Robot    until type == EOF or types.include?( type )
817*16467b97STreehugger Robot      @input.consume
818*16467b97STreehugger Robot      type = @input.peek
819*16467b97STreehugger Robot    end
820*16467b97STreehugger Robot    return( type )
821*16467b97STreehugger Robot  end
822*16467b97STreehugger Robot
823*16467b97STreehugger Robot  #
824*16467b97STreehugger Robot  # Returns true if the recognizer is currently in a decision for which
825*16467b97STreehugger Robot  # backtracking has been enabled
826*16467b97STreehugger Robot  #
827*16467b97STreehugger Robot  def backtracking?
828*16467b97STreehugger Robot    @state.backtracking > 0
829*16467b97STreehugger Robot  end
830*16467b97STreehugger Robot
831*16467b97STreehugger Robot  def backtracking_level
832*16467b97STreehugger Robot    @state.backtracking
833*16467b97STreehugger Robot  end
834*16467b97STreehugger Robot
  # Overwrites the backtracking depth stored in @state.backtracking with +n+.
  def backtracking_level=( n )
    @state.backtracking = n
  end
838*16467b97STreehugger Robot
839*16467b97STreehugger Robot  def backtrack
840*16467b97STreehugger Robot    @state.backtracking += 1
841*16467b97STreehugger Robot    start = @input.mark
842*16467b97STreehugger Robot    success =
843*16467b97STreehugger Robot      begin yield
844*16467b97STreehugger Robot      rescue BacktrackingFailed then false
845*16467b97STreehugger Robot      else true
846*16467b97STreehugger Robot      end
847*16467b97STreehugger Robot    return success
848*16467b97STreehugger Robot  ensure
849*16467b97STreehugger Robot    @input.rewind( start )
850*16467b97STreehugger Robot    @state.backtracking -= 1
851*16467b97STreehugger Robot  end
852*16467b97STreehugger Robot
853*16467b97STreehugger Robot  def syntactic_predicate?( name )
854*16467b97STreehugger Robot    backtrack { send name }
855*16467b97STreehugger Robot  end
856*16467b97STreehugger Robot
  # shorter names for the depth reader and writer: #backtracking is
  # #backtracking_level and #backtracking= is #backtracking_level=
  alias backtracking backtracking_level
  alias backtracking= backtracking_level=
859*16467b97STreehugger Robot
860*16467b97STreehugger Robot  def rule_memoization( rule, start_index )
861*16467b97STreehugger Robot    @state.rule_memory.fetch( rule ) do
862*16467b97STreehugger Robot      @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN )
863*16467b97STreehugger Robot    end[ start_index ]
864*16467b97STreehugger Robot  end
865*16467b97STreehugger Robot
866*16467b97STreehugger Robot  def already_parsed_rule?( rule )
867*16467b97STreehugger Robot    stop_index = rule_memoization( rule, @input.index )
868*16467b97STreehugger Robot    case stop_index
869*16467b97STreehugger Robot    when MEMO_RULE_UNKNOWN then return false
870*16467b97STreehugger Robot    when MEMO_RULE_FAILED
871*16467b97STreehugger Robot      raise BacktrackingFailed
872*16467b97STreehugger Robot    else
873*16467b97STreehugger Robot      @input.seek( stop_index + 1 )
874*16467b97STreehugger Robot    end
875*16467b97STreehugger Robot    return true
876*16467b97STreehugger Robot  end
877*16467b97STreehugger Robot
878*16467b97STreehugger Robot  def memoize( rule, start_index, success )
879*16467b97STreehugger Robot    stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED
880*16467b97STreehugger Robot    memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index
881*16467b97STreehugger Robot  end
882*16467b97STreehugger Robot
883*16467b97STreehugger Robot  def trace_in( rule_name, rule_index, input_symbol )
884*16467b97STreehugger Robot    @error_output.printf( "--> enter %s on %s", rule_name, input_symbol )
885*16467b97STreehugger Robot    @state.backtracking > 0 and @error_output.printf(
886*16467b97STreehugger Robot      " (in backtracking mode: depth = %s)", @state.backtracking
887*16467b97STreehugger Robot    )
888*16467b97STreehugger Robot    @error_output.print( "\n" )
889*16467b97STreehugger Robot  end
890*16467b97STreehugger Robot
891*16467b97STreehugger Robot  def trace_out( rule_name, rule_index, input_symbol )
892*16467b97STreehugger Robot    @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol )
893*16467b97STreehugger Robot    @state.backtracking > 0 and @error_output.printf(
894*16467b97STreehugger Robot      " (in backtracking mode: depth = %s)", @state.backtracking
895*16467b97STreehugger Robot    )
896*16467b97STreehugger Robot    @error_output.print( "\n" )
897*16467b97STreehugger Robot  end
898*16467b97STreehugger Robot
899*16467b97STreehugger Robotprivate
900*16467b97STreehugger Robot
901*16467b97STreehugger Robot  def initialize_dfas
902*16467b97STreehugger Robot    # do nothing
903*16467b97STreehugger Robot  end
904*16467b97STreehugger Robotend
905*16467b97STreehugger Robot
906*16467b97STreehugger Robot
# constant alias for compatibility with older versions of the
# runtime library: BaseRecognizer and Recognizer name the same class
BaseRecognizer = Recognizer
910*16467b97STreehugger Robot
911*16467b97STreehugger Robot=begin rdoc ANTLR3::Lexer
912*16467b97STreehugger Robot
913*16467b97STreehugger Robot= Lexer
914*16467b97STreehugger Robot
915*16467b97STreehugger RobotLexer is the default superclass of all lexers generated by ANTLR. The class
916*16467b97STreehugger Robottailors the core functionality provided by Recognizer to the task of
917*16467b97STreehugger Robotmatching patterns in the text input and breaking the input into tokens.
918*16467b97STreehugger Robot
919*16467b97STreehugger Robot== About Lexers
920*16467b97STreehugger Robot
921*16467b97STreehugger RobotA lexer's job is to take input text and break it up into _tokens_ -- objects
922*16467b97STreehugger Robotthat encapsulate a piece of text, a type label (such as ID or INTEGER), and the
923*16467b97STreehugger Robotposition of the text with respect to the input. Thus, a lexer is essentially a
924*16467b97STreehugger Robotcomplicated iterator that steps through an input stream and produces a sequence
925*16467b97STreehugger Robotof tokens. Sometimes lexers are enough to carry out a goal on their own, such as
926*16467b97STreehugger Robottasks like source code highlighting and simple code analysis. Usually, however,
927*16467b97STreehugger Robotthe lexer converts text into tokens for use by a parser, which recognizes larger
928*16467b97STreehugger Robotstructures within the text.
929*16467b97STreehugger Robot
930*16467b97STreehugger RobotANTLR parsers have a variety of entry points specified by parser rules, each of
931*16467b97STreehugger Robotwhich defines the structure of a specific type of sentence in a grammar. Lexers,
932*16467b97STreehugger Robothowever, are primarily intended to have a single entry point. It looks at the
933*16467b97STreehugger Robotcharacters starting at the current input position, decides if the chunk of text
934*16467b97STreehugger Robotmatches one of a number of possible token type definitions, wraps the chunk into
935*16467b97STreehugger Robota token with information on its type and location, and advances the input stream
936*16467b97STreehugger Robotto the next place.
937*16467b97STreehugger Robot
938*16467b97STreehugger Robot== ANTLR Lexers and the Lexer API
939*16467b97STreehugger Robot
940*16467b97STreehugger RobotANTLR-generated lexers will subclass this class, unless specified otherwise
941*16467b97STreehugger Robotwithin a grammar file. The generated class will provide an implementation of
942*16467b97STreehugger Roboteach lexer rule as a method of the same name. The subclass will also provide an
943*16467b97STreehugger Robotimplementation for the abstract method #m_tokens, the purpose of which is to
944*16467b97STreehugger Robotmultiplex the token type definitions and predict what rule definition to execute
945*16467b97STreehugger Robotto fetch a token. The primary method in the lexer API, #next_token, uses
946*16467b97STreehugger Robot#m_tokens to fetch the next token and drive the iteration.
947*16467b97STreehugger Robot
948*16467b97STreehugger RobotIf the lexer is preparing tokens for use by an ANTLR generated parser, the lexer
949*16467b97STreehugger Robotwill generally be used to build a TokenStream object. The following code example
950*16467b97STreehugger Robotdemonstrates the typical setup for using ANTLR parsers and lexers in Ruby.
951*16467b97STreehugger Robot
952*16467b97STreehugger Robot  # in HypotheticalLexer.rb
953*16467b97STreehugger Robot  module Hypothetical
954*16467b97STreehugger Robot  class Lexer < ANTLR3::Lexer
955*16467b97STreehugger Robot    # ...
956*16467b97STreehugger Robot    # ANTLR generated code
957*16467b97STreehugger Robot    # ...
958*16467b97STreehugger Robot  end
959*16467b97STreehugger Robot  end
960*16467b97STreehugger Robot
961*16467b97STreehugger Robot  # in HypotheticalParser.rb
962*16467b97STreehugger Robot  module Hypothetical
963*16467b97STreehugger Robot  class Parser < ANTLR3::Parser
964*16467b97STreehugger Robot    # ...
965*16467b97STreehugger Robot    # more ANTLR generated code
966*16467b97STreehugger Robot    # ...
967*16467b97STreehugger Robot  end
968*16467b97STreehugger Robot  end
969*16467b97STreehugger Robot
970*16467b97STreehugger Robot  # to take hypothetical source code and prepare it for parsing,
971*16467b97STreehugger Robot  # there is generally a four-step construction process
972*16467b97STreehugger Robot
973*16467b97STreehugger Robot  source = "some hypothetical source code"
974*16467b97STreehugger Robot  input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp')
975*16467b97STreehugger Robot  lexer = Hypothetical::Lexer.new( input )
976*16467b97STreehugger Robot  tokens = ANTLR3::CommonTokenStream.new( lexer )
977*16467b97STreehugger Robot  parser = Hypothetical::Parser.new( tokens )
978*16467b97STreehugger Robot
979*16467b97STreehugger Robot  # if you're using the standard streams, ANTLR3::StringStream and
980*16467b97STreehugger Robot  # ANTLR3::CommonTokenStream, you can write the same process
981*16467b97STreehugger Robot  # shown above more succinctly:
982*16467b97STreehugger Robot
983*16467b97STreehugger Robot  lexer  = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp')
984*16467b97STreehugger Robot  parser = Hypothetical::Parser.new( lexer )
985*16467b97STreehugger Robot
986*16467b97STreehugger Robot=end
987*16467b97STreehugger Robotclass Lexer < Recognizer
  # mix the TokenSource module into every lexer
  include TokenSource
  # class-level instance variable set to CommonToken -- presumably the class
  # used when the lexer builds tokens; confirm against the code that reads
  # @token_class
  @token_class = CommonToken
990*16467b97STreehugger Robot
991*16467b97STreehugger Robot  def self.default_rule
992*16467b97STreehugger Robot    @default_rule ||= :token!
993*16467b97STreehugger Robot  end
994*16467b97STreehugger Robot
995*16467b97STreehugger Robot  def self.main( argv = ARGV, options = {} )
996*16467b97STreehugger Robot    if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end
997*16467b97STreehugger Robot    main = ANTLR3::Main::LexerMain.new( self, options )
998*16467b97STreehugger Robot    block_given? ? yield( main ) : main.execute( argv )
999*16467b97STreehugger Robot  end
1000*16467b97STreehugger Robot
1001*16467b97STreehugger Robot  def self.associated_parser
1002*16467b97STreehugger Robot    @associated_parser ||= begin
1003*16467b97STreehugger Robot      @grammar_home and @grammar_home::Parser
1004*16467b97STreehugger Robot    rescue NameError
1005*16467b97STreehugger Robot      grammar_name = @grammar_home.name.split( "::" ).last
1006*16467b97STreehugger Robot      begin
1007*16467b97STreehugger Robot        require "#{ grammar_name }Parser"
1008*16467b97STreehugger Robot        @grammar_home::Parser
1009*16467b97STreehugger Robot      rescue LoadError, NameError
1010*16467b97STreehugger Robot      end
1011*16467b97STreehugger Robot    end
1012*16467b97STreehugger Robot  end
1013*16467b97STreehugger Robot
1014*16467b97STreehugger Robot  def initialize( input, options = {} )
1015*16467b97STreehugger Robot    super( options )
1016*16467b97STreehugger Robot    @input = cast_input( input, options )
1017*16467b97STreehugger Robot  end
1018*16467b97STreehugger Robot
1019*16467b97STreehugger Robot  def current_symbol
1020*16467b97STreehugger Robot    nil
1021*16467b97STreehugger Robot  end
1022*16467b97STreehugger Robot
  # Produces the next token from the character stream.
  #
  # Every pass through the loop clears the per-token fields of @state,
  # records where the token starts, and -- unless the input is at EOF, in
  # which case EOF_TOKEN is returned -- invokes the token! rule.
  #
  # Recognition errors are reported and the loop simply tries again from
  # wherever the input now stands.
  def next_token
    loop do
      # forget the previous token and remember where the next one begins
      @state.token = nil
      @state.channel = DEFAULT_CHANNEL
      @state.token_start_position = @input.index
      @state.token_start_column = @input.column
      @state.token_start_line = @input.line
      @state.text = nil
      @input.peek == EOF and return EOF_TOKEN
      begin
        token!

        case token = @state.token
        # the rule left no token behind: build one via #emit
        when nil then return( emit )
        # the rule asked to skip (see #skip): start over with the next token
        when SKIP_TOKEN then next
        else
          return token
        end
      rescue NoViableAlternative => re
        # report, then let #recover resynchronize the input before retrying
        report_error( re )
        recover( re )
      rescue Error::RecognitionError => re
        # report only; #match calls #recover itself before raising
        report_error( re )
      end
    end
  end
1049*16467b97STreehugger Robot
1050*16467b97STreehugger Robot  def skip
1051*16467b97STreehugger Robot    @state.token = SKIP_TOKEN
1052*16467b97STreehugger Robot  end
1053*16467b97STreehugger Robot
1054*16467b97STreehugger Robot  abstract :token!
1055*16467b97STreehugger Robot
1056*16467b97STreehugger Robot  def exhaust
1057*16467b97STreehugger Robot    self.to_a
1058*16467b97STreehugger Robot  end
1059*16467b97STreehugger Robot
1060*16467b97STreehugger Robot  def char_stream=( input )
1061*16467b97STreehugger Robot    @input = nil
1062*16467b97STreehugger Robot    reset()
1063*16467b97STreehugger Robot    @input = input
1064*16467b97STreehugger Robot  end
1065*16467b97STreehugger Robot
1066*16467b97STreehugger Robot  def source_name
1067*16467b97STreehugger Robot    @input.source_name
1068*16467b97STreehugger Robot  end
1069*16467b97STreehugger Robot
1070*16467b97STreehugger Robot  def emit( token = @state.token )
1071*16467b97STreehugger Robot    token ||= create_token
1072*16467b97STreehugger Robot    @state.token = token
1073*16467b97STreehugger Robot    return token
1074*16467b97STreehugger Robot  end
1075*16467b97STreehugger Robot
1076*16467b97STreehugger Robot  def match( expected )
1077*16467b97STreehugger Robot    case expected
1078*16467b97STreehugger Robot    when String
1079*16467b97STreehugger Robot      expected.each_byte do |char|
1080*16467b97STreehugger Robot        unless @input.peek == char
1081*16467b97STreehugger Robot          @state.backtracking > 0 and raise BacktrackingFailed
1082*16467b97STreehugger Robot          error = MismatchedToken( char )
1083*16467b97STreehugger Robot          recover( error )
1084*16467b97STreehugger Robot          raise error
1085*16467b97STreehugger Robot        end
1086*16467b97STreehugger Robot        @input.consume()
1087*16467b97STreehugger Robot      end
1088*16467b97STreehugger Robot    else # single integer character
1089*16467b97STreehugger Robot      unless @input.peek == expected
1090*16467b97STreehugger Robot        @state.backtracking > 0 and raise BacktrackingFailed
1091*16467b97STreehugger Robot        error = MismatchedToken( expected )
1092*16467b97STreehugger Robot        recover( error )
1093*16467b97STreehugger Robot        raise error
1094*16467b97STreehugger Robot      end
1095*16467b97STreehugger Robot      @input.consume
1096*16467b97STreehugger Robot    end
1097*16467b97STreehugger Robot    return true
1098*16467b97STreehugger Robot  end
1099*16467b97STreehugger Robot
1100*16467b97STreehugger Robot  def match_any
1101*16467b97STreehugger Robot    @input.consume
1102*16467b97STreehugger Robot  end
1103*16467b97STreehugger Robot
1104*16467b97STreehugger Robot  def match_range( min, max )
1105*16467b97STreehugger Robot    char = @input.peek
1106*16467b97STreehugger Robot    if char.between?( min, max ) then @input.consume
1107*16467b97STreehugger Robot    else
1108*16467b97STreehugger Robot      @state.backtracking > 0 and raise BacktrackingFailed
1109*16467b97STreehugger Robot      error = MismatchedRange( min.chr, max.chr )
1110*16467b97STreehugger Robot      recover( error )
1111*16467b97STreehugger Robot      raise( error )
1112*16467b97STreehugger Robot    end
1113*16467b97STreehugger Robot    return true
1114*16467b97STreehugger Robot  end
1115*16467b97STreehugger Robot
1116*16467b97STreehugger Robot  def line
1117*16467b97STreehugger Robot    @input.line
1118*16467b97STreehugger Robot  end
1119*16467b97STreehugger Robot
1120*16467b97STreehugger Robot  def column
1121*16467b97STreehugger Robot    @input.column
1122*16467b97STreehugger Robot  end
1123*16467b97STreehugger Robot
1124*16467b97STreehugger Robot  def character_index
1125*16467b97STreehugger Robot    @input.index
1126*16467b97STreehugger Robot  end
1127*16467b97STreehugger Robot
1128*16467b97STreehugger Robot  def text
1129*16467b97STreehugger Robot    @state.text and return @state.text
1130*16467b97STreehugger Robot    @input.substring( @state.token_start_position, character_index - 1 )
1131*16467b97STreehugger Robot  end
1132*16467b97STreehugger Robot
1133*16467b97STreehugger Robot  def text=( text )
1134*16467b97STreehugger Robot    @state.text = text
1135*16467b97STreehugger Robot  end
1136*16467b97STreehugger Robot
1137*16467b97STreehugger Robot  def report_error( e )
1138*16467b97STreehugger Robot    display_recognition_error( e )
1139*16467b97STreehugger Robot  end
1140*16467b97STreehugger Robot
1141*16467b97STreehugger Robot  def error_message( e )
1142*16467b97STreehugger Robot    char = character_error_display( e.symbol ) rescue nil
1143*16467b97STreehugger Robot    case e
1144*16467b97STreehugger Robot    when Error::MismatchedToken
1145*16467b97STreehugger Robot      expecting = character_error_display( e.expecting )
1146*16467b97STreehugger Robot      "mismatched character #{ char }; expecting #{ expecting }"
1147*16467b97STreehugger Robot    when Error::NoViableAlternative
1148*16467b97STreehugger Robot      "no viable alternative at character #{ char }"
1149*16467b97STreehugger Robot    when Error::EarlyExit
1150*16467b97STreehugger Robot      "required ( ... )+ loop did not match anything at character #{ char }"
1151*16467b97STreehugger Robot    when Error::MismatchedNotSet
1152*16467b97STreehugger Robot      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
1153*16467b97STreehugger Robot    when Error::MismatchedSet
1154*16467b97STreehugger Robot      "mismatched character %s; expecting set %p" % [ char, e.expecting ]
1155*16467b97STreehugger Robot    when Error::MismatchedRange
1156*16467b97STreehugger Robot      a = character_error_display( e.min )
1157*16467b97STreehugger Robot      b = character_error_display( e.max )
1158*16467b97STreehugger Robot      "mismatched character %s; expecting set %s..%s" % [ char, a, b ]
1159*16467b97STreehugger Robot    else super
1160*16467b97STreehugger Robot    end
1161*16467b97STreehugger Robot  end
1162*16467b97STreehugger Robot
1163*16467b97STreehugger Robot  def character_error_display( char )
1164*16467b97STreehugger Robot    case char
1165*16467b97STreehugger Robot    when EOF then '<EOF>'
1166*16467b97STreehugger Robot    when Integer then char.chr.inspect
1167*16467b97STreehugger Robot    else char.inspect
1168*16467b97STreehugger Robot    end
1169*16467b97STreehugger Robot  end
1170*16467b97STreehugger Robot
1171*16467b97STreehugger Robot  def recover( re )
1172*16467b97STreehugger Robot    @input.consume
1173*16467b97STreehugger Robot  end
1174*16467b97STreehugger Robot
1175*16467b97STreehugger Robot  alias input= char_stream=
1176*16467b97STreehugger Robot
1177*16467b97STreehugger Robotprivate
1178*16467b97STreehugger Robot
1179*16467b97STreehugger Robot  def cast_input( input, options )
1180*16467b97STreehugger Robot    case input
1181*16467b97STreehugger Robot    when CharacterStream then input
1182*16467b97STreehugger Robot    when ::String then StringStream.new( input, options )
1183*16467b97STreehugger Robot    when ::IO, ARGF then FileStream.new( input, options )
1184*16467b97STreehugger Robot    else input
1185*16467b97STreehugger Robot    end
1186*16467b97STreehugger Robot  end
1187*16467b97STreehugger Robot
1188*16467b97STreehugger Robot  def trace_in( rule_name, rule_index )
1189*16467b97STreehugger Robot    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
1190*16467b97STreehugger Robot    else symbol = '<EOF>' end
1191*16467b97STreehugger Robot    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
1192*16467b97STreehugger Robot    super( rule_name, rule_index, input_symbol )
1193*16467b97STreehugger Robot  end
1194*16467b97STreehugger Robot
1195*16467b97STreehugger Robot  def trace_out( rule_name, rule_index )
1196*16467b97STreehugger Robot    if symbol = @input.look and symbol != EOF then symbol = symbol.inspect
1197*16467b97STreehugger Robot    else symbol = '<EOF>' end
1198*16467b97STreehugger Robot    input_symbol = "#{ symbol } @ line #{ line } / col #{ column }"
1199*16467b97STreehugger Robot    super( rule_name, rule_index, input_symbol )
1200*16467b97STreehugger Robot  end
1201*16467b97STreehugger Robot
1202*16467b97STreehugger Robot  def create_token( &b )
1203*16467b97STreehugger Robot    if block_given? then super( &b )
1204*16467b97STreehugger Robot    else
1205*16467b97STreehugger Robot      super do |t|
1206*16467b97STreehugger Robot        t.input = @input
1207*16467b97STreehugger Robot        t.type = @state.type
1208*16467b97STreehugger Robot        t.channel = @state.channel
1209*16467b97STreehugger Robot        t.start = @state.token_start_position
1210*16467b97STreehugger Robot        t.stop = @input.index - 1
1211*16467b97STreehugger Robot        t.line = @state.token_start_line
1212*16467b97STreehugger Robot        t.text = self.text
1213*16467b97STreehugger Robot        t.column = @state.token_start_column
1214*16467b97STreehugger Robot      end
1215*16467b97STreehugger Robot    end
1216*16467b97STreehugger Robot  end
1217*16467b97STreehugger Robotend
1218*16467b97STreehugger Robot
1219*16467b97STreehugger Robot
1220*16467b97STreehugger Robot=begin rdoc ANTLR3::Parser
1221*16467b97STreehugger Robot
1222*16467b97STreehugger Robot= Parser
1223*16467b97STreehugger Robot
1224*16467b97STreehugger RobotParser is the default base class of ANTLR-generated parser classes. The class
1225*16467b97STreehugger Robottailors the functionality provided by Recognizer to the task of parsing.
1226*16467b97STreehugger Robot
1227*16467b97STreehugger Robot== About Parsing
1228*16467b97STreehugger Robot
This is just a loose overview of parsing. For considerably more in-depth coverage
1230*16467b97STreehugger Robotof the topic, read the ANTLR documentation or check out the ANTLR website
1231*16467b97STreehugger Robot(http://www.antlr.org).
1232*16467b97STreehugger Robot
1233*16467b97STreehugger RobotA grammar defines the vocabulary and the sentence structure of a language. While
1234*16467b97STreehugger Robota lexer concerns the basic vocabulary symbols of the language, a parser's
1235*16467b97STreehugger Robotprimary task is to implement the sentence structure.
1236*16467b97STreehugger Robot
1237*16467b97STreehugger RobotParsers are set up by providing a stream of tokens, which is usually created by
1238*16467b97STreehugger Robota corresponding lexer. Then, the user requests a specific sentence-structure
1239*16467b97STreehugger Robotwithin the grammar, such as "class_definition" or "xml_node", from the parser.
1240*16467b97STreehugger RobotIt iterates through the tokens, verifying the syntax of the sentence and
1241*16467b97STreehugger Robotperforming actions specified by the grammar. It stops when it encounters an
1242*16467b97STreehugger Roboterror or when it has matched the full sentence according to its defined
1243*16467b97STreehugger Robotstructure.
1244*16467b97STreehugger Robot
1245*16467b97STreehugger Robot== ANTLR Parsers and the Parser API
1246*16467b97STreehugger Robot
1247*16467b97STreehugger RobotPlain ANTLR-generated parsers directly subclass this class, unless specified
1248*16467b97STreehugger Robototherwise within the grammar options. The generated code will provide a method
1249*16467b97STreehugger Robotfor each parser rule defined in the ANTLR grammar, as well as any other
1250*16467b97STreehugger Robotcustomized member attributes and methods specified in the source grammar.
1251*16467b97STreehugger Robot
1252*16467b97STreehugger RobotThis class does not override much of the functionality in Recognizer, and
1253*16467b97STreehugger Robotthus the API closely mirrors Recognizer.
1254*16467b97STreehugger Robot
1255*16467b97STreehugger Robot=end
class Parser < Recognizer
  # Command-line style entry point for a parser class.
  #
  # May be called as main( argv, options ) or main( options ): when the
  # first argument is a Hash it is taken as the options and ARGV supplies
  # the arguments. An ANTLR3::Main::ParserMain is built for this class; it
  # is yielded to the block if one is given, otherwise executed with argv.
  def self.main( argv = ARGV, options = {} )
    argv, options = ARGV, argv if argv.is_a?( ::Hash )
    runner = ANTLR3::Main::ParserMain.new( self, options )
    return yield( runner ) if block_given?
    runner.execute( argv )
  end

  # Looks up (and memoizes) the Lexer constant that lives in the same
  # grammar module (@grammar_home) as this parser.
  #
  # If the constant is not defined yet, a file named "<Grammar>Lexer" is
  # required and the lookup is retried; when that fails as well the result
  # is nil. Also nil when no grammar home is set.
  def self.associated_lexer
    @associated_lexer ||= begin
      @grammar_home && @grammar_home::Lexer
    rescue NameError
      grammar_name = @grammar_home.name.split( "::" ).last
      begin
        require "#{ grammar_name }Lexer"
        @grammar_home::Lexer
      rescue LoadError, NameError
        nil
      end
    end
  end


  # Builds a parser reading from +input+, which is normalized by cast_input
  # after the superclass has been initialized with +options+. reset is run
  # while @input is still nil.
  def initialize( input, options = {} )
    super( options )
    @input = nil
    reset()
    @input = cast_input( input, options )
  end

  # Fabricates a stand-in token of type +expected_type+.
  #
  # The stand-in is, in order of preference: a clone of the look-ahead
  # token (or of the previous token when the look-ahead is EOF_TOKEN), a
  # new instance of the stream's token class, or a token from create_token
  # (falling back to CommonToken.new if that raises). Its text becomes
  # "<missing NAME>" and it is placed on the default channel.
  # +error+ and +follow+ are not used by this implementation.
  def missing_symbol( error, expected_type, follow )
    current = @input.look
    current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN

    t =
      if current && current != ANTLR3::EOF_TOKEN
        current.clone
      elsif @input.token_class
        @input.token_class.new
      else
        begin
          create_token
        rescue StandardError
          CommonToken.new
        end
      end

    t.type = expected_type
    # drop the angle brackets around the token name, if present
    name = t.name.gsub( /(^<)|(>$)/,'' )
    t.text = "<missing #{ name }>"
    t.channel = DEFAULT_CHANNEL
    t
  end

  # Replaces the token stream; reset is run while @input is nil, and the
  # new stream is installed afterwards.
  def token_stream=( input )
    @input = nil
    reset()
    @input = input
  end
  # token_stream is a second name for the +input+ method
  alias token_stream input

  # Name of the input source, as reported by the token stream.
  def source_name
    @input.source_name
  end


private

  # Rule-entry tracing hook: passes the inspected look-ahead token to the
  # superclass implementation.
  def trace_in( rule_name, rule_index )
    upcoming = @input.look.inspect
    super( rule_name, rule_index, upcoming )
  end

  # Rule-exit tracing hook: passes the inspected look-ahead token to the
  # superclass implementation.
  def trace_out( rule_name, rule_index )
    upcoming = @input.look.inspect
    super( rule_name, rule_index, upcoming )
  end

  # Normalizes the +input+ argument given to the constructor.
  #
  # * a TokenStream is used as it is
  # * a TokenSource is wrapped in a CommonTokenStream
  # * an IO, String or CharacterStream is run through the associated lexer
  #   class, whose output is wrapped in a CommonTokenStream; ArgumentError
  #   is raised when no associated lexer can be found
  # * any other object must respond to both peek and consume, otherwise
  #   ArgumentError is raised
  def cast_input( input, options )
    case input
    when TokenStream
      input
    when TokenSource
      CommonTokenStream.new( input, options )
    when IO, String, CharacterStream
      lexer_class = self.class.associated_lexer
      unless lexer_class
        raise ArgumentError, Util.tidy( <<-END, true )
        | unable to automatically convert input #{ input.inspect }
        | to a ANTLR3::TokenStream object as #{ self.class }
        | does not appear to have an associated lexer class
        END
      end
      CommonTokenStream.new( lexer_class.new( input, options ), options )
    else
      # assume it's a stream if it at least implements peek and consume
      stream_like = input.respond_to?( :peek ) && input.respond_to?( :consume )
      unless stream_like
        raise ArgumentError, Util.tidy( <<-END, true )
        | #{ self.class } requires a token stream as input, but
        | #{ input.inspect } was provided
        END
      end
      input
    end
  end

end
1350*16467b97STreehugger Robot
1351*16467b97STreehugger Robotend
1352