1*16467b97STreehugger Robot#!/usr/bin/ruby 2*16467b97STreehugger Robot# encoding: utf-8 3*16467b97STreehugger Robot 4*16467b97STreehugger Robot=begin LICENSE 5*16467b97STreehugger Robot 6*16467b97STreehugger Robot[The "BSD licence"] 7*16467b97STreehugger RobotCopyright (c) 2009-2010 Kyle Yetter 8*16467b97STreehugger RobotAll rights reserved. 9*16467b97STreehugger Robot 10*16467b97STreehugger RobotRedistribution and use in source and binary forms, with or without 11*16467b97STreehugger Robotmodification, are permitted provided that the following conditions 12*16467b97STreehugger Robotare met: 13*16467b97STreehugger Robot 14*16467b97STreehugger Robot 1. Redistributions of source code must retain the above copyright 15*16467b97STreehugger Robot notice, this list of conditions and the following disclaimer. 16*16467b97STreehugger Robot 2. Redistributions in binary form must reproduce the above copyright 17*16467b97STreehugger Robot notice, this list of conditions and the following disclaimer in the 18*16467b97STreehugger Robot documentation and/or other materials provided with the distribution. 19*16467b97STreehugger Robot 3. The name of the author may not be used to endorse or promote products 20*16467b97STreehugger Robot derived from this software without specific prior written permission. 21*16467b97STreehugger Robot 22*16467b97STreehugger RobotTHIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 23*16467b97STreehugger RobotIMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 24*16467b97STreehugger RobotOF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
25*16467b97STreehugger RobotIN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 26*16467b97STreehugger RobotINCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27*16467b97STreehugger RobotNOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28*16467b97STreehugger RobotDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29*16467b97STreehugger RobotTHEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30*16467b97STreehugger Robot(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 31*16467b97STreehugger RobotTHIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32*16467b97STreehugger Robot 33*16467b97STreehugger Robot=end 34*16467b97STreehugger Robot 35*16467b97STreehugger Robotmodule ANTLR3 36*16467b97STreehugger Robotunless const_defined?( :RecognizerSharedState ) 37*16467b97STreehugger Robot 38*16467b97STreehugger RobotRecognizerSharedState = Struct.new( 39*16467b97STreehugger Robot :following, 40*16467b97STreehugger Robot :error_recovery, 41*16467b97STreehugger Robot :last_error_index, 42*16467b97STreehugger Robot :backtracking, 43*16467b97STreehugger Robot :rule_memory, 44*16467b97STreehugger Robot :syntax_errors, 45*16467b97STreehugger Robot :token, 46*16467b97STreehugger Robot :token_start_position, 47*16467b97STreehugger Robot :token_start_line, 48*16467b97STreehugger Robot :token_start_column, 49*16467b97STreehugger Robot :channel, 50*16467b97STreehugger Robot :type, 51*16467b97STreehugger Robot :text 52*16467b97STreehugger Robot) 53*16467b97STreehugger Robot 54*16467b97STreehugger Robot=begin rdoc ANTLR3::RecognizerSharedState 55*16467b97STreehugger Robot 56*16467b97STreehugger RobotA big Struct-based class containing most of the data that makes up a 57*16467b97STreehugger Robotrecognizer's state. 
These attributes are externalized from the recognizer itself 58*16467b97STreehugger Robotso that recognizer delegation (which occurs when you import other grammars into 59*16467b97STreehugger Robotyour grammar) can function; multiple recognizers can share a common state. 60*16467b97STreehugger Robot 61*16467b97STreehugger Robot== Structure Attributes 62*16467b97STreehugger Robot 63*16467b97STreehugger Robotfollowing:: 64*16467b97STreehugger Robot a stack that tracks follow sets for error recovery 65*16467b97STreehugger Roboterror_recovery:: 66*16467b97STreehugger Robot a flag indicating whether or not the recognizer is in error recovery mode 67*16467b97STreehugger Robotlast_error_index:: 68*16467b97STreehugger Robot the index in the input stream of the last error 69*16467b97STreehugger Robotbacktracking:: 70*16467b97STreehugger Robot tracks the backtracking depth 71*16467b97STreehugger Robotrule_memory:: 72*16467b97STreehugger Robot if a grammar is compiled with the memoization option, this will be 73*16467b97STreehugger Robot set to a hash mapping previously parsed rules to cached indices 74*16467b97STreehugger Robotsyntax_errors:: 75*16467b97STreehugger Robot tracks the number of syntax errors seen so far 76*16467b97STreehugger Robottoken:: 77*16467b97STreehugger Robot holds newly constructed tokens for lexer rules 78*16467b97STreehugger Robottoken_start_position:: 79*16467b97STreehugger Robot the input stream index at which the token starts 80*16467b97STreehugger Robottoken_start_line:: 81*16467b97STreehugger Robot the input stream line number at which the token starts 82*16467b97STreehugger Robottoken_start_column:: 83*16467b97STreehugger Robot the input stream column at which the token starts 84*16467b97STreehugger Robotchannel:: 85*16467b97STreehugger Robot the channel value of the target token 86*16467b97STreehugger Robottype:: 87*16467b97STreehugger Robot the type value of the target token 88*16467b97STreehugger Robottext:: 89*16467b97STreehugger Robot the 
text of the target token 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot=end 92*16467b97STreehugger Robot 93*16467b97STreehugger Robotclass RecognizerSharedState 94*16467b97STreehugger Robot def initialize 95*16467b97STreehugger Robot super( [], false, -1, 0, nil, 0, nil, -1 ) 96*16467b97STreehugger Robot # ^-- same as this --v 97*16467b97STreehugger Robot # self.following = [] 98*16467b97STreehugger Robot # self.error_recovery = false 99*16467b97STreehugger Robot # self.last_error_index = -1 100*16467b97STreehugger Robot # self.backtracking = 0 101*16467b97STreehugger Robot # self.syntax_errors = 0 102*16467b97STreehugger Robot # self.token_start_position = -1 103*16467b97STreehugger Robot end 104*16467b97STreehugger Robot 105*16467b97STreehugger Robot 106*16467b97STreehugger Robot # restores all of the state variables to their respective 107*16467b97STreehugger Robot # initial default values 108*16467b97STreehugger Robot def reset! 109*16467b97STreehugger Robot self.following.clear 110*16467b97STreehugger Robot self.error_recovery = false 111*16467b97STreehugger Robot self.last_error_index = -1 112*16467b97STreehugger Robot self.backtracking = 0 113*16467b97STreehugger Robot self.rule_memory and rule_memory.clear 114*16467b97STreehugger Robot self.syntax_errors = 0 115*16467b97STreehugger Robot self.token = nil 116*16467b97STreehugger Robot self.token_start_position = -1 117*16467b97STreehugger Robot self.token_start_line = nil 118*16467b97STreehugger Robot self.token_start_column = nil 119*16467b97STreehugger Robot self.channel = nil 120*16467b97STreehugger Robot self.type = nil 121*16467b97STreehugger Robot self.text = nil 122*16467b97STreehugger Robot end 123*16467b97STreehugger Robotend 124*16467b97STreehugger Robot 125*16467b97STreehugger Robotend # unless const_defined?( :RecognizerSharedState ) 126*16467b97STreehugger Robot 127*16467b97STreehugger Robot=begin rdoc ANTLR3::Recognizer 128*16467b97STreehugger Robot 129*16467b97STreehugger Robot= 
Scope 130*16467b97STreehugger Robot 131*16467b97STreehugger RobotScope is used to represent instances of ANTLR's various attribute scopes. 132*16467b97STreehugger RobotIt is identical to Ruby's built-in Struct class, but it takes string 133*16467b97STreehugger Robotattribute declarations from the ANTLR grammar as parameters, and overrides 134*16467b97STreehugger Robotthe #initialize method to set the default values if any are present in 135*16467b97STreehugger Robotthe scope declaration. 136*16467b97STreehugger Robot 137*16467b97STreehugger Robot Block = Scope.new( "name", "depth = 0", "variables = {}" ) 138*16467b97STreehugger Robot Block.new # => #<struct Block name=nil, depth=0, variables={}> 139*16467b97STreehugger Robot Block.new( "function" ) # => #<struct Block name="function", depth=0, variables={}> 140*16467b97STreehugger Robot Block.new( 'a', 1, :x => 3 ) # => #<struct Block name="a", depth=1, variables={ :x => 3 }> 141*16467b97STreehugger Robot 142*16467b97STreehugger Robot=end 143*16467b97STreehugger Robot 144*16467b97STreehugger Robotclass Scope < ::Struct 145*16467b97STreehugger Robot def self.new( *declarations, &body ) 146*16467b97STreehugger Robot names = [] 147*16467b97STreehugger Robot defaults = {} 148*16467b97STreehugger Robot for decl in declarations 149*16467b97STreehugger Robot name, default = decl.to_s.split( /\s*=\s*/, 2 ) 150*16467b97STreehugger Robot names << ( name = name.to_sym ) 151*16467b97STreehugger Robot default and defaults[ name ] = default 152*16467b97STreehugger Robot end 153*16467b97STreehugger Robot super( *names ) do 154*16467b97STreehugger Robot 155*16467b97STreehugger Robot # If no defaults, leave the initialize method the same as 156*16467b97STreehugger Robot # the struct's default initialize for speed. Otherwise, 157*16467b97STreehugger Robot # overwrite the initialize to populate with default values. 158*16467b97STreehugger Robot unless defaults.empty? 
159*16467b97STreehugger Robot parameters = names.map do | name | 160*16467b97STreehugger Robot "#{ name } = " << defaults.fetch( name, 'nil' ) 161*16467b97STreehugger Robot end.join( ', ' ) 162*16467b97STreehugger Robot class_eval( <<-END ) 163*16467b97STreehugger Robot def initialize( #{ parameters } ) 164*16467b97STreehugger Robot super( #{ names.join( ', ' ) } ) 165*16467b97STreehugger Robot end 166*16467b97STreehugger Robot END 167*16467b97STreehugger Robot end 168*16467b97STreehugger Robot 169*16467b97STreehugger Robot body and class_eval( &body ) 170*16467b97STreehugger Robot end 171*16467b97STreehugger Robot end 172*16467b97STreehugger Robotend 173*16467b97STreehugger Robot 174*16467b97STreehugger Robot=begin rdoc ANTLR3::Recognizer 175*16467b97STreehugger Robot 176*16467b97STreehugger Robot= Recognizer 177*16467b97STreehugger Robot 178*16467b97STreehugger RobotAs the base class of all ANTLR-generated recognizers, Recognizer provides 179*16467b97STreehugger Robotmuch of the shared functionality and structure used in the recognition process. 180*16467b97STreehugger RobotFor all effective purposes, the class and its immediate subclasses Lexer, 181*16467b97STreehugger RobotParser, and TreeParser are abstract classes. They can be instantiated, but 182*16467b97STreehugger Robotthey're pretty useless on their own. Instead, to make useful code, you write an 183*16467b97STreehugger RobotANTLR grammar and ANTLR will generate classes which inherit from one of the 184*16467b97STreehugger Robotrecognizer base classes, providing the implementation of the grammar rules 185*16467b97STreehugger Robotitself. this group of classes to implement necessary tasks. 
Recognizer 186*16467b97STreehugger Robotdefines methods related to: 187*16467b97STreehugger Robot 188*16467b97STreehugger Robot* token and character matching 189*16467b97STreehugger Robot* prediction and recognition strategy 190*16467b97STreehugger Robot* recovering from errors 191*16467b97STreehugger Robot* reporting errors 192*16467b97STreehugger Robot* memoization 193*16467b97STreehugger Robot* simple rule tracing and debugging 194*16467b97STreehugger Robot 195*16467b97STreehugger Robot=end 196*16467b97STreehugger Robot 197*16467b97STreehugger Robotclass Recognizer 198*16467b97STreehugger Robot include Constants 199*16467b97STreehugger Robot include Error 200*16467b97STreehugger Robot include TokenFactory 201*16467b97STreehugger Robot extend ClassMacros 202*16467b97STreehugger Robot 203*16467b97STreehugger Robot @rules = {} 204*16467b97STreehugger Robot 205*16467b97STreehugger Robot # inherited class methods and hooks 206*16467b97STreehugger Robot class << self 207*16467b97STreehugger Robot attr_reader :grammar_file_name, 208*16467b97STreehugger Robot :antlr_version, 209*16467b97STreehugger Robot :antlr_version_string, 210*16467b97STreehugger Robot :library_version_string, 211*16467b97STreehugger Robot :grammar_home 212*16467b97STreehugger Robot 213*16467b97STreehugger Robot attr_accessor :token_scheme, :default_rule 214*16467b97STreehugger Robot 215*16467b97STreehugger Robot # generated recognizer code uses this method to stamp 216*16467b97STreehugger Robot # the code with the name of the grammar file and 217*16467b97STreehugger Robot # the current version of ANTLR being used to generate 218*16467b97STreehugger Robot # the code 219*16467b97STreehugger Robot def generated_using( grammar_file, antlr_version, library_version = nil ) 220*16467b97STreehugger Robot @grammar_file_name = grammar_file.freeze 221*16467b97STreehugger Robot @antlr_version_string = antlr_version.freeze 222*16467b97STreehugger Robot @library_version = Util.parse_version( library_version ) 
223*16467b97STreehugger Robot if @antlr_version_string =~ /^(\d+)\.(\d+)(?:\.(\d+)(?:b(\d+))?)?(.*)$/ 224*16467b97STreehugger Robot @antlr_version = [ $1, $2, $3, $4 ].map! { |str| str.to_i } 225*16467b97STreehugger Robot timestamp = $5.strip 226*16467b97STreehugger Robot #@antlr_release_time = $5.empty? ? nil : Time.parse($5) 227*16467b97STreehugger Robot else 228*16467b97STreehugger Robot raise "bad version string: %p" % version_string 229*16467b97STreehugger Robot end 230*16467b97STreehugger Robot end 231*16467b97STreehugger Robot 232*16467b97STreehugger Robot # this method is used to generate return-value structures for 233*16467b97STreehugger Robot # rules with multiple return values. To avoid generating 234*16467b97STreehugger Robot # a special class for ever rule in AST parsers and such 235*16467b97STreehugger Robot # (where most rules have the same default set of return values), 236*16467b97STreehugger Robot # each recognizer gets a default return value structure 237*16467b97STreehugger Robot # assigned to the constant +Return+. Rules which don't 238*16467b97STreehugger Robot # require additional custom members will have a rule-return 239*16467b97STreehugger Robot # name constant that just points to the generic return 240*16467b97STreehugger Robot # value. 241*16467b97STreehugger Robot def define_return_scope( *members ) 242*16467b97STreehugger Robot if members.empty? 
then generic_return_scope 243*16467b97STreehugger Robot else 244*16467b97STreehugger Robot members += return_scope_members 245*16467b97STreehugger Robot Struct.new( *members ) 246*16467b97STreehugger Robot end 247*16467b97STreehugger Robot end 248*16467b97STreehugger Robot 249*16467b97STreehugger Robot # used as a hook to add additional default members 250*16467b97STreehugger Robot # to default return value structures 251*16467b97STreehugger Robot # For example, all AST-building parsers override 252*16467b97STreehugger Robot # this method to add an extra +:tree+ field to 253*16467b97STreehugger Robot # all rule return structures. 254*16467b97STreehugger Robot def return_scope_members 255*16467b97STreehugger Robot [ :start, :stop ] 256*16467b97STreehugger Robot end 257*16467b97STreehugger Robot 258*16467b97STreehugger Robot # sets up and returns the generic rule return 259*16467b97STreehugger Robot # scope for a recognizer 260*16467b97STreehugger Robot def generic_return_scope 261*16467b97STreehugger Robot @generic_return_scope ||= begin 262*16467b97STreehugger Robot struct = Struct.new( *return_scope_members ) 263*16467b97STreehugger Robot const_set( :Return, struct ) 264*16467b97STreehugger Robot end 265*16467b97STreehugger Robot end 266*16467b97STreehugger Robot 267*16467b97STreehugger Robot def imported_grammars 268*16467b97STreehugger Robot @imported_grammars ||= Set.new 269*16467b97STreehugger Robot end 270*16467b97STreehugger Robot 271*16467b97STreehugger Robot def master_grammars 272*16467b97STreehugger Robot @master_grammars ||= [] 273*16467b97STreehugger Robot end 274*16467b97STreehugger Robot 275*16467b97STreehugger Robot def master 276*16467b97STreehugger Robot master_grammars.last 277*16467b97STreehugger Robot end 278*16467b97STreehugger Robot 279*16467b97STreehugger Robot def masters( *grammar_names ) 280*16467b97STreehugger Robot for grammar in grammar_names 281*16467b97STreehugger Robot unless master_grammars.include?( grammar ) 
282*16467b97STreehugger Robot master_grammars << grammar 283*16467b97STreehugger Robot attr_reader( Util.snake_case( grammar ) ) 284*16467b97STreehugger Robot end 285*16467b97STreehugger Robot end 286*16467b97STreehugger Robot end 287*16467b97STreehugger Robot private :masters 288*16467b97STreehugger Robot 289*16467b97STreehugger Robot def imports( *grammar_names ) 290*16467b97STreehugger Robot for grammar in grammar_names 291*16467b97STreehugger Robot imported_grammars.add?( grammar.to_sym ) and 292*16467b97STreehugger Robot attr_reader( Util.snake_case( grammar ) ) 293*16467b97STreehugger Robot end 294*16467b97STreehugger Robot return imported_grammars 295*16467b97STreehugger Robot end 296*16467b97STreehugger Robot private :imports 297*16467b97STreehugger Robot 298*16467b97STreehugger Robot def rules 299*16467b97STreehugger Robot self::RULE_METHODS.dup rescue [] 300*16467b97STreehugger Robot end 301*16467b97STreehugger Robot 302*16467b97STreehugger Robot def default_rule 303*16467b97STreehugger Robot @default_rule ||= rules.first 304*16467b97STreehugger Robot end 305*16467b97STreehugger Robot 306*16467b97STreehugger Robot def debug? 307*16467b97STreehugger Robot return false 308*16467b97STreehugger Robot end 309*16467b97STreehugger Robot 310*16467b97STreehugger Robot def profile? 
311*16467b97STreehugger Robot return false 312*16467b97STreehugger Robot end 313*16467b97STreehugger Robot 314*16467b97STreehugger Robot def Scope( *declarations, &body ) 315*16467b97STreehugger Robot Scope.new( *declarations, &body ) 316*16467b97STreehugger Robot end 317*16467b97STreehugger Robot 318*16467b97STreehugger Robot def token_class 319*16467b97STreehugger Robot @token_class ||= begin 320*16467b97STreehugger Robot self::Token rescue 321*16467b97STreehugger Robot superclass.token_class rescue 322*16467b97STreehugger Robot ANTLR3::CommonToken 323*16467b97STreehugger Robot end 324*16467b97STreehugger Robot end 325*16467b97STreehugger Robot private :generated_using 326*16467b97STreehugger Robot end 327*16467b97STreehugger Robot 328*16467b97STreehugger Robot @grammar_file_name = nil 329*16467b97STreehugger Robot @antlr_version = ANTLR3::ANTLR_VERSION 330*16467b97STreehugger Robot @antlr_version_string = ANTLR3::ANTLR_VERSION_STRING 331*16467b97STreehugger Robot 332*16467b97STreehugger Robot def grammar_file_name 333*16467b97STreehugger Robot self.class.grammar_file_name 334*16467b97STreehugger Robot end 335*16467b97STreehugger Robot 336*16467b97STreehugger Robot def antlr_version 337*16467b97STreehugger Robot self.class.antlr_version 338*16467b97STreehugger Robot end 339*16467b97STreehugger Robot 340*16467b97STreehugger Robot def antlr_version_string 341*16467b97STreehugger Robot self.class.antlr_version_string 342*16467b97STreehugger Robot end 343*16467b97STreehugger Robot 344*16467b97STreehugger Robot attr_accessor :input 345*16467b97STreehugger Robot attr_reader :state 346*16467b97STreehugger Robot 347*16467b97STreehugger Robot def each_delegate 348*16467b97STreehugger Robot block_given? 
or return enum_for( __method__ ) 349*16467b97STreehugger Robot for grammar in self.class.imported_grammars 350*16467b97STreehugger Robot del = __send__( Util.snake_case( grammar ) ) and 351*16467b97STreehugger Robot yield( del ) 352*16467b97STreehugger Robot end 353*16467b97STreehugger Robot end 354*16467b97STreehugger Robot 355*16467b97STreehugger Robot # Create a new recognizer. The constructor simply ensures that 356*16467b97STreehugger Robot # all recognizers are initialized with a shared state object. 357*16467b97STreehugger Robot # See the main recognizer subclasses for more specific 358*16467b97STreehugger Robot # information about creating recognizer objects like 359*16467b97STreehugger Robot # lexers and parsers. 360*16467b97STreehugger Robot def initialize( options = {} ) 361*16467b97STreehugger Robot @state = options[ :state ] || RecognizerSharedState.new 362*16467b97STreehugger Robot @error_output = options.fetch( :error_output, $stderr ) 363*16467b97STreehugger Robot defined?( @input ) or @input = nil 364*16467b97STreehugger Robot initialize_dfas 365*16467b97STreehugger Robot end 366*16467b97STreehugger Robot 367*16467b97STreehugger Robot # Resets the recognizer's state data to initial values. 368*16467b97STreehugger Robot # As a result, all error tracking and error recovery 369*16467b97STreehugger Robot # data accumulated in the current state will be cleared. 370*16467b97STreehugger Robot # It will also attempt to reset the input stream 371*16467b97STreehugger Robot # via input.reset, but it ignores any errors received 372*16467b97STreehugger Robot # from doing so. Thus the input stream is not guarenteed 373*16467b97STreehugger Robot # to be rewound to its initial position 374*16467b97STreehugger Robot def reset 375*16467b97STreehugger Robot @state and @state.reset! 
376*16467b97STreehugger Robot @input and @input.reset rescue nil 377*16467b97STreehugger Robot end 378*16467b97STreehugger Robot 379*16467b97STreehugger Robot # Attempt to match the current input symbol the token type 380*16467b97STreehugger Robot # specified by +type+. If the symbol matches the type, 381*16467b97STreehugger Robot # consume the current symbol and return its value. If 382*16467b97STreehugger Robot # the symbol doesn't match, attempt to use the follow-set 383*16467b97STreehugger Robot # data provided by +follow+ to recover from the mismatched 384*16467b97STreehugger Robot # token. 385*16467b97STreehugger Robot def match( type, follow ) 386*16467b97STreehugger Robot matched_symbol = current_symbol 387*16467b97STreehugger Robot if @input.peek == type 388*16467b97STreehugger Robot @input.consume 389*16467b97STreehugger Robot @state.error_recovery = false 390*16467b97STreehugger Robot return matched_symbol 391*16467b97STreehugger Robot end 392*16467b97STreehugger Robot raise( BacktrackingFailed ) if @state.backtracking > 0 393*16467b97STreehugger Robot return recover_from_mismatched_token( type, follow ) 394*16467b97STreehugger Robot end 395*16467b97STreehugger Robot 396*16467b97STreehugger Robot # match anything -- i.e. wildcard match. Simply consume 397*16467b97STreehugger Robot # the current symbol from the input stream. 
398*16467b97STreehugger Robot def match_any 399*16467b97STreehugger Robot @state.error_recovery = false 400*16467b97STreehugger Robot @input.consume 401*16467b97STreehugger Robot end 402*16467b97STreehugger Robot 403*16467b97STreehugger Robot ############################################################################################## 404*16467b97STreehugger Robot ###################################### Error Reporting ####################################### 405*16467b97STreehugger Robot ############################################################################################## 406*16467b97STreehugger Robot ############################################################################################## 407*16467b97STreehugger Robot 408*16467b97STreehugger Robot # When a recognition error occurs, this method is the main 409*16467b97STreehugger Robot # hook for carrying out the error reporting process. The 410*16467b97STreehugger Robot # default implementation calls +display_recognition_error+ 411*16467b97STreehugger Robot # to display the error info on $stderr. 412*16467b97STreehugger Robot def report_error( e = $! ) 413*16467b97STreehugger Robot @state.error_recovery and return 414*16467b97STreehugger Robot @state.syntax_errors += 1 415*16467b97STreehugger Robot @state.error_recovery = true 416*16467b97STreehugger Robot display_recognition_error( e ) 417*16467b97STreehugger Robot end 418*16467b97STreehugger Robot 419*16467b97STreehugger Robot # error reporting hook for presenting the information 420*16467b97STreehugger Robot # The default implementation builds appropriate error 421*16467b97STreehugger Robot # message text using +error_header+ and +error_message+, 422*16467b97STreehugger Robot # and calls +emit_error_message+ to write the error 423*16467b97STreehugger Robot # message out to some source 424*16467b97STreehugger Robot def display_recognition_error( e = $! 
) 425*16467b97STreehugger Robot header = error_header( e ) 426*16467b97STreehugger Robot message = error_message( e ) 427*16467b97STreehugger Robot emit_error_message( "#{ header } #{ message }" ) 428*16467b97STreehugger Robot end 429*16467b97STreehugger Robot 430*16467b97STreehugger Robot # used to construct an appropriate error message 431*16467b97STreehugger Robot # based on the specific type of error and the 432*16467b97STreehugger Robot # error's attributes 433*16467b97STreehugger Robot def error_message( e = $! ) 434*16467b97STreehugger Robot case e 435*16467b97STreehugger Robot when UnwantedToken 436*16467b97STreehugger Robot token_name = token_name( e.expecting ) 437*16467b97STreehugger Robot "extraneous input #{ token_error_display( e.unexpected_token ) } expecting #{ token_name }" 438*16467b97STreehugger Robot when MissingToken 439*16467b97STreehugger Robot token_name = token_name( e.expecting ) 440*16467b97STreehugger Robot "missing #{ token_name } at #{ token_error_display( e.symbol ) }" 441*16467b97STreehugger Robot when MismatchedToken 442*16467b97STreehugger Robot token_name = token_name( e.expecting ) 443*16467b97STreehugger Robot "mismatched input #{ token_error_display( e.symbol ) } expecting #{ token_name }" 444*16467b97STreehugger Robot when MismatchedTreeNode 445*16467b97STreehugger Robot token_name = token_name( e.expecting ) 446*16467b97STreehugger Robot "mismatched tree node: #{ e.symbol } expecting #{ token_name }" 447*16467b97STreehugger Robot when NoViableAlternative 448*16467b97STreehugger Robot "no viable alternative at input " << token_error_display( e.symbol ) 449*16467b97STreehugger Robot when MismatchedSet 450*16467b97STreehugger Robot "mismatched input %s expecting set %s" % 451*16467b97STreehugger Robot [ token_error_display( e.symbol ), e.expecting.inspect ] 452*16467b97STreehugger Robot when MismatchedNotSet 453*16467b97STreehugger Robot "mismatched input %s expecting set %s" % 454*16467b97STreehugger Robot [ 
token_error_display( e.symbol ), e.expecting.inspect ] 455*16467b97STreehugger Robot when FailedPredicate 456*16467b97STreehugger Robot "rule %s failed predicate: { %s }?" % [ e.rule_name, e.predicate_text ] 457*16467b97STreehugger Robot else e.message 458*16467b97STreehugger Robot end 459*16467b97STreehugger Robot end 460*16467b97STreehugger Robot 461*16467b97STreehugger Robot # 462*16467b97STreehugger Robot # used to add a tag to the error message that indicates 463*16467b97STreehugger Robot # the location of the input stream when the error 464*16467b97STreehugger Robot # occurred 465*16467b97STreehugger Robot # 466*16467b97STreehugger Robot def error_header( e = $! ) 467*16467b97STreehugger Robot e.location 468*16467b97STreehugger Robot end 469*16467b97STreehugger Robot 470*16467b97STreehugger Robot # 471*16467b97STreehugger Robot # formats a token object appropriately for inspection 472*16467b97STreehugger Robot # within an error message 473*16467b97STreehugger Robot # 474*16467b97STreehugger Robot def token_error_display( token ) 475*16467b97STreehugger Robot unless text = token.text || ( token.source_text rescue nil ) 476*16467b97STreehugger Robot text = 477*16467b97STreehugger Robot case 478*16467b97STreehugger Robot when token.type == EOF then '<EOF>' 479*16467b97STreehugger Robot when name = token_name( token.type ) rescue nil then "<#{ name }>" 480*16467b97STreehugger Robot when token.respond_to?( :name ) then "<#{ token.name }>" 481*16467b97STreehugger Robot else "<#{ token.type }>" 482*16467b97STreehugger Robot end 483*16467b97STreehugger Robot end 484*16467b97STreehugger Robot return text.inspect 485*16467b97STreehugger Robot end 486*16467b97STreehugger Robot 487*16467b97STreehugger Robot # 488*16467b97STreehugger Robot # Write the error report data out to some source. 
By default, 489*16467b97STreehugger Robot # the error message is written to $stderr 490*16467b97STreehugger Robot # 491*16467b97STreehugger Robot def emit_error_message( message ) 492*16467b97STreehugger Robot @error_output.puts( message ) if @error_output 493*16467b97STreehugger Robot end 494*16467b97STreehugger Robot 495*16467b97STreehugger Robot ############################################################################################## 496*16467b97STreehugger Robot ###################################### Error Recovery ######################################## 497*16467b97STreehugger Robot ############################################################################################## 498*16467b97STreehugger Robot 499*16467b97STreehugger Robot def recover( error = $! ) 500*16467b97STreehugger Robot @state.last_error_index == @input.index and @input.consume 501*16467b97STreehugger Robot @state.last_error_index = @input.index 502*16467b97STreehugger Robot 503*16467b97STreehugger Robot follow_set = compute_error_recovery_set 504*16467b97STreehugger Robot 505*16467b97STreehugger Robot resync { consume_until( follow_set ) } 506*16467b97STreehugger Robot end 507*16467b97STreehugger Robot 508*16467b97STreehugger Robot def resync 509*16467b97STreehugger Robot begin_resync 510*16467b97STreehugger Robot return( yield ) 511*16467b97STreehugger Robot ensure 512*16467b97STreehugger Robot end_resync 513*16467b97STreehugger Robot end 514*16467b97STreehugger Robot 515*16467b97STreehugger Robot # overridable hook method that is executed at the start of the 516*16467b97STreehugger Robot # resyncing procedure in recover 517*16467b97STreehugger Robot # 518*16467b97STreehugger Robot # by default, it does nothing 519*16467b97STreehugger Robot def begin_resync 520*16467b97STreehugger Robot # do nothing 521*16467b97STreehugger Robot end 522*16467b97STreehugger Robot 523*16467b97STreehugger Robot # overridable hook method that is after the resyncing procedure has completed 
524*16467b97STreehugger Robot # 525*16467b97STreehugger Robot # by default, it does nothing 526*16467b97STreehugger Robot def end_resync 527*16467b97STreehugger Robot # do nothing 528*16467b97STreehugger Robot end 529*16467b97STreehugger Robot 530*16467b97STreehugger Robot # (The following explanation has been lifted directly from the 531*16467b97STreehugger Robot # source code documentation of the ANTLR Java runtime library) 532*16467b97STreehugger Robot # 533*16467b97STreehugger Robot # Compute the error recovery set for the current rule. During 534*16467b97STreehugger Robot # rule invocation, the parser pushes the set of tokens that can 535*16467b97STreehugger Robot # follow that rule reference on the stack; this amounts to 536*16467b97STreehugger Robot # computing FIRST of what follows the rule reference in the 537*16467b97STreehugger Robot # enclosing rule. This local follow set only includes tokens 538*16467b97STreehugger Robot # from within the rule; i.e., the FIRST computation done by 539*16467b97STreehugger Robot # ANTLR stops at the end of a rule. 540*16467b97STreehugger Robot # 541*16467b97STreehugger Robot # EXAMPLE 542*16467b97STreehugger Robot # 543*16467b97STreehugger Robot # When you find a "no viable alt exception", the input is not 544*16467b97STreehugger Robot # consistent with any of the alternatives for rule r. The best 545*16467b97STreehugger Robot # thing to do is to consume tokens until you see something that 546*16467b97STreehugger Robot # can legally follow a call to r *or* any rule that called r. 547*16467b97STreehugger Robot # You don't want the exact set of viable next tokens because the 548*16467b97STreehugger Robot # input might just be missing a token--you might consume the 549*16467b97STreehugger Robot # rest of the input looking for one of the missing tokens. 
  #
  # Consider grammar:
  #
  #   a : '[' b ']'
  #     | '(' b ')'
  #     ;
  #   b : c '^' INT ;
  #   c : ID
  #     | INT
  #     ;
  #
  # At each rule invocation, the set of tokens that could follow
  # that rule is pushed on a stack.  Here are the various "local"
  # follow sets:
  #
  #   FOLLOW( b1_in_a ) = FIRST( ']' ) = ']'
  #   FOLLOW( b2_in_a ) = FIRST( ')' ) = ')'
  #   FOLLOW( c_in_b ) = FIRST( '^' ) = '^'
  #
  # Upon erroneous input "[]", the call chain is
  #
  #   a -> b -> c
  #
  # and, hence, the follow context stack is:
  #
  #   depth  local follow set     after call to rule
  #     0         \<EOF>                    a (from main( ) )
  #     1          ']'                     b
  #     2          '^'                     c
  #
  # Notice that <tt>')'</tt> is not included, because b would have to have
  # been called from a different context in rule a for ')' to be
  # included.
  #
  # For error recovery, we cannot consider FOLLOW(c)
  # (context-sensitive or otherwise).
  # We need the combined set of
  # all context-sensitive FOLLOW sets--the set of all tokens that
  # could follow any reference in the call chain.  We need to
  # resync to one of those tokens.  Note that FOLLOW(c)='^' and if
  # we resync'd to that token, we'd consume until EOF.  We need to
  # sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
  # In this case, for input "[]", LA(1) is in this set so we would
  # not consume anything and after printing an error rule c would
  # return normally.  It would not find the required '^' though.
  # At this point, it gets a mismatched token error and throws an
  # exception (since LA(1) is not in the viable following token
  # set).  The rule exception handler tries to recover, but finds
  # the same recovery set and doesn't consume anything.  Rule b
  # exits normally returning to rule a.  Now it finds the ']' (and
  # with the successful match exits errorRecovery mode).
  #
  # So, you can see that the parser walks up the call chain looking
  # for the token that was a member of the recovery set.
  #
  # Errors are not generated in errorRecovery mode.
605*16467b97STreehugger Robot # 606*16467b97STreehugger Robot # ANTLR's error recovery mechanism is based upon original ideas: 607*16467b97STreehugger Robot # 608*16467b97STreehugger Robot # "Algorithms + Data Structures = Programs" by Niklaus Wirth 609*16467b97STreehugger Robot # 610*16467b97STreehugger Robot # and 611*16467b97STreehugger Robot # 612*16467b97STreehugger Robot # "A note on error recovery in recursive descent parsers": 613*16467b97STreehugger Robot # http://portal.acm.org/citation.cfm?id=947902.947905 614*16467b97STreehugger Robot # 615*16467b97STreehugger Robot # Later, Josef Grosch had some good ideas: 616*16467b97STreehugger Robot # 617*16467b97STreehugger Robot # "Efficient and Comfortable Error Recovery in Recursive Descent 618*16467b97STreehugger Robot # Parsers": 619*16467b97STreehugger Robot # ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 620*16467b97STreehugger Robot # 621*16467b97STreehugger Robot # Like Grosch I implemented local FOLLOW sets that are combined 622*16467b97STreehugger Robot # at run-time upon error to avoid overhead during parsing. 
623*16467b97STreehugger Robot def compute_error_recovery_set 624*16467b97STreehugger Robot combine_follows( false ) 625*16467b97STreehugger Robot end 626*16467b97STreehugger Robot 627*16467b97STreehugger Robot def recover_from_mismatched_token( type, follow ) 628*16467b97STreehugger Robot if mismatch_is_unwanted_token?( type ) 629*16467b97STreehugger Robot err = UnwantedToken( type ) 630*16467b97STreehugger Robot resync { @input.consume } 631*16467b97STreehugger Robot report_error( err ) 632*16467b97STreehugger Robot 633*16467b97STreehugger Robot return @input.consume 634*16467b97STreehugger Robot end 635*16467b97STreehugger Robot 636*16467b97STreehugger Robot if mismatch_is_missing_token?( follow ) 637*16467b97STreehugger Robot inserted = missing_symbol( nil, type, follow ) 638*16467b97STreehugger Robot report_error( MissingToken( type, inserted ) ) 639*16467b97STreehugger Robot return inserted 640*16467b97STreehugger Robot end 641*16467b97STreehugger Robot 642*16467b97STreehugger Robot raise MismatchedToken( type ) 643*16467b97STreehugger Robot end 644*16467b97STreehugger Robot 645*16467b97STreehugger Robot def recover_from_mismatched_set( e, follow ) 646*16467b97STreehugger Robot if mismatch_is_missing_token?( follow ) 647*16467b97STreehugger Robot report_error( e ) 648*16467b97STreehugger Robot return missing_symbol( e, INVALID_TOKEN_TYPE, follow ) 649*16467b97STreehugger Robot end 650*16467b97STreehugger Robot raise e 651*16467b97STreehugger Robot end 652*16467b97STreehugger Robot 653*16467b97STreehugger Robot def recover_from_mismatched_element( e, follow ) 654*16467b97STreehugger Robot follow.nil? 
and return false 655*16467b97STreehugger Robot if follow.include?( EOR_TOKEN_TYPE ) 656*16467b97STreehugger Robot viable_tokens = compute_context_sensitive_rule_follow 657*16467b97STreehugger Robot follow = ( follow | viable_tokens ) - Set[ EOR_TOKEN_TYPE ] 658*16467b97STreehugger Robot end 659*16467b97STreehugger Robot if follow.include?( @input.peek ) 660*16467b97STreehugger Robot report_error( e ) 661*16467b97STreehugger Robot return true 662*16467b97STreehugger Robot end 663*16467b97STreehugger Robot return false 664*16467b97STreehugger Robot end 665*16467b97STreehugger Robot 666*16467b97STreehugger Robot # Conjure up a missing token during error recovery. 667*16467b97STreehugger Robot # 668*16467b97STreehugger Robot # The recognizer attempts to recover from single missing 669*16467b97STreehugger Robot # symbols. But, actions might refer to that missing symbol. 670*16467b97STreehugger Robot # For example, x=ID {f($x);}. The action clearly assumes 671*16467b97STreehugger Robot # that there has been an identifier matched previously and that 672*16467b97STreehugger Robot # $x points at that token. If that token is missing, but 673*16467b97STreehugger Robot # the next token in the stream is what we want we assume that 674*16467b97STreehugger Robot # this token is missing and we keep going. Because we 675*16467b97STreehugger Robot # have to return some token to replace the missing token, 676*16467b97STreehugger Robot # we have to conjure one up. This method gives the user control 677*16467b97STreehugger Robot # over the tokens returned for missing tokens. Mostly, 678*16467b97STreehugger Robot # you will want to create something special for identifier 679*16467b97STreehugger Robot # tokens. For literals such as '{' and ',', the default 680*16467b97STreehugger Robot # action in the parser or tree parser works. It simply creates 681*16467b97STreehugger Robot # a CommonToken of the appropriate type. The text will be the token. 
682*16467b97STreehugger Robot # If you change what tokens must be created by the lexer, 683*16467b97STreehugger Robot # override this method to create the appropriate tokens. 684*16467b97STreehugger Robot def missing_symbol( error, expected_token_type, follow ) 685*16467b97STreehugger Robot return nil 686*16467b97STreehugger Robot end 687*16467b97STreehugger Robot 688*16467b97STreehugger Robot def mismatch_is_unwanted_token?( type ) 689*16467b97STreehugger Robot @input.peek( 2 ) == type 690*16467b97STreehugger Robot end 691*16467b97STreehugger Robot 692*16467b97STreehugger Robot def mismatch_is_missing_token?( follow ) 693*16467b97STreehugger Robot follow.nil? and return false 694*16467b97STreehugger Robot if follow.include?( EOR_TOKEN_TYPE ) 695*16467b97STreehugger Robot viable_tokens = compute_context_sensitive_rule_follow 696*16467b97STreehugger Robot follow = follow | viable_tokens 697*16467b97STreehugger Robot 698*16467b97STreehugger Robot follow.delete( EOR_TOKEN_TYPE ) unless @state.following.empty? 699*16467b97STreehugger Robot end 700*16467b97STreehugger Robot if follow.include?( @input.peek ) or follow.include?( EOR_TOKEN_TYPE ) 701*16467b97STreehugger Robot return true 702*16467b97STreehugger Robot end 703*16467b97STreehugger Robot return false 704*16467b97STreehugger Robot end 705*16467b97STreehugger Robot 706*16467b97STreehugger Robot def syntax_errors? 707*16467b97STreehugger Robot ( error_count = @state.syntax_errors ) > 0 and return( error_count ) 708*16467b97STreehugger Robot end 709*16467b97STreehugger Robot 710*16467b97STreehugger Robot # factor out what to do upon token mismatch so 711*16467b97STreehugger Robot # tree parsers can behave differently. 
712*16467b97STreehugger Robot # 713*16467b97STreehugger Robot # * override this method in your parser to do things 714*16467b97STreehugger Robot # like bailing out after the first error 715*16467b97STreehugger Robot # * just raise the exception instead of 716*16467b97STreehugger Robot # calling the recovery method. 717*16467b97STreehugger Robot # 718*16467b97STreehugger Robot def number_of_syntax_errors 719*16467b97STreehugger Robot @state.syntax_errors 720*16467b97STreehugger Robot end 721*16467b97STreehugger Robot 722*16467b97STreehugger Robot # 723*16467b97STreehugger Robot # Compute the context-sensitive +FOLLOW+ set for current rule. 724*16467b97STreehugger Robot # This is set of token types that can follow a specific rule 725*16467b97STreehugger Robot # reference given a specific call chain. You get the set of 726*16467b97STreehugger Robot # viable tokens that can possibly come next (look depth 1) 727*16467b97STreehugger Robot # given the current call chain. Contrast this with the 728*16467b97STreehugger Robot # definition of plain FOLLOW for rule r: 729*16467b97STreehugger Robot # 730*16467b97STreehugger Robot # FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 731*16467b97STreehugger Robot # 732*16467b97STreehugger Robot # where x in T* and alpha, beta in V*; T is set of terminals and 733*16467b97STreehugger Robot # V is the set of terminals and nonterminals. In other words, 734*16467b97STreehugger Robot # FOLLOW(r) is the set of all tokens that can possibly follow 735*16467b97STreehugger Robot # references to r in *any* sentential form (context). At 736*16467b97STreehugger Robot # runtime, however, we know precisely which context applies as 737*16467b97STreehugger Robot # we have the call chain. We may compute the exact (rather 738*16467b97STreehugger Robot # than covering superset) set of following tokens. 
  #
  # For example, consider grammar:
  #
  #   stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
  #        | "return" expr '.'
  #        ;
  #   expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
  #   atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
  #        | '(' expr ')'
  #        ;
  #
  # The FOLLOW sets are all inclusive whereas context-sensitive
  # FOLLOW sets are precisely what could follow a rule reference.
  # For input "i=(3);", here is the derivation:
  #
  #   stat => ID '=' expr ';'
  #        => ID '=' atom ('+' atom)* ';'
  #        => ID '=' '(' expr ')' ('+' atom)* ';'
  #        => ID '=' '(' atom ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ('+' atom)* ';'
  #        => ID '=' '(' INT ')' ';'
  #
  # At the "3" token, you'd have a call chain of
  #
  #   stat -> expr -> atom -> expr -> atom
  #
  # What can follow that specific nested ref to atom?  Exactly ')'
  # as you can see by looking at the derivation of this specific
  # input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
  #
  # You want the exact viable token set when recovering from a
  # token mismatch.
Upon token mismatch, if LA(1) is member of 771*16467b97STreehugger Robot # the viable next token set, then you know there is most likely 772*16467b97STreehugger Robot # a missing token in the input stream. "Insert" one by just not 773*16467b97STreehugger Robot # throwing an exception. 774*16467b97STreehugger Robot # 775*16467b97STreehugger Robot def compute_context_sensitive_rule_follow 776*16467b97STreehugger Robot combine_follows true 777*16467b97STreehugger Robot end 778*16467b97STreehugger Robot 779*16467b97STreehugger Robot def combine_follows( exact ) 780*16467b97STreehugger Robot follow_set = Set.new 781*16467b97STreehugger Robot @state.following.each_with_index.reverse_each do |local_follow_set, index| 782*16467b97STreehugger Robot follow_set |= local_follow_set 783*16467b97STreehugger Robot if exact 784*16467b97STreehugger Robot if local_follow_set.include?( EOR_TOKEN_TYPE ) 785*16467b97STreehugger Robot follow_set.delete( EOR_TOKEN_TYPE ) if index > 0 786*16467b97STreehugger Robot else 787*16467b97STreehugger Robot break 788*16467b97STreehugger Robot end 789*16467b97STreehugger Robot end 790*16467b97STreehugger Robot end 791*16467b97STreehugger Robot return follow_set 792*16467b97STreehugger Robot end 793*16467b97STreehugger Robot 794*16467b97STreehugger Robot # 795*16467b97STreehugger Robot # Match needs to return the current input symbol, which gets put 796*16467b97STreehugger Robot # into the label for the associated token ref; e.g., x=ID. Token 797*16467b97STreehugger Robot # and tree parsers need to return different objects. Rather than test 798*16467b97STreehugger Robot # for input stream type or change the IntStream interface, I use 799*16467b97STreehugger Robot # a simple method to ask the recognizer to tell me what the current 800*16467b97STreehugger Robot # input symbol is. 801*16467b97STreehugger Robot # 802*16467b97STreehugger Robot # This is ignored for lexers. 
803*16467b97STreehugger Robot # 804*16467b97STreehugger Robot def current_symbol 805*16467b97STreehugger Robot @input.look 806*16467b97STreehugger Robot end 807*16467b97STreehugger Robot 808*16467b97STreehugger Robot # 809*16467b97STreehugger Robot # Consume input symbols until one matches a type within types 810*16467b97STreehugger Robot # 811*16467b97STreehugger Robot # types can be a single symbol type or a set of symbol types 812*16467b97STreehugger Robot # 813*16467b97STreehugger Robot def consume_until( types ) 814*16467b97STreehugger Robot types.is_a?( Set ) or types = Set[ *types ] 815*16467b97STreehugger Robot type = @input.peek 816*16467b97STreehugger Robot until type == EOF or types.include?( type ) 817*16467b97STreehugger Robot @input.consume 818*16467b97STreehugger Robot type = @input.peek 819*16467b97STreehugger Robot end 820*16467b97STreehugger Robot return( type ) 821*16467b97STreehugger Robot end 822*16467b97STreehugger Robot 823*16467b97STreehugger Robot # 824*16467b97STreehugger Robot # Returns true if the recognizer is currently in a decision for which 825*16467b97STreehugger Robot # backtracking has been enabled 826*16467b97STreehugger Robot # 827*16467b97STreehugger Robot def backtracking? 
828*16467b97STreehugger Robot @state.backtracking > 0 829*16467b97STreehugger Robot end 830*16467b97STreehugger Robot 831*16467b97STreehugger Robot def backtracking_level 832*16467b97STreehugger Robot @state.backtracking 833*16467b97STreehugger Robot end 834*16467b97STreehugger Robot 835*16467b97STreehugger Robot def backtracking_level=( n ) 836*16467b97STreehugger Robot @state.backtracking = n 837*16467b97STreehugger Robot end 838*16467b97STreehugger Robot 839*16467b97STreehugger Robot def backtrack 840*16467b97STreehugger Robot @state.backtracking += 1 841*16467b97STreehugger Robot start = @input.mark 842*16467b97STreehugger Robot success = 843*16467b97STreehugger Robot begin yield 844*16467b97STreehugger Robot rescue BacktrackingFailed then false 845*16467b97STreehugger Robot else true 846*16467b97STreehugger Robot end 847*16467b97STreehugger Robot return success 848*16467b97STreehugger Robot ensure 849*16467b97STreehugger Robot @input.rewind( start ) 850*16467b97STreehugger Robot @state.backtracking -= 1 851*16467b97STreehugger Robot end 852*16467b97STreehugger Robot 853*16467b97STreehugger Robot def syntactic_predicate?( name ) 854*16467b97STreehugger Robot backtrack { send name } 855*16467b97STreehugger Robot end 856*16467b97STreehugger Robot 857*16467b97STreehugger Robot alias backtracking backtracking_level 858*16467b97STreehugger Robot alias backtracking= backtracking_level= 859*16467b97STreehugger Robot 860*16467b97STreehugger Robot def rule_memoization( rule, start_index ) 861*16467b97STreehugger Robot @state.rule_memory.fetch( rule ) do 862*16467b97STreehugger Robot @state.rule_memory[ rule ] = Hash.new( MEMO_RULE_UNKNOWN ) 863*16467b97STreehugger Robot end[ start_index ] 864*16467b97STreehugger Robot end 865*16467b97STreehugger Robot 866*16467b97STreehugger Robot def already_parsed_rule?( rule ) 867*16467b97STreehugger Robot stop_index = rule_memoization( rule, @input.index ) 868*16467b97STreehugger Robot case stop_index 869*16467b97STreehugger Robot 
when MEMO_RULE_UNKNOWN then return false 870*16467b97STreehugger Robot when MEMO_RULE_FAILED 871*16467b97STreehugger Robot raise BacktrackingFailed 872*16467b97STreehugger Robot else 873*16467b97STreehugger Robot @input.seek( stop_index + 1 ) 874*16467b97STreehugger Robot end 875*16467b97STreehugger Robot return true 876*16467b97STreehugger Robot end 877*16467b97STreehugger Robot 878*16467b97STreehugger Robot def memoize( rule, start_index, success ) 879*16467b97STreehugger Robot stop_index = success ? @input.index - 1 : MEMO_RULE_FAILED 880*16467b97STreehugger Robot memo = @state.rule_memory[ rule ] and memo[ start_index ] = stop_index 881*16467b97STreehugger Robot end 882*16467b97STreehugger Robot 883*16467b97STreehugger Robot def trace_in( rule_name, rule_index, input_symbol ) 884*16467b97STreehugger Robot @error_output.printf( "--> enter %s on %s", rule_name, input_symbol ) 885*16467b97STreehugger Robot @state.backtracking > 0 and @error_output.printf( 886*16467b97STreehugger Robot " (in backtracking mode: depth = %s)", @state.backtracking 887*16467b97STreehugger Robot ) 888*16467b97STreehugger Robot @error_output.print( "\n" ) 889*16467b97STreehugger Robot end 890*16467b97STreehugger Robot 891*16467b97STreehugger Robot def trace_out( rule_name, rule_index, input_symbol ) 892*16467b97STreehugger Robot @error_output.printf( "<-- exit %s on %s", rule_name, input_symbol ) 893*16467b97STreehugger Robot @state.backtracking > 0 and @error_output.printf( 894*16467b97STreehugger Robot " (in backtracking mode: depth = %s)", @state.backtracking 895*16467b97STreehugger Robot ) 896*16467b97STreehugger Robot @error_output.print( "\n" ) 897*16467b97STreehugger Robot end 898*16467b97STreehugger Robot 899*16467b97STreehugger Robotprivate 900*16467b97STreehugger Robot 901*16467b97STreehugger Robot def initialize_dfas 902*16467b97STreehugger Robot # do nothing 903*16467b97STreehugger Robot end 904*16467b97STreehugger Robotend 905*16467b97STreehugger Robot 906*16467b97STreehugger 
Robot 907*16467b97STreehugger Robot# constant alias for compatibility with older versions of the 908*16467b97STreehugger Robot# runtime library 909*16467b97STreehugger RobotBaseRecognizer = Recognizer 910*16467b97STreehugger Robot 911*16467b97STreehugger Robot=begin rdoc ANTLR3::Lexer 912*16467b97STreehugger Robot 913*16467b97STreehugger Robot= Lexer 914*16467b97STreehugger Robot 915*16467b97STreehugger RobotLexer is the default superclass of all lexers generated by ANTLR. The class 916*16467b97STreehugger Robottailors the core functionality provided by Recognizer to the task of 917*16467b97STreehugger Robotmatching patterns in the text input and breaking the input into tokens. 918*16467b97STreehugger Robot 919*16467b97STreehugger Robot== About Lexers 920*16467b97STreehugger Robot 921*16467b97STreehugger RobotA lexer's job is to take input text and break it up into _tokens_ -- objects 922*16467b97STreehugger Robotthat encapsulate a piece of text, a type label (such as ID or INTEGER), and the 923*16467b97STreehugger Robotposition of the text with respect to the input. Thus, a lexer is essentially a 924*16467b97STreehugger Robotcomplicated iterator that steps through an input stream and produces a sequence 925*16467b97STreehugger Robotof tokens. Sometimes lexers are enough to carry out a goal on their own, such as 926*16467b97STreehugger Robottasks like source code highlighting and simple code analysis. Usually, however, 927*16467b97STreehugger Robotthe lexer converts text into tokens for use by a parser, which recognizes larger 928*16467b97STreehugger Robotstructures within the text. 929*16467b97STreehugger Robot 930*16467b97STreehugger RobotANTLR parsers have a variety of entry points specified by parser rules, each of 931*16467b97STreehugger Robotwhich defines the structure of a specific type of sentence in a grammar. Lexers, 932*16467b97STreehugger Robothowever, are primarily intended to have a single entry point. 
It looks at the 933*16467b97STreehugger Robotcharacters starting at the current input position, decides if the chunk of text 934*16467b97STreehugger Robotmatches one of a number of possible token type definitions, wraps the chunk into 935*16467b97STreehugger Robota token with information on its type and location, and advances the input stream 936*16467b97STreehugger Robotto the next place. 937*16467b97STreehugger Robot 938*16467b97STreehugger Robot== ANTLR Lexers and the Lexer API 939*16467b97STreehugger Robot 940*16467b97STreehugger RobotANTLR-generated lexers will subclass this class, unless specified otherwise 941*16467b97STreehugger Robotwithin a grammar file. The generated class will provide an implementation of 942*16467b97STreehugger Roboteach lexer rule as a method of the same name. The subclass will also provide an 943*16467b97STreehugger Robotimplementation for the abstract method #m_tokens, the purpose of which is to 944*16467b97STreehugger Robotmultiplex the token type definitions and predict what rule definition to execute 945*16467b97STreehugger Robotto fetch a token. The primary method in the lexer API, #next_token, uses 946*16467b97STreehugger Robot#m_tokens to fetch the next token and drive the iteration. 947*16467b97STreehugger Robot 948*16467b97STreehugger RobotIf the lexer is preparing tokens for use by an ANTLR generated parser, the lexer 949*16467b97STreehugger Robotwill generally be used to build a TokenStream object. The following code example 950*16467b97STreehugger Robotdemonstrates the typical setup for using ANTLR parsers and lexers in Ruby. 951*16467b97STreehugger Robot 952*16467b97STreehugger Robot # in HypotheticalLexer.rb 953*16467b97STreehugger Robot module Hypothetical 954*16467b97STreehugger Robot class Lexer < ANTLR3::Lexer 955*16467b97STreehugger Robot # ... 956*16467b97STreehugger Robot # ANTLR generated code 957*16467b97STreehugger Robot # ... 
958*16467b97STreehugger Robot end 959*16467b97STreehugger Robot end 960*16467b97STreehugger Robot 961*16467b97STreehugger Robot # in HypotheticalParser.rb 962*16467b97STreehugger Robot module Hypothetical 963*16467b97STreehugger Robot class Parser < ANTLR3::Parser 964*16467b97STreehugger Robot # ... 965*16467b97STreehugger Robot # more ANTLR generated code 966*16467b97STreehugger Robot # ... 967*16467b97STreehugger Robot end 968*16467b97STreehugger Robot end 969*16467b97STreehugger Robot 970*16467b97STreehugger Robot # to take hypothetical source code and prepare it for parsing, 971*16467b97STreehugger Robot # there is generally a four-step construction process 972*16467b97STreehugger Robot 973*16467b97STreehugger Robot source = "some hypothetical source code" 974*16467b97STreehugger Robot input = ANTLR3::StringStream.new(source, :file => 'blah-de-blah.hyp') 975*16467b97STreehugger Robot lexer = Hypothetical::Lexer.new( input ) 976*16467b97STreehugger Robot tokens = ANTLR3::CommonTokenStream.new( lexer ) 977*16467b97STreehugger Robot parser = Hypothetical::Parser.new( tokens ) 978*16467b97STreehugger Robot 979*16467b97STreehugger Robot # if you're using the standard streams, ANTLR3::StringStream and 980*16467b97STreehugger Robot # ANTLR3::CommonTokenStream, you can write the same process 981*16467b97STreehugger Robot # shown above more succinctly: 982*16467b97STreehugger Robot 983*16467b97STreehugger Robot lexer = Hypothetical::Lexer.new("some hypothetical source code", :file => 'blah-de-blah.hyp') 984*16467b97STreehugger Robot parser = Hypothetical::Parser.new( lexer ) 985*16467b97STreehugger Robot 986*16467b97STreehugger Robot=end 987*16467b97STreehugger Robotclass Lexer < Recognizer 988*16467b97STreehugger Robot include TokenSource 989*16467b97STreehugger Robot @token_class = CommonToken 990*16467b97STreehugger Robot 991*16467b97STreehugger Robot def self.default_rule 992*16467b97STreehugger Robot @default_rule ||= :token! 
993*16467b97STreehugger Robot end 994*16467b97STreehugger Robot 995*16467b97STreehugger Robot def self.main( argv = ARGV, options = {} ) 996*16467b97STreehugger Robot if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end 997*16467b97STreehugger Robot main = ANTLR3::Main::LexerMain.new( self, options ) 998*16467b97STreehugger Robot block_given? ? yield( main ) : main.execute( argv ) 999*16467b97STreehugger Robot end 1000*16467b97STreehugger Robot 1001*16467b97STreehugger Robot def self.associated_parser 1002*16467b97STreehugger Robot @associated_parser ||= begin 1003*16467b97STreehugger Robot @grammar_home and @grammar_home::Parser 1004*16467b97STreehugger Robot rescue NameError 1005*16467b97STreehugger Robot grammar_name = @grammar_home.name.split( "::" ).last 1006*16467b97STreehugger Robot begin 1007*16467b97STreehugger Robot require "#{ grammar_name }Parser" 1008*16467b97STreehugger Robot @grammar_home::Parser 1009*16467b97STreehugger Robot rescue LoadError, NameError 1010*16467b97STreehugger Robot end 1011*16467b97STreehugger Robot end 1012*16467b97STreehugger Robot end 1013*16467b97STreehugger Robot 1014*16467b97STreehugger Robot def initialize( input, options = {} ) 1015*16467b97STreehugger Robot super( options ) 1016*16467b97STreehugger Robot @input = cast_input( input, options ) 1017*16467b97STreehugger Robot end 1018*16467b97STreehugger Robot 1019*16467b97STreehugger Robot def current_symbol 1020*16467b97STreehugger Robot nil 1021*16467b97STreehugger Robot end 1022*16467b97STreehugger Robot 1023*16467b97STreehugger Robot def next_token 1024*16467b97STreehugger Robot loop do 1025*16467b97STreehugger Robot @state.token = nil 1026*16467b97STreehugger Robot @state.channel = DEFAULT_CHANNEL 1027*16467b97STreehugger Robot @state.token_start_position = @input.index 1028*16467b97STreehugger Robot @state.token_start_column = @input.column 1029*16467b97STreehugger Robot @state.token_start_line = @input.line 1030*16467b97STreehugger Robot @state.text = nil 
1031*16467b97STreehugger Robot @input.peek == EOF and return EOF_TOKEN 1032*16467b97STreehugger Robot begin 1033*16467b97STreehugger Robot token! 1034*16467b97STreehugger Robot 1035*16467b97STreehugger Robot case token = @state.token 1036*16467b97STreehugger Robot when nil then return( emit ) 1037*16467b97STreehugger Robot when SKIP_TOKEN then next 1038*16467b97STreehugger Robot else 1039*16467b97STreehugger Robot return token 1040*16467b97STreehugger Robot end 1041*16467b97STreehugger Robot rescue NoViableAlternative => re 1042*16467b97STreehugger Robot report_error( re ) 1043*16467b97STreehugger Robot recover( re ) 1044*16467b97STreehugger Robot rescue Error::RecognitionError => re 1045*16467b97STreehugger Robot report_error( re ) 1046*16467b97STreehugger Robot end 1047*16467b97STreehugger Robot end 1048*16467b97STreehugger Robot end 1049*16467b97STreehugger Robot 1050*16467b97STreehugger Robot def skip 1051*16467b97STreehugger Robot @state.token = SKIP_TOKEN 1052*16467b97STreehugger Robot end 1053*16467b97STreehugger Robot 1054*16467b97STreehugger Robot abstract :token! 
1055*16467b97STreehugger Robot 1056*16467b97STreehugger Robot def exhaust 1057*16467b97STreehugger Robot self.to_a 1058*16467b97STreehugger Robot end 1059*16467b97STreehugger Robot 1060*16467b97STreehugger Robot def char_stream=( input ) 1061*16467b97STreehugger Robot @input = nil 1062*16467b97STreehugger Robot reset() 1063*16467b97STreehugger Robot @input = input 1064*16467b97STreehugger Robot end 1065*16467b97STreehugger Robot 1066*16467b97STreehugger Robot def source_name 1067*16467b97STreehugger Robot @input.source_name 1068*16467b97STreehugger Robot end 1069*16467b97STreehugger Robot 1070*16467b97STreehugger Robot def emit( token = @state.token ) 1071*16467b97STreehugger Robot token ||= create_token 1072*16467b97STreehugger Robot @state.token = token 1073*16467b97STreehugger Robot return token 1074*16467b97STreehugger Robot end 1075*16467b97STreehugger Robot 1076*16467b97STreehugger Robot def match( expected ) 1077*16467b97STreehugger Robot case expected 1078*16467b97STreehugger Robot when String 1079*16467b97STreehugger Robot expected.each_byte do |char| 1080*16467b97STreehugger Robot unless @input.peek == char 1081*16467b97STreehugger Robot @state.backtracking > 0 and raise BacktrackingFailed 1082*16467b97STreehugger Robot error = MismatchedToken( char ) 1083*16467b97STreehugger Robot recover( error ) 1084*16467b97STreehugger Robot raise error 1085*16467b97STreehugger Robot end 1086*16467b97STreehugger Robot @input.consume() 1087*16467b97STreehugger Robot end 1088*16467b97STreehugger Robot else # single integer character 1089*16467b97STreehugger Robot unless @input.peek == expected 1090*16467b97STreehugger Robot @state.backtracking > 0 and raise BacktrackingFailed 1091*16467b97STreehugger Robot error = MismatchedToken( expected ) 1092*16467b97STreehugger Robot recover( error ) 1093*16467b97STreehugger Robot raise error 1094*16467b97STreehugger Robot end 1095*16467b97STreehugger Robot @input.consume 1096*16467b97STreehugger Robot end 1097*16467b97STreehugger 
Robot return true 1098*16467b97STreehugger Robot end 1099*16467b97STreehugger Robot 1100*16467b97STreehugger Robot def match_any 1101*16467b97STreehugger Robot @input.consume 1102*16467b97STreehugger Robot end 1103*16467b97STreehugger Robot 1104*16467b97STreehugger Robot def match_range( min, max ) 1105*16467b97STreehugger Robot char = @input.peek 1106*16467b97STreehugger Robot if char.between?( min, max ) then @input.consume 1107*16467b97STreehugger Robot else 1108*16467b97STreehugger Robot @state.backtracking > 0 and raise BacktrackingFailed 1109*16467b97STreehugger Robot error = MismatchedRange( min.chr, max.chr ) 1110*16467b97STreehugger Robot recover( error ) 1111*16467b97STreehugger Robot raise( error ) 1112*16467b97STreehugger Robot end 1113*16467b97STreehugger Robot return true 1114*16467b97STreehugger Robot end 1115*16467b97STreehugger Robot 1116*16467b97STreehugger Robot def line 1117*16467b97STreehugger Robot @input.line 1118*16467b97STreehugger Robot end 1119*16467b97STreehugger Robot 1120*16467b97STreehugger Robot def column 1121*16467b97STreehugger Robot @input.column 1122*16467b97STreehugger Robot end 1123*16467b97STreehugger Robot 1124*16467b97STreehugger Robot def character_index 1125*16467b97STreehugger Robot @input.index 1126*16467b97STreehugger Robot end 1127*16467b97STreehugger Robot 1128*16467b97STreehugger Robot def text 1129*16467b97STreehugger Robot @state.text and return @state.text 1130*16467b97STreehugger Robot @input.substring( @state.token_start_position, character_index - 1 ) 1131*16467b97STreehugger Robot end 1132*16467b97STreehugger Robot 1133*16467b97STreehugger Robot def text=( text ) 1134*16467b97STreehugger Robot @state.text = text 1135*16467b97STreehugger Robot end 1136*16467b97STreehugger Robot 1137*16467b97STreehugger Robot def report_error( e ) 1138*16467b97STreehugger Robot display_recognition_error( e ) 1139*16467b97STreehugger Robot end 1140*16467b97STreehugger Robot 1141*16467b97STreehugger Robot def error_message( e ) 
1142*16467b97STreehugger Robot char = character_error_display( e.symbol ) rescue nil 1143*16467b97STreehugger Robot case e 1144*16467b97STreehugger Robot when Error::MismatchedToken 1145*16467b97STreehugger Robot expecting = character_error_display( e.expecting ) 1146*16467b97STreehugger Robot "mismatched character #{ char }; expecting #{ expecting }" 1147*16467b97STreehugger Robot when Error::NoViableAlternative 1148*16467b97STreehugger Robot "no viable alternative at character #{ char }" 1149*16467b97STreehugger Robot when Error::EarlyExit 1150*16467b97STreehugger Robot "required ( ... )+ loop did not match anything at character #{ char }" 1151*16467b97STreehugger Robot when Error::MismatchedNotSet 1152*16467b97STreehugger Robot "mismatched character %s; expecting set %p" % [ char, e.expecting ] 1153*16467b97STreehugger Robot when Error::MismatchedSet 1154*16467b97STreehugger Robot "mismatched character %s; expecting set %p" % [ char, e.expecting ] 1155*16467b97STreehugger Robot when Error::MismatchedRange 1156*16467b97STreehugger Robot a = character_error_display( e.min ) 1157*16467b97STreehugger Robot b = character_error_display( e.max ) 1158*16467b97STreehugger Robot "mismatched character %s; expecting set %s..%s" % [ char, a, b ] 1159*16467b97STreehugger Robot else super 1160*16467b97STreehugger Robot end 1161*16467b97STreehugger Robot end 1162*16467b97STreehugger Robot 1163*16467b97STreehugger Robot def character_error_display( char ) 1164*16467b97STreehugger Robot case char 1165*16467b97STreehugger Robot when EOF then '<EOF>' 1166*16467b97STreehugger Robot when Integer then char.chr.inspect 1167*16467b97STreehugger Robot else char.inspect 1168*16467b97STreehugger Robot end 1169*16467b97STreehugger Robot end 1170*16467b97STreehugger Robot 1171*16467b97STreehugger Robot def recover( re ) 1172*16467b97STreehugger Robot @input.consume 1173*16467b97STreehugger Robot end 1174*16467b97STreehugger Robot 1175*16467b97STreehugger Robot alias input= char_stream= 
1176*16467b97STreehugger Robot 1177*16467b97STreehugger Robotprivate 1178*16467b97STreehugger Robot 1179*16467b97STreehugger Robot def cast_input( input, options ) 1180*16467b97STreehugger Robot case input 1181*16467b97STreehugger Robot when CharacterStream then input 1182*16467b97STreehugger Robot when ::String then StringStream.new( input, options ) 1183*16467b97STreehugger Robot when ::IO, ARGF then FileStream.new( input, options ) 1184*16467b97STreehugger Robot else input 1185*16467b97STreehugger Robot end 1186*16467b97STreehugger Robot end 1187*16467b97STreehugger Robot 1188*16467b97STreehugger Robot def trace_in( rule_name, rule_index ) 1189*16467b97STreehugger Robot if symbol = @input.look and symbol != EOF then symbol = symbol.inspect 1190*16467b97STreehugger Robot else symbol = '<EOF>' end 1191*16467b97STreehugger Robot input_symbol = "#{ symbol } @ line #{ line } / col #{ column }" 1192*16467b97STreehugger Robot super( rule_name, rule_index, input_symbol ) 1193*16467b97STreehugger Robot end 1194*16467b97STreehugger Robot 1195*16467b97STreehugger Robot def trace_out( rule_name, rule_index ) 1196*16467b97STreehugger Robot if symbol = @input.look and symbol != EOF then symbol = symbol.inspect 1197*16467b97STreehugger Robot else symbol = '<EOF>' end 1198*16467b97STreehugger Robot input_symbol = "#{ symbol } @ line #{ line } / col #{ column }" 1199*16467b97STreehugger Robot super( rule_name, rule_index, input_symbol ) 1200*16467b97STreehugger Robot end 1201*16467b97STreehugger Robot 1202*16467b97STreehugger Robot def create_token( &b ) 1203*16467b97STreehugger Robot if block_given? 
then super( &b ) 1204*16467b97STreehugger Robot else 1205*16467b97STreehugger Robot super do |t| 1206*16467b97STreehugger Robot t.input = @input 1207*16467b97STreehugger Robot t.type = @state.type 1208*16467b97STreehugger Robot t.channel = @state.channel 1209*16467b97STreehugger Robot t.start = @state.token_start_position 1210*16467b97STreehugger Robot t.stop = @input.index - 1 1211*16467b97STreehugger Robot t.line = @state.token_start_line 1212*16467b97STreehugger Robot t.text = self.text 1213*16467b97STreehugger Robot t.column = @state.token_start_column 1214*16467b97STreehugger Robot end 1215*16467b97STreehugger Robot end 1216*16467b97STreehugger Robot end 1217*16467b97STreehugger Robotend 1218*16467b97STreehugger Robot 1219*16467b97STreehugger Robot 1220*16467b97STreehugger Robot=begin rdoc ANTLR3::Parser 1221*16467b97STreehugger Robot 1222*16467b97STreehugger Robot= Parser 1223*16467b97STreehugger Robot 1224*16467b97STreehugger RobotParser is the default base class of ANTLR-generated parser classes. The class 1225*16467b97STreehugger Robottailors the functionality provided by Recognizer to the task of parsing. 1226*16467b97STreehugger Robot 1227*16467b97STreehugger Robot== About Parsing 1228*16467b97STreehugger Robot 1229*16467b97STreehugger RobotThis is just a lose overview of parsing. For considerably more in-depth coverage 1230*16467b97STreehugger Robotof the topic, read the ANTLR documentation or check out the ANTLR website 1231*16467b97STreehugger Robot(http://www.antlr.org). 1232*16467b97STreehugger Robot 1233*16467b97STreehugger RobotA grammar defines the vocabulary and the sentence structure of a language. While 1234*16467b97STreehugger Robota lexer concerns the basic vocabulary symbols of the language, a parser's 1235*16467b97STreehugger Robotprimary task is to implement the sentence structure. 
1236*16467b97STreehugger Robot 1237*16467b97STreehugger RobotParsers are set up by providing a stream of tokens, which is usually created by 1238*16467b97STreehugger Robota corresponding lexer. Then, the user requests a specific sentence-structure 1239*16467b97STreehugger Robotwithin the grammar, such as "class_definition" or "xml_node", from the parser. 1240*16467b97STreehugger RobotIt iterates through the tokens, verifying the syntax of the sentence and 1241*16467b97STreehugger Robotperforming actions specified by the grammar. It stops when it encounters an 1242*16467b97STreehugger Roboterror or when it has matched the full sentence according to its defined 1243*16467b97STreehugger Robotstructure. 1244*16467b97STreehugger Robot 1245*16467b97STreehugger Robot== ANTLR Parsers and the Parser API 1246*16467b97STreehugger Robot 1247*16467b97STreehugger RobotPlain ANTLR-generated parsers directly subclass this class, unless specified 1248*16467b97STreehugger Robototherwise within the grammar options. The generated code will provide a method 1249*16467b97STreehugger Robotfor each parser rule defined in the ANTLR grammar, as well as any other 1250*16467b97STreehugger Robotcustomized member attributes and methods specified in the source grammar. 1251*16467b97STreehugger Robot 1252*16467b97STreehugger RobotThis class does not override much of the functionality in Recognizer, and 1253*16467b97STreehugger Robotthus the API closely mirrors Recognizer. 1254*16467b97STreehugger Robot 1255*16467b97STreehugger Robot=end 1256*16467b97STreehugger Robotclass Parser < Recognizer 1257*16467b97STreehugger Robot def self.main( argv = ARGV, options = {} ) 1258*16467b97STreehugger Robot if argv.is_a?( ::Hash ) then argv, options = ARGV, argv end 1259*16467b97STreehugger Robot main = ANTLR3::Main::ParserMain.new( self, options ) 1260*16467b97STreehugger Robot block_given? ? 
yield( main ) : main.execute( argv ) 1261*16467b97STreehugger Robot end 1262*16467b97STreehugger Robot 1263*16467b97STreehugger Robot def self.associated_lexer 1264*16467b97STreehugger Robot @associated_lexer ||= begin 1265*16467b97STreehugger Robot @grammar_home and @grammar_home::Lexer 1266*16467b97STreehugger Robot rescue NameError 1267*16467b97STreehugger Robot grammar_name = @grammar_home.name.split( "::" ).last 1268*16467b97STreehugger Robot begin 1269*16467b97STreehugger Robot require "#{ grammar_name }Lexer" 1270*16467b97STreehugger Robot @grammar_home::Lexer 1271*16467b97STreehugger Robot rescue LoadError, NameError 1272*16467b97STreehugger Robot end 1273*16467b97STreehugger Robot end 1274*16467b97STreehugger Robot end 1275*16467b97STreehugger Robot 1276*16467b97STreehugger Robot 1277*16467b97STreehugger Robot def initialize( input, options = {} ) 1278*16467b97STreehugger Robot super( options ) 1279*16467b97STreehugger Robot @input = nil 1280*16467b97STreehugger Robot reset 1281*16467b97STreehugger Robot @input = cast_input( input, options ) 1282*16467b97STreehugger Robot end 1283*16467b97STreehugger Robot 1284*16467b97STreehugger Robot def missing_symbol( error, expected_type, follow ) 1285*16467b97STreehugger Robot current = @input.look 1286*16467b97STreehugger Robot current = @input.look( -1 ) if current == ANTLR3::EOF_TOKEN 1287*16467b97STreehugger Robot t = 1288*16467b97STreehugger Robot case 1289*16467b97STreehugger Robot when current && current != ANTLR3::EOF_TOKEN then current.clone 1290*16467b97STreehugger Robot when @input.token_class then @input.token_class.new 1291*16467b97STreehugger Robot else ( create_token rescue CommonToken.new ) 1292*16467b97STreehugger Robot end 1293*16467b97STreehugger Robot 1294*16467b97STreehugger Robot t.type = expected_type 1295*16467b97STreehugger Robot name = t.name.gsub( /(^<)|(>$)/,'' ) 1296*16467b97STreehugger Robot t.text = "<missing #{ name }>" 1297*16467b97STreehugger Robot t.channel = DEFAULT_CHANNEL 
1298*16467b97STreehugger Robot return( t ) 1299*16467b97STreehugger Robot end 1300*16467b97STreehugger Robot 1301*16467b97STreehugger Robot def token_stream=( input ) 1302*16467b97STreehugger Robot @input = nil 1303*16467b97STreehugger Robot reset 1304*16467b97STreehugger Robot @input = input 1305*16467b97STreehugger Robot end 1306*16467b97STreehugger Robot alias token_stream input 1307*16467b97STreehugger Robot 1308*16467b97STreehugger Robot def source_name 1309*16467b97STreehugger Robot @input.source_name 1310*16467b97STreehugger Robot end 1311*16467b97STreehugger Robot 1312*16467b97STreehugger Robot 1313*16467b97STreehugger Robotprivate 1314*16467b97STreehugger Robot 1315*16467b97STreehugger Robot def trace_in( rule_name, rule_index ) 1316*16467b97STreehugger Robot super( rule_name, rule_index, @input.look.inspect ) 1317*16467b97STreehugger Robot end 1318*16467b97STreehugger Robot 1319*16467b97STreehugger Robot def trace_out( rule_name, rule_index ) 1320*16467b97STreehugger Robot super( rule_name, rule_index, @input.look.inspect ) 1321*16467b97STreehugger Robot end 1322*16467b97STreehugger Robot 1323*16467b97STreehugger Robot def cast_input( input, options ) 1324*16467b97STreehugger Robot case input 1325*16467b97STreehugger Robot when TokenStream then input 1326*16467b97STreehugger Robot when TokenSource then CommonTokenStream.new( input, options ) 1327*16467b97STreehugger Robot when IO, String, CharacterStream 1328*16467b97STreehugger Robot if lexer_class = self.class.associated_lexer 1329*16467b97STreehugger Robot CommonTokenStream.new( lexer_class.new( input, options ), options ) 1330*16467b97STreehugger Robot else 1331*16467b97STreehugger Robot raise ArgumentError, Util.tidy( <<-END, true ) 1332*16467b97STreehugger Robot | unable to automatically convert input #{ input.inspect } 1333*16467b97STreehugger Robot | to a ANTLR3::TokenStream object as #{ self.class } 1334*16467b97STreehugger Robot | does not appear to have an associated lexer class 
1335*16467b97STreehugger Robot END 1336*16467b97STreehugger Robot end 1337*16467b97STreehugger Robot else 1338*16467b97STreehugger Robot # assume it's a stream if it at least implements peek and consume 1339*16467b97STreehugger Robot unless input.respond_to?( :peek ) and input.respond_to?( :consume ) 1340*16467b97STreehugger Robot raise ArgumentError, Util.tidy( <<-END, true ) 1341*16467b97STreehugger Robot | #{ self.class } requires a token stream as input, but 1342*16467b97STreehugger Robot | #{ input.inspect } was provided 1343*16467b97STreehugger Robot END 1344*16467b97STreehugger Robot end 1345*16467b97STreehugger Robot input 1346*16467b97STreehugger Robot end 1347*16467b97STreehugger Robot end 1348*16467b97STreehugger Robot 1349*16467b97STreehugger Robotend 1350*16467b97STreehugger Robot 1351*16467b97STreehugger Robotend 1352