1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * While the C runtime does not need to model the state of 3*16467b97STreehugger Robot * multiple lexers and parsers in the same way as the Java runtime does 4*16467b97STreehugger Robot * it is no overhead to reflect that model. In fact the 5*16467b97STreehugger Robot * C runtime has always been able to share recognizer state. 6*16467b97STreehugger Robot * 7*16467b97STreehugger Robot * This 'class' therefore defines all the elements of a recognizer 8*16467b97STreehugger Robot * (either lexer, parser or tree parser) that are need to 9*16467b97STreehugger Robot * track the current recognition state. Multiple recognizers 10*16467b97STreehugger Robot * may then share this state, for instance when one grammar 11*16467b97STreehugger Robot * imports another. 12*16467b97STreehugger Robot */ 13*16467b97STreehugger Robot 14*16467b97STreehugger Robot #ifndef _ANTLR3_RECOGNIZER_SHARED_STATE_HPP 15*16467b97STreehugger Robot #define _ANTLR3_RECOGNIZER_SHARED_STATE_HPP 16*16467b97STreehugger Robot 17*16467b97STreehugger Robot // [The "BSD licence"] 18*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 19*16467b97STreehugger Robot 20*16467b97STreehugger Robot // 21*16467b97STreehugger Robot // All rights reserved. 22*16467b97STreehugger Robot // 23*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 24*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 25*16467b97STreehugger Robot // are met: 26*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 27*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 28*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 29*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 30*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 31*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 32*16467b97STreehugger Robot // derived from this software without specific prior written permission. 33*16467b97STreehugger Robot // 34*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 35*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 36*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 37*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 38*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 39*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 40*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 41*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 42*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 43*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 44*16467b97STreehugger Robot 45*16467b97STreehugger Robot #include "antlr3defs.hpp" 46*16467b97STreehugger Robot 47*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE() 48*16467b97STreehugger Robot 49*16467b97STreehugger Robot /** All the data elements required to track the current state 50*16467b97STreehugger Robot * of any recognizer (lexer, parser, tree parser). 51*16467b97STreehugger Robot * May be share between multiple recognizers such that 52*16467b97STreehugger Robot * grammar inheritance is easily supported. 53*16467b97STreehugger Robot */ 54*16467b97STreehugger Robot template<class ImplTraits, class StreamType> 55*16467b97STreehugger Robot class RecognizerSharedState : public ImplTraits::AllocPolicyType 56*16467b97STreehugger Robot { 57*16467b97STreehugger Robot public: 58*16467b97STreehugger Robot typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 59*16467b97STreehugger Robot typedef typename StreamType::UnitType TokenType; 60*16467b97STreehugger Robot typedef typename ImplTraits::CommonTokenType CommonTokenType; 61*16467b97STreehugger Robot 62*16467b97STreehugger Robot typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType ComponentType; 63*16467b97STreehugger Robot typedef typename ImplTraits::template RewriteStreamType< ComponentType > RewriteStreamType; 64*16467b97STreehugger Robot typedef typename ImplTraits::StringType StringType; 65*16467b97STreehugger Robot typedef typename ImplTraits::TokenSourceType TokenSourceType; 66*16467b97STreehugger Robot typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType; 67*16467b97STreehugger Robot typedef typename ImplTraits::BitsetType BitsetType; 68*16467b97STreehugger Robot typedef typename ImplTraits::BitsetListType BitsetListType; 69*16467b97STreehugger Robot 70*16467b97STreehugger Robot typedef typename AllocPolicyType::template StackType< BitsetListType > FollowingType; 71*16467b97STreehugger Robot typedef typename AllocPolicyType::template StackType< typename ImplTraits::InputStreamType* > InputStreamsType; 72*16467b97STreehugger Robot typedef InputStreamsType StreamsType; 73*16467b97STreehugger Robot typedef typename AllocPolicyType::template VectorType<RewriteStreamType> RewriteStreamsType; 74*16467b97STreehugger Robot 75*16467b97STreehugger Robot typedef IntTrie<ImplTraits, ANTLR_MARKER> RuleListType; 76*16467b97STreehugger Robot typedef IntTrie<ImplTraits, RuleListType*> RuleMemoType; 77*16467b97STreehugger Robot 78*16467b97STreehugger Robot private: 79*16467b97STreehugger Robot /** Points to the first in a possible chain of exceptions that the 80*16467b97STreehugger Robot * recognizer has discovered. 81*16467b97STreehugger Robot */ 82*16467b97STreehugger Robot ExceptionBaseType* m_exception; 83*16467b97STreehugger Robot 84*16467b97STreehugger Robot 85*16467b97STreehugger Robot /** Track the set of token types that can follow any rule invocation. 86*16467b97STreehugger Robot * Stack structure, to support: List<BitSet>. 87*16467b97STreehugger Robot */ 88*16467b97STreehugger Robot FollowingType m_following; 89*16467b97STreehugger Robot 90*16467b97STreehugger Robot /** Track around a hint from the creator of the recognizer as to how big this 91*16467b97STreehugger Robot * thing is going to get, as the actress said to the bishop. This allows us 92*16467b97STreehugger Robot * to tune hash tables accordingly. This might not be the best place for this 93*16467b97STreehugger Robot * in the end but we will see. 94*16467b97STreehugger Robot */ 95*16467b97STreehugger Robot ANTLR_UINT32 m_sizeHint; 96*16467b97STreehugger Robot 97*16467b97STreehugger Robot 98*16467b97STreehugger Robot /** If set to true then the recognizer has an exception 99*16467b97STreehugger Robot * condition (this is tested by the generated code for the rules of 100*16467b97STreehugger Robot * the grammar). 101*16467b97STreehugger Robot */ 102*16467b97STreehugger Robot bool m_error; 103*16467b97STreehugger Robot 104*16467b97STreehugger Robot 105*16467b97STreehugger Robot /** This is true when we see an error and before having successfully 106*16467b97STreehugger Robot * matched a token. Prevents generation of more than one error message 107*16467b97STreehugger Robot * per error. 108*16467b97STreehugger Robot */ 109*16467b97STreehugger Robot bool m_errorRecovery; 110*16467b97STreehugger Robot 111*16467b97STreehugger Robot /** In lieu of a return value, this indicates that a rule or token 112*16467b97STreehugger Robot * has failed to match. Reset to false upon valid token match. 113*16467b97STreehugger Robot */ 114*16467b97STreehugger Robot bool m_failed; 115*16467b97STreehugger Robot 116*16467b97STreehugger Robot /* 117*16467b97STreehugger Robot Instead of allocating CommonTokenType, we do it in the stack. hence we need a null indicator 118*16467b97STreehugger Robot */ 119*16467b97STreehugger Robot bool m_token_present; 120*16467b97STreehugger Robot 121*16467b97STreehugger Robot /** The index into the input stream where the last error occurred. 122*16467b97STreehugger Robot * This is used to prevent infinite loops where an error is found 123*16467b97STreehugger Robot * but no token is consumed during recovery...another error is found, 124*16467b97STreehugger Robot * ad nauseam. This is a failsafe mechanism to guarantee that at least 125*16467b97STreehugger Robot * one token/tree node is consumed for two errors. 126*16467b97STreehugger Robot */ 127*16467b97STreehugger Robot ANTLR_MARKER m_lastErrorIndex; 128*16467b97STreehugger Robot 129*16467b97STreehugger Robot /** When the recognizer terminates, the error handling functions 130*16467b97STreehugger Robot * will have incremented this value if any error occurred (that was displayed). It can then be 131*16467b97STreehugger Robot * used by the grammar programmer without having to use static globals. 132*16467b97STreehugger Robot */ 133*16467b97STreehugger Robot ANTLR_UINT32 m_errorCount; 134*16467b97STreehugger Robot 135*16467b97STreehugger Robot /** If 0, no backtracking is going on. Safe to exec actions etc... 136*16467b97STreehugger Robot * If >0 then it's the level of backtracking. 137*16467b97STreehugger Robot */ 138*16467b97STreehugger Robot ANTLR_INT32 m_backtracking; 139*16467b97STreehugger Robot 140*16467b97STreehugger Robot /** ANTLR3_VECTOR of ANTLR3_LIST for rule memoizing. 141*16467b97STreehugger Robot * Tracks the stop token index for each rule. ruleMemo[ruleIndex] is 142*16467b97STreehugger Robot * the memoization table for ruleIndex. For key ruleStartIndex, you 143*16467b97STreehugger Robot * get back the stop token for associated rule or MEMO_RULE_FAILED. 144*16467b97STreehugger Robot * 145*16467b97STreehugger Robot * This is only used if rule memoization is on. 146*16467b97STreehugger Robot */ 147*16467b97STreehugger Robot RuleMemoType* m_ruleMemo; 148*16467b97STreehugger Robot 149*16467b97STreehugger Robot /** Pointer to an array of token names 150*16467b97STreehugger Robot * that are generally useful in error reporting. The generated parsers install 151*16467b97STreehugger Robot * this pointer. The table it points to is statically allocated as 8 bit ascii 152*16467b97STreehugger Robot * at parser compile time - grammar token names are thus restricted in character 153*16467b97STreehugger Robot * sets, which does not seem to terrible. 154*16467b97STreehugger Robot */ 155*16467b97STreehugger Robot ANTLR_UINT8** m_tokenNames; 156*16467b97STreehugger Robot 157*16467b97STreehugger Robot /** The goal of all lexer rules/methods is to create a token object. 158*16467b97STreehugger Robot * This is an instance variable as multiple rules may collaborate to 159*16467b97STreehugger Robot * create a single token. For example, NUM : INT | FLOAT ; 160*16467b97STreehugger Robot * In this case, you want the INT or FLOAT rule to set token and not 161*16467b97STreehugger Robot * have it reset to a NUM token in rule NUM. 162*16467b97STreehugger Robot */ 163*16467b97STreehugger Robot CommonTokenType m_token; 164*16467b97STreehugger Robot 165*16467b97STreehugger Robot /** A lexer is a source of tokens, produced by all the generated (or 166*16467b97STreehugger Robot * hand crafted if you like) matching rules. As such it needs to provide 167*16467b97STreehugger Robot * a token source interface implementation. For others, this will become a empty class 168*16467b97STreehugger Robot */ 169*16467b97STreehugger Robot TokenSourceType* m_tokSource; 170*16467b97STreehugger Robot 171*16467b97STreehugger Robot /** The channel number for the current token 172*16467b97STreehugger Robot */ 173*16467b97STreehugger Robot ANTLR_UINT32 m_channel; 174*16467b97STreehugger Robot 175*16467b97STreehugger Robot /** The token type for the current token 176*16467b97STreehugger Robot */ 177*16467b97STreehugger Robot ANTLR_UINT32 m_type; 178*16467b97STreehugger Robot 179*16467b97STreehugger Robot /** The input line (where it makes sense) on which the first character of the current 180*16467b97STreehugger Robot * token resides. 181*16467b97STreehugger Robot */ 182*16467b97STreehugger Robot ANTLR_INT32 m_tokenStartLine; 183*16467b97STreehugger Robot 184*16467b97STreehugger Robot /** The character position of the first character of the current token 185*16467b97STreehugger Robot * within the line specified by tokenStartLine 186*16467b97STreehugger Robot */ 187*16467b97STreehugger Robot ANTLR_INT32 m_tokenStartCharPositionInLine; 188*16467b97STreehugger Robot 189*16467b97STreehugger Robot /** What character index in the stream did the current token start at? 190*16467b97STreehugger Robot * Needed, for example, to get the text for current token. Set at 191*16467b97STreehugger Robot * the start of nextToken. 192*16467b97STreehugger Robot */ 193*16467b97STreehugger Robot ANTLR_MARKER m_tokenStartCharIndex; 194*16467b97STreehugger Robot 195*16467b97STreehugger Robot /** Text for the current token. This can be overridden by setting this 196*16467b97STreehugger Robot * variable directly or by using the SETTEXT() macro (preferred) in your 197*16467b97STreehugger Robot * lexer rules. 198*16467b97STreehugger Robot */ 199*16467b97STreehugger Robot StringType m_text; 200*16467b97STreehugger Robot 201*16467b97STreehugger Robot /** Input stream stack, which allows the C programmer to switch input streams 202*16467b97STreehugger Robot * easily and allow the standard nextToken() implementation to deal with it 203*16467b97STreehugger Robot * as this is a common requirement. 204*16467b97STreehugger Robot */ 205*16467b97STreehugger Robot InputStreamsType m_streams; 206*16467b97STreehugger Robot 207*16467b97STreehugger Robot public: 208*16467b97STreehugger Robot RecognizerSharedState(); 209*16467b97STreehugger Robot ExceptionBaseType* get_exception() const; 210*16467b97STreehugger Robot FollowingType& get_following(); 211*16467b97STreehugger Robot ANTLR_UINT32 get_sizeHint() const; 212*16467b97STreehugger Robot bool get_error() const; 213*16467b97STreehugger Robot bool get_errorRecovery() const; 214*16467b97STreehugger Robot bool get_failed() const; 215*16467b97STreehugger Robot bool get_token_present() const; 216*16467b97STreehugger Robot ANTLR_MARKER get_lastErrorIndex() const; 217*16467b97STreehugger Robot ANTLR_UINT32 get_errorCount() const; 218*16467b97STreehugger Robot ANTLR_INT32 get_backtracking() const; 219*16467b97STreehugger Robot RuleMemoType* get_ruleMemo() const; 220*16467b97STreehugger Robot ANTLR_UINT8** get_tokenNames() const; 221*16467b97STreehugger Robot ANTLR_UINT8* get_tokenName( ANTLR_UINT32 i ) const; 222*16467b97STreehugger Robot CommonTokenType* get_token(); 223*16467b97STreehugger Robot TokenSourceType* get_tokSource() const; 224*16467b97STreehugger Robot ANTLR_UINT32& get_channel(); 225*16467b97STreehugger Robot ANTLR_UINT32 get_type() const; 226*16467b97STreehugger Robot ANTLR_INT32 get_tokenStartLine() const; 227*16467b97STreehugger Robot ANTLR_INT32 get_tokenStartCharPositionInLine() const; 228*16467b97STreehugger Robot ANTLR_MARKER get_tokenStartCharIndex() const; 229*16467b97STreehugger Robot StringType& get_text(); 230*16467b97STreehugger Robot InputStreamsType& get_streams(); 231*16467b97STreehugger Robot 232*16467b97STreehugger Robot void set_following( const FollowingType& following ); 233*16467b97STreehugger Robot void set_sizeHint( ANTLR_UINT32 sizeHint ); 234*16467b97STreehugger Robot void set_error( bool error ); 235*16467b97STreehugger Robot void set_errorRecovery( bool errorRecovery ); 236*16467b97STreehugger Robot void set_failed( bool failed ); 237*16467b97STreehugger Robot void set_token_present(bool token_present); 238*16467b97STreehugger Robot void set_lastErrorIndex( ANTLR_MARKER lastErrorIndex ); 239*16467b97STreehugger Robot void set_errorCount( ANTLR_UINT32 errorCount ); 240*16467b97STreehugger Robot void set_backtracking( ANTLR_INT32 backtracking ); 241*16467b97STreehugger Robot void set_ruleMemo( RuleMemoType* ruleMemo ); 242*16467b97STreehugger Robot void set_tokenNames( ANTLR_UINT8** tokenNames ); 243*16467b97STreehugger Robot void set_tokSource( TokenSourceType* tokSource ); 244*16467b97STreehugger Robot void set_channel( ANTLR_UINT32 channel ); 245*16467b97STreehugger Robot void set_exception( ExceptionBaseType* exception ); 246*16467b97STreehugger Robot void set_type( ANTLR_UINT32 type ); 247*16467b97STreehugger Robot void set_token( const CommonTokenType* tok); 248*16467b97STreehugger Robot void set_tokenStartLine( ANTLR_INT32 tokenStartLine ); 249*16467b97STreehugger Robot void set_tokenStartCharPositionInLine( ANTLR_INT32 tokenStartCharPositionInLine ); 250*16467b97STreehugger Robot void set_tokenStartCharIndex( ANTLR_MARKER tokenStartCharIndex ); 251*16467b97STreehugger Robot void set_text( const StringType& text ); 252*16467b97STreehugger Robot void set_streams( const InputStreamsType& streams ); 253*16467b97STreehugger Robot 254*16467b97STreehugger Robot void inc_errorCount(); 255*16467b97STreehugger Robot void inc_backtracking(); 256*16467b97STreehugger Robot void dec_backtracking(); 257*16467b97STreehugger Robot }; 258*16467b97STreehugger Robot 259*16467b97STreehugger Robot ANTLR_END_NAMESPACE() 260*16467b97STreehugger Robot 261*16467b97STreehugger Robot #include "antlr3recognizersharedstate.inl" 262*16467b97STreehugger Robot 263*16467b97STreehugger Robot #endif 264*16467b97STreehugger Robot 265*16467b97STreehugger Robot 266