1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * Defines the interface for an ANTLR3 common token stream. Custom token streams should create 3*16467b97STreehugger Robot * one of these and then override any functions by installing their own pointers 4*16467b97STreehugger Robot * to implement the various functions. 5*16467b97STreehugger Robot */ 6*16467b97STreehugger Robot #ifndef _ANTLR3_TOKENSTREAM_H 7*16467b97STreehugger Robot #define _ANTLR3_TOKENSTREAM_H 8*16467b97STreehugger Robot 9*16467b97STreehugger Robot // [The "BSD licence"] 10*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 11*16467b97STreehugger Robot // http://www.temporal-wave.com 12*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle 13*16467b97STreehugger Robot // 14*16467b97STreehugger Robot // All rights reserved. 15*16467b97STreehugger Robot // 16*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 17*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 18*16467b97STreehugger Robot // are met: 19*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 20*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 21*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 22*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 23*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 24*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 25*16467b97STreehugger Robot // derived from this software without specific prior written permission. 26*16467b97STreehugger Robot // 27*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 28*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 29*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 30*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 31*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 32*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 36*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37*16467b97STreehugger Robot 38*16467b97STreehugger Robot #include <antlr3defs.h> 39*16467b97STreehugger Robot #include <antlr3string.h> 40*16467b97STreehugger Robot #include <antlr3collections.h> 41*16467b97STreehugger Robot #include <antlr3input.h> 42*16467b97STreehugger Robot #include <antlr3commontoken.h> 43*16467b97STreehugger Robot #include <antlr3bitset.h> 44*16467b97STreehugger Robot #include <antlr3debugeventlistener.h> 45*16467b97STreehugger Robot 46*16467b97STreehugger Robot #ifdef __cplusplus 47*16467b97STreehugger Robot extern "C" { 48*16467b97STreehugger Robot #endif 49*16467b97STreehugger Robot 50*16467b97STreehugger Robot /** Definition of a token source, which has a pointer to a function that 51*16467b97STreehugger Robot * returns the next token (using a token factory if it is going to be 52*16467b97STreehugger Robot * efficient) and a pointer to an ANTLR3_INPUT_STREAM. This is slightly 53*16467b97STreehugger Robot * different to the Java interface because we have no way to implement 54*16467b97STreehugger Robot * multiple interfaces without defining them in the interface structure 55*16467b97STreehugger Robot * or casting (void *), which is too convoluted. 56*16467b97STreehugger Robot */ 57*16467b97STreehugger Robot typedef struct ANTLR3_TOKEN_SOURCE_struct 58*16467b97STreehugger Robot { 59*16467b97STreehugger Robot /** Pointer to a function that returns the next token in the stream. 60*16467b97STreehugger Robot */ 61*16467b97STreehugger Robot pANTLR3_COMMON_TOKEN (*nextToken)(struct ANTLR3_TOKEN_SOURCE_struct * tokenSource); 62*16467b97STreehugger Robot 63*16467b97STreehugger Robot /** Whoever is providing tokens, needs to provide a string factory too 64*16467b97STreehugger Robot */ 65*16467b97STreehugger Robot pANTLR3_STRING_FACTORY strFactory; 66*16467b97STreehugger Robot 67*16467b97STreehugger Robot /** A special pre-allocated token, which signifies End Of Tokens. Because this must 68*16467b97STreehugger Robot * be set up with the current input index and so on, we embed the structure and 69*16467b97STreehugger Robot * return the address of it. It is marked as factoryMade, so that it is never 70*16467b97STreehugger Robot * attempted to be freed. 71*16467b97STreehugger Robot */ 72*16467b97STreehugger Robot ANTLR3_COMMON_TOKEN eofToken; 73*16467b97STreehugger Robot 74*16467b97STreehugger Robot /// A special pre-allocated token, which is returned by mTokens() if the 75*16467b97STreehugger Robot /// lexer rule said to just skip the generated token altogether. 76*16467b97STreehugger Robot /// Having this single token stops us wasting memory by have the token factory 77*16467b97STreehugger Robot /// actually create something that we are going to SKIP(); anyway. 78*16467b97STreehugger Robot /// 79*16467b97STreehugger Robot ANTLR3_COMMON_TOKEN skipToken; 80*16467b97STreehugger Robot 81*16467b97STreehugger Robot /** Whatever is supplying the token source interface, needs a pointer to 82*16467b97STreehugger Robot * itself so that this pointer can be passed to it when the nextToken 83*16467b97STreehugger Robot * function is called. 84*16467b97STreehugger Robot */ 85*16467b97STreehugger Robot void * super; 86*16467b97STreehugger Robot 87*16467b97STreehugger Robot /** When the token source is constructed, it is populated with the file 88*16467b97STreehugger Robot * name from whence the tokens were produced by the lexer. This pointer is a 89*16467b97STreehugger Robot * copy of the one supplied by the CharStream (and may be NULL) so should 90*16467b97STreehugger Robot * not be manipulated other than to copy or print it. 91*16467b97STreehugger Robot */ 92*16467b97STreehugger Robot pANTLR3_STRING fileName; 93*16467b97STreehugger Robot } 94*16467b97STreehugger Robot ANTLR3_TOKEN_SOURCE; 95*16467b97STreehugger Robot 96*16467b97STreehugger Robot /** Definition of the ANTLR3 common token stream interface. 97*16467b97STreehugger Robot * \remark 98*16467b97STreehugger Robot * Much of the documentation for this interface is stolen from Ter's Java implementation. 99*16467b97STreehugger Robot */ 100*16467b97STreehugger Robot typedef struct ANTLR3_TOKEN_STREAM_struct 101*16467b97STreehugger Robot { 102*16467b97STreehugger Robot /** Pointer to the token source for this stream 103*16467b97STreehugger Robot */ 104*16467b97STreehugger Robot pANTLR3_TOKEN_SOURCE tokenSource; 105*16467b97STreehugger Robot 106*16467b97STreehugger Robot /** Whatever is providing this interface needs a pointer to itself 107*16467b97STreehugger Robot * so that this can be passed back to it whenever the api functions 108*16467b97STreehugger Robot * are called. 109*16467b97STreehugger Robot */ 110*16467b97STreehugger Robot void * super; 111*16467b97STreehugger Robot 112*16467b97STreehugger Robot /** All input streams implement the ANTLR3_INT_STREAM interface... 113*16467b97STreehugger Robot */ 114*16467b97STreehugger Robot pANTLR3_INT_STREAM istream; 115*16467b97STreehugger Robot 116*16467b97STreehugger Robot /// Debugger interface, is this is a debugging token stream 117*16467b97STreehugger Robot /// 118*16467b97STreehugger Robot pANTLR3_DEBUG_EVENT_LISTENER debugger; 119*16467b97STreehugger Robot 120*16467b97STreehugger Robot /// Indicates the initial stream state for dbgConsume() 121*16467b97STreehugger Robot /// 122*16467b97STreehugger Robot ANTLR3_BOOLEAN initialStreamState; 123*16467b97STreehugger Robot 124*16467b97STreehugger Robot /** Get Token at current input pointer + i ahead where i=1 is next Token. 125*16467b97STreehugger Robot * i<0 indicates tokens in the past. So -1 is previous token and -2 is 126*16467b97STreehugger Robot * two tokens ago. LT(0) is undefined. For i>=n, return Token.EOFToken. 127*16467b97STreehugger Robot * Return null for LT(0) and any index that results in an absolute address 128*16467b97STreehugger Robot * that is negative. 129*16467b97STreehugger Robot */ 130*16467b97STreehugger Robot pANTLR3_COMMON_TOKEN (*_LT) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_INT32 k); 131*16467b97STreehugger Robot 132*16467b97STreehugger Robot /** Get a token at an absolute index i; 0..n-1. This is really only 133*16467b97STreehugger Robot * needed for profiling and debugging and token stream rewriting. 134*16467b97STreehugger Robot * If you don't want to buffer up tokens, then this method makes no 135*16467b97STreehugger Robot * sense for you. Naturally you can't use the rewrite stream feature. 136*16467b97STreehugger Robot * I believe DebugTokenStream can easily be altered to not use 137*16467b97STreehugger Robot * this method, removing the dependency. 138*16467b97STreehugger Robot */ 139*16467b97STreehugger Robot pANTLR3_COMMON_TOKEN (*get) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 i); 140*16467b97STreehugger Robot 141*16467b97STreehugger Robot /** Where is this stream pulling tokens from? This is not the name, but 142*16467b97STreehugger Robot * a pointer into an interface that contains a ANTLR3_TOKEN_SOURCE interface. 143*16467b97STreehugger Robot * The Token Source interface contains a pointer to the input stream and a pointer 144*16467b97STreehugger Robot * to a function that returns the next token. 145*16467b97STreehugger Robot */ 146*16467b97STreehugger Robot pANTLR3_TOKEN_SOURCE (*getTokenSource) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream); 147*16467b97STreehugger Robot 148*16467b97STreehugger Robot /** Function that installs a token source for teh stream 149*16467b97STreehugger Robot */ 150*16467b97STreehugger Robot void (*setTokenSource) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, 151*16467b97STreehugger Robot pANTLR3_TOKEN_SOURCE tokenSource); 152*16467b97STreehugger Robot 153*16467b97STreehugger Robot /** Return the text of all the tokens in the stream, as the old tramp in 154*16467b97STreehugger Robot * Leeds market used to say; "Get the lot!" 155*16467b97STreehugger Robot */ 156*16467b97STreehugger Robot pANTLR3_STRING (*toString) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream); 157*16467b97STreehugger Robot 158*16467b97STreehugger Robot /** Return the text of all tokens from start to stop, inclusive. 159*16467b97STreehugger Robot * If the stream does not buffer all the tokens then it can just 160*16467b97STreehugger Robot * return an empty ANTLR3_STRING or NULL; Grammars should not access $ruleLabel.text in 161*16467b97STreehugger Robot * an action in that case. 162*16467b97STreehugger Robot */ 163*16467b97STreehugger Robot pANTLR3_STRING (*toStringSS) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 start, ANTLR3_UINT32 stop); 164*16467b97STreehugger Robot 165*16467b97STreehugger Robot /** Because the user is not required to use a token with an index stored 166*16467b97STreehugger Robot * in it, we must provide a means for two token objects themselves to 167*16467b97STreehugger Robot * indicate the start/end location. Most often this will just delegate 168*16467b97STreehugger Robot * to the other toString(int,int). This is also parallel with 169*16467b97STreehugger Robot * the pTREENODE_STREAM->toString(Object,Object). 170*16467b97STreehugger Robot */ 171*16467b97STreehugger Robot pANTLR3_STRING (*toStringTT) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, pANTLR3_COMMON_TOKEN start, pANTLR3_COMMON_TOKEN stop); 172*16467b97STreehugger Robot 173*16467b97STreehugger Robot 174*16467b97STreehugger Robot /** Function that sets the token stream into debugging mode 175*16467b97STreehugger Robot */ 176*16467b97STreehugger Robot void (*setDebugListener) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, pANTLR3_DEBUG_EVENT_LISTENER debugger); 177*16467b97STreehugger Robot 178*16467b97STreehugger Robot 179*16467b97STreehugger Robot 180*16467b97STreehugger Robot /** Function that knows how to free the memory for an ANTLR3_TOKEN_STREAM 181*16467b97STreehugger Robot */ 182*16467b97STreehugger Robot void (*free) (struct ANTLR3_TOKEN_STREAM_struct * tokenStream); 183*16467b97STreehugger Robot } 184*16467b97STreehugger Robot ANTLR3_TOKEN_STREAM; 185*16467b97STreehugger Robot 186*16467b97STreehugger Robot /** Common token stream is an implementation of ANTLR_TOKEN_STREAM for the default 187*16467b97STreehugger Robot * parsers and recognizers. You may of course build your own implementation if 188*16467b97STreehugger Robot * you are so inclined. 189*16467b97STreehugger Robot */ 190*16467b97STreehugger Robot typedef struct ANTLR3_COMMON_TOKEN_STREAM_struct 191*16467b97STreehugger Robot { 192*16467b97STreehugger Robot /** The ANTLR3_TOKEN_STREAM interface implementation, which also includes 193*16467b97STreehugger Robot * the intstream implementation. We could duplicate the pANTLR_INT_STREAM 194*16467b97STreehugger Robot * in this interface and initialize it to a copy, but this could be confusing 195*16467b97STreehugger Robot * it just results in one more level of indirection and I think that with 196*16467b97STreehugger Robot * judicial use of 'const' later, the optimizer will do decent job. 197*16467b97STreehugger Robot */ 198*16467b97STreehugger Robot pANTLR3_TOKEN_STREAM tstream; 199*16467b97STreehugger Robot 200*16467b97STreehugger Robot /** Whatever is supplying the COMMON_TOKEN_STREAM needs a pointer to itself 201*16467b97STreehugger Robot * so that this can be accessed by any of the API functions which it implements. 202*16467b97STreehugger Robot */ 203*16467b97STreehugger Robot void * super; 204*16467b97STreehugger Robot 205*16467b97STreehugger Robot /** Records every single token pulled from the source indexed by the token index. 206*16467b97STreehugger Robot * There might be more efficient ways to do this, such as referencing directly in to 207*16467b97STreehugger Robot * the token factory pools, but for now this is convenient and the ANTLR3_LIST is not 208*16467b97STreehugger Robot * a huge overhead as it only stores pointers anyway, but allows for iterations and 209*16467b97STreehugger Robot * so on. 210*16467b97STreehugger Robot */ 211*16467b97STreehugger Robot pANTLR3_VECTOR tokens; 212*16467b97STreehugger Robot 213*16467b97STreehugger Robot /** Override map of tokens. If a token type has an entry in here, then 214*16467b97STreehugger Robot * the pointer in the table points to an int, being the override channel number 215*16467b97STreehugger Robot * that should always be used for this token type. 216*16467b97STreehugger Robot */ 217*16467b97STreehugger Robot pANTLR3_LIST channelOverrides; 218*16467b97STreehugger Robot 219*16467b97STreehugger Robot /** Discared set. If a token has an entry in this table, then it is thrown 220*16467b97STreehugger Robot * away (data pointer is always NULL). 221*16467b97STreehugger Robot */ 222*16467b97STreehugger Robot pANTLR3_LIST discardSet; 223*16467b97STreehugger Robot 224*16467b97STreehugger Robot /* The channel number that this token stream is tuned to. For instance, whitespace 225*16467b97STreehugger Robot * is usually tuned to channel 99, which no token stream would normally tune to and 226*16467b97STreehugger Robot * so it is thrown away. 227*16467b97STreehugger Robot */ 228*16467b97STreehugger Robot ANTLR3_UINT32 channel; 229*16467b97STreehugger Robot 230*16467b97STreehugger Robot /** If this flag is set to ANTLR3_TRUE, then tokens that the stream sees that are not 231*16467b97STreehugger Robot * in the channel that this stream is tuned to, are not tracked in the 232*16467b97STreehugger Robot * tokens table. When set to false, ALL tokens are added to the tracking. 233*16467b97STreehugger Robot */ 234*16467b97STreehugger Robot ANTLR3_BOOLEAN discardOffChannel; 235*16467b97STreehugger Robot 236*16467b97STreehugger Robot /** The index into the tokens list of the current token (the next one that will be 237*16467b97STreehugger Robot * consumed. p = -1 indicates that the token list is empty. 238*16467b97STreehugger Robot */ 239*16467b97STreehugger Robot ANTLR3_INT32 p; 240*16467b97STreehugger Robot 241*16467b97STreehugger Robot /** A simple filter mechanism whereby you can tell this token stream 242*16467b97STreehugger Robot * to force all tokens of type ttype to be on channel. For example, 243*16467b97STreehugger Robot * when interpreting, we cannot exec actions so we need to tell 244*16467b97STreehugger Robot * the stream to force all WS and NEWLINE to be a different, ignored 245*16467b97STreehugger Robot * channel. 246*16467b97STreehugger Robot */ 247*16467b97STreehugger Robot void (*setTokenTypeChannel) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, 248*16467b97STreehugger Robot ANTLR3_UINT32 ttype, ANTLR3_UINT32 channel); 249*16467b97STreehugger Robot 250*16467b97STreehugger Robot /** Add a particular token type to the discard set. If a token is found to belong 251*16467b97STreehugger Robot * to this set, then it is skipped/thrown away 252*16467b97STreehugger Robot */ 253*16467b97STreehugger Robot void (*discardTokenType) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_INT32 ttype); 254*16467b97STreehugger Robot 255*16467b97STreehugger Robot /** Signal to discard off channel tokens from here on in. 256*16467b97STreehugger Robot */ 257*16467b97STreehugger Robot void (*discardOffChannelToks)(struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_BOOLEAN discard); 258*16467b97STreehugger Robot 259*16467b97STreehugger Robot /** Function that returns a pointer to the ANTLR3_LIST of all tokens 260*16467b97STreehugger Robot * in the stream (this causes the buffer to fill if we have not get any yet) 261*16467b97STreehugger Robot */ 262*16467b97STreehugger Robot pANTLR3_VECTOR (*getTokens) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream); 263*16467b97STreehugger Robot 264*16467b97STreehugger Robot /** Function that returns all the tokens between a start and a stop index. 265*16467b97STreehugger Robot * TODO: This is a new list (Ack! Maybe this is a reason to have factories for LISTS and HASHTABLES etc :-( come back to this) 266*16467b97STreehugger Robot */ 267*16467b97STreehugger Robot pANTLR3_LIST (*getTokenRange) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 start, ANTLR3_UINT32 stop); 268*16467b97STreehugger Robot 269*16467b97STreehugger Robot /** Function that returns all the tokens indicated by the specified bitset, within a range of tokens 270*16467b97STreehugger Robot */ 271*16467b97STreehugger Robot pANTLR3_LIST (*getTokensSet) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, 272*16467b97STreehugger Robot ANTLR3_UINT32 start, ANTLR3_UINT32 stop, pANTLR3_BITSET types); 273*16467b97STreehugger Robot 274*16467b97STreehugger Robot /** Function that returns all the tokens indicated by being a member of the supplied List 275*16467b97STreehugger Robot */ 276*16467b97STreehugger Robot pANTLR3_LIST (*getTokensList) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, 277*16467b97STreehugger Robot ANTLR3_UINT32 start, ANTLR3_UINT32 stop, pANTLR3_LIST list); 278*16467b97STreehugger Robot 279*16467b97STreehugger Robot /** Function that returns all tokens of a certain type within a range. 280*16467b97STreehugger Robot */ 281*16467b97STreehugger Robot pANTLR3_LIST (*getTokensType) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, 282*16467b97STreehugger Robot ANTLR3_UINT32 start, ANTLR3_UINT32 stop, ANTLR3_UINT32 type); 283*16467b97STreehugger Robot 284*16467b97STreehugger Robot /** Function that resets the token stream so that it can be reused, but 285*16467b97STreehugger Robot * but that does not free up any resources, such as the token factory 286*16467b97STreehugger Robot * the factory pool and so on. This prevents the need to keep freeing 287*16467b97STreehugger Robot * and reallocating the token pools if the thing you are building is 288*16467b97STreehugger Robot * a multi-shot dameon or somethign like that. It is much faster to 289*16467b97STreehugger Robot * just reuse all the vectors. 290*16467b97STreehugger Robot */ 291*16467b97STreehugger Robot void (*reset) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream); 292*16467b97STreehugger Robot 293*16467b97STreehugger Robot /** Function that knows how to free an ANTLR3_COMMON_TOKEN_STREAM 294*16467b97STreehugger Robot */ 295*16467b97STreehugger Robot void (*free) (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream); 296*16467b97STreehugger Robot } 297*16467b97STreehugger Robot ANTLR3_COMMON_TOKEN_STREAM; 298*16467b97STreehugger Robot 299*16467b97STreehugger Robot #ifdef __cplusplus 300*16467b97STreehugger Robot } 301*16467b97STreehugger Robot #endif 302*16467b97STreehugger Robot 303*16467b97STreehugger Robot #endif 304