xref: /aosp_15_r20/external/antlr/runtime/C/include/antlr3tokenstream.h (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot /** \file
2*16467b97STreehugger Robot  * Defines the interface for an ANTLR3 common token stream. Custom token streams should create
3*16467b97STreehugger Robot  * one of these and then override any functions by installing their own pointers
4*16467b97STreehugger Robot  * to implement the various functions.
5*16467b97STreehugger Robot  */
6*16467b97STreehugger Robot #ifndef	_ANTLR3_TOKENSTREAM_H
7*16467b97STreehugger Robot #define	_ANTLR3_TOKENSTREAM_H
8*16467b97STreehugger Robot 
9*16467b97STreehugger Robot // [The "BSD licence"]
10*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
11*16467b97STreehugger Robot // http://www.temporal-wave.com
12*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle
13*16467b97STreehugger Robot //
14*16467b97STreehugger Robot // All rights reserved.
15*16467b97STreehugger Robot //
16*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
17*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
18*16467b97STreehugger Robot // are met:
19*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
20*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
21*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
22*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
23*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
24*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
25*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
26*16467b97STreehugger Robot //
27*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
28*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
29*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
30*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
31*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
32*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
36*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37*16467b97STreehugger Robot 
38*16467b97STreehugger Robot #include    <antlr3defs.h>
39*16467b97STreehugger Robot #include    <antlr3string.h>
40*16467b97STreehugger Robot #include    <antlr3collections.h>
41*16467b97STreehugger Robot #include    <antlr3input.h>
42*16467b97STreehugger Robot #include    <antlr3commontoken.h>
43*16467b97STreehugger Robot #include    <antlr3bitset.h>
44*16467b97STreehugger Robot #include	<antlr3debugeventlistener.h>
45*16467b97STreehugger Robot 
46*16467b97STreehugger Robot #ifdef __cplusplus
47*16467b97STreehugger Robot extern "C" {
48*16467b97STreehugger Robot #endif
49*16467b97STreehugger Robot 
50*16467b97STreehugger Robot /** Definition of a token source, which has a pointer to a function that
51*16467b97STreehugger Robot  *  returns the next token (using a token factory if it is going to be
52*16467b97STreehugger Robot  *  efficient) and a pointer to an ANTLR3_INPUT_STREAM. This is slightly
53*16467b97STreehugger Robot  *  different to the Java interface because we have no way to implement
54*16467b97STreehugger Robot  *  multiple interfaces without defining them in the interface structure
55*16467b97STreehugger Robot  *  or casting (void *), which is too convoluted.
56*16467b97STreehugger Robot  */
57*16467b97STreehugger Robot typedef struct ANTLR3_TOKEN_SOURCE_struct
58*16467b97STreehugger Robot {
59*16467b97STreehugger Robot     /** Pointer to a function that returns the next token in the stream.
60*16467b97STreehugger Robot      */
61*16467b97STreehugger Robot     pANTLR3_COMMON_TOKEN    (*nextToken)(struct ANTLR3_TOKEN_SOURCE_struct * tokenSource);
62*16467b97STreehugger Robot 
63*16467b97STreehugger Robot     /** Whoever is providing tokens, needs to provide a string factory too
64*16467b97STreehugger Robot      */
65*16467b97STreehugger Robot     pANTLR3_STRING_FACTORY  strFactory;
66*16467b97STreehugger Robot 
67*16467b97STreehugger Robot     /** A special pre-allocated token, which signifies End Of Tokens. Because this must
68*16467b97STreehugger Robot      *  be set up with the current input index and so on, we embed the structure and
69*16467b97STreehugger Robot      *  return the address of it. It is marked as factoryMade, so that it is never
70*16467b97STreehugger Robot      *  attempted to be freed.
71*16467b97STreehugger Robot      */
72*16467b97STreehugger Robot     ANTLR3_COMMON_TOKEN	    eofToken;
73*16467b97STreehugger Robot 
74*16467b97STreehugger Robot 	/// A special pre-allocated token, which is returned by mTokens() if the
75*16467b97STreehugger Robot 	/// lexer rule said to just skip the generated token altogether.
76*16467b97STreehugger Robot 	/// Having this single token stops us wasting memory by having the token factory
77*16467b97STreehugger Robot 	/// actually create something that we are going to SKIP(); anyway.
78*16467b97STreehugger Robot 	///
79*16467b97STreehugger Robot 	ANTLR3_COMMON_TOKEN		skipToken;
80*16467b97STreehugger Robot 
81*16467b97STreehugger Robot     /** Whatever is supplying the token source interface, needs a pointer to
82*16467b97STreehugger Robot      *  itself so that this pointer can be passed to it when the nextToken
83*16467b97STreehugger Robot      *  function is called.
84*16467b97STreehugger Robot      */
85*16467b97STreehugger Robot     void		    * super;
86*16467b97STreehugger Robot 
87*16467b97STreehugger Robot     /** When the token source is constructed, it is populated with the file
88*16467b97STreehugger Robot      *  name from whence the tokens were produced by the lexer. This pointer is a
89*16467b97STreehugger Robot      *  copy of the one supplied by the CharStream (and may be NULL) so should
90*16467b97STreehugger Robot      *  not be manipulated other than to copy or print it.
91*16467b97STreehugger Robot      */
92*16467b97STreehugger Robot     pANTLR3_STRING	    fileName;
93*16467b97STreehugger Robot }
94*16467b97STreehugger Robot     ANTLR3_TOKEN_SOURCE;
95*16467b97STreehugger Robot 
96*16467b97STreehugger Robot /** Definition of the ANTLR3 common token stream interface.
97*16467b97STreehugger Robot  * \remark
98*16467b97STreehugger Robot  * Much of the documentation for this interface is stolen from Ter's Java implementation.
99*16467b97STreehugger Robot  */
100*16467b97STreehugger Robot typedef	struct ANTLR3_TOKEN_STREAM_struct
101*16467b97STreehugger Robot {
102*16467b97STreehugger Robot     /** Pointer to the token source for this stream
103*16467b97STreehugger Robot      */
104*16467b97STreehugger Robot     pANTLR3_TOKEN_SOURCE    tokenSource;
105*16467b97STreehugger Robot 
106*16467b97STreehugger Robot     /** Whatever is providing this interface needs a pointer to itself
107*16467b97STreehugger Robot      *  so that this can be passed back to it whenever the api functions
108*16467b97STreehugger Robot      *  are called.
109*16467b97STreehugger Robot      */
110*16467b97STreehugger Robot     void	      * super;
111*16467b97STreehugger Robot 
112*16467b97STreehugger Robot     /** All input streams implement the ANTLR3_INT_STREAM interface...
113*16467b97STreehugger Robot      */
114*16467b97STreehugger Robot     pANTLR3_INT_STREAM	    istream;
115*16467b97STreehugger Robot 
116*16467b97STreehugger Robot 	/// Debugger interface, if this is a debugging token stream
117*16467b97STreehugger Robot 	///
118*16467b97STreehugger Robot 	pANTLR3_DEBUG_EVENT_LISTENER		debugger;
119*16467b97STreehugger Robot 
120*16467b97STreehugger Robot 	/// Indicates the initial stream state for dbgConsume()
121*16467b97STreehugger Robot 	///
122*16467b97STreehugger Robot 	ANTLR3_BOOLEAN			initialStreamState;
123*16467b97STreehugger Robot 
124*16467b97STreehugger Robot     /** Get Token at current input pointer + k ahead where k=1 is next Token.
125*16467b97STreehugger Robot      *  k<0 indicates tokens in the past.  So -1 is previous token and -2 is
126*16467b97STreehugger Robot      *  two tokens ago. LT(0) is undefined.  For k>=n, return Token.EOFToken.
127*16467b97STreehugger Robot      *  Return null for LT(0) and any index that results in an absolute address
128*16467b97STreehugger Robot      *  that is negative.
129*16467b97STreehugger Robot      */
130*16467b97STreehugger Robot     pANTLR3_COMMON_TOKEN    (*_LT)		(struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_INT32 k);
131*16467b97STreehugger Robot 
132*16467b97STreehugger Robot     /** Get a token at an absolute index i; 0..n-1.  This is really only
133*16467b97STreehugger Robot      *  needed for profiling and debugging and token stream rewriting.
134*16467b97STreehugger Robot      *  If you don't want to buffer up tokens, then this method makes no
135*16467b97STreehugger Robot      *  sense for you.  Naturally you can't use the rewrite stream feature.
136*16467b97STreehugger Robot      *  I believe DebugTokenStream can easily be altered to not use
137*16467b97STreehugger Robot      *  this method, removing the dependency.
138*16467b97STreehugger Robot      */
139*16467b97STreehugger Robot     pANTLR3_COMMON_TOKEN    (*get)		(struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 i);
140*16467b97STreehugger Robot 
141*16467b97STreehugger Robot     /** Where is this stream pulling tokens from?  This is not the name, but
142*16467b97STreehugger Robot      *  a pointer into an interface that contains an ANTLR3_TOKEN_SOURCE interface.
143*16467b97STreehugger Robot      *  The Token Source interface contains a pointer to the input stream and a pointer
144*16467b97STreehugger Robot      *  to a function that returns the next token.
145*16467b97STreehugger Robot      */
146*16467b97STreehugger Robot     pANTLR3_TOKEN_SOURCE    (*getTokenSource)	(struct ANTLR3_TOKEN_STREAM_struct * tokenStream);
147*16467b97STreehugger Robot 
148*16467b97STreehugger Robot     /** Function that installs a token source for the stream
149*16467b97STreehugger Robot      */
150*16467b97STreehugger Robot     void		    (*setTokenSource)	(struct ANTLR3_TOKEN_STREAM_struct * tokenStream,
151*16467b97STreehugger Robot 						 pANTLR3_TOKEN_SOURCE		   tokenSource);
152*16467b97STreehugger Robot 
153*16467b97STreehugger Robot     /** Return the text of all the tokens in the stream, as the old tramp in
154*16467b97STreehugger Robot      *  Leeds market used to say; "Get the lot!"
155*16467b97STreehugger Robot      */
156*16467b97STreehugger Robot     pANTLR3_STRING	    (*toString)		(struct ANTLR3_TOKEN_STREAM_struct * tokenStream);
157*16467b97STreehugger Robot 
158*16467b97STreehugger Robot     /** Return the text of all tokens from start to stop, inclusive.
159*16467b97STreehugger Robot      *  If the stream does not buffer all the tokens then it can just
160*16467b97STreehugger Robot      *  return an empty ANTLR3_STRING or NULL;  Grammars should not access $ruleLabel.text in
161*16467b97STreehugger Robot      *  an action in that case.
162*16467b97STreehugger Robot      */
163*16467b97STreehugger Robot     pANTLR3_STRING	    (*toStringSS)	(struct ANTLR3_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 start, ANTLR3_UINT32 stop);
164*16467b97STreehugger Robot 
165*16467b97STreehugger Robot     /** Because the user is not required to use a token with an index stored
166*16467b97STreehugger Robot      *  in it, we must provide a means for two token objects themselves to
167*16467b97STreehugger Robot      *  indicate the start/end location.  Most often this will just delegate
168*16467b97STreehugger Robot      *  to the other toString(int,int).  This is also parallel with
169*16467b97STreehugger Robot      *  the pTREENODE_STREAM->toString(Object,Object).
170*16467b97STreehugger Robot      */
171*16467b97STreehugger Robot     pANTLR3_STRING	    (*toStringTT)	(struct ANTLR3_TOKEN_STREAM_struct * tokenStream, pANTLR3_COMMON_TOKEN start, pANTLR3_COMMON_TOKEN stop);
172*16467b97STreehugger Robot 
173*16467b97STreehugger Robot 
174*16467b97STreehugger Robot     /** Function that sets the token stream into debugging mode
175*16467b97STreehugger Robot      */
176*16467b97STreehugger Robot     void		    (*setDebugListener)	    (struct ANTLR3_TOKEN_STREAM_struct * tokenStream, pANTLR3_DEBUG_EVENT_LISTENER debugger);
177*16467b97STreehugger Robot 
178*16467b97STreehugger Robot 
179*16467b97STreehugger Robot 
180*16467b97STreehugger Robot     /** Function that knows how to free the memory for an ANTLR3_TOKEN_STREAM
181*16467b97STreehugger Robot      */
182*16467b97STreehugger Robot     void		    (*free)		(struct ANTLR3_TOKEN_STREAM_struct * tokenStream);
183*16467b97STreehugger Robot }
184*16467b97STreehugger Robot     ANTLR3_TOKEN_STREAM;
185*16467b97STreehugger Robot 
186*16467b97STreehugger Robot /** Common token stream is an implementation of ANTLR3_TOKEN_STREAM for the default
187*16467b97STreehugger Robot  *  parsers and recognizers. You may of course build your own implementation if
188*16467b97STreehugger Robot  *  you are so inclined.
189*16467b97STreehugger Robot  */
190*16467b97STreehugger Robot typedef	struct	ANTLR3_COMMON_TOKEN_STREAM_struct
191*16467b97STreehugger Robot {
192*16467b97STreehugger Robot     /** The ANTLR3_TOKEN_STREAM interface implementation, which also includes
193*16467b97STreehugger Robot      *  the intstream implementation. We could duplicate the pANTLR3_INT_STREAM
194*16467b97STreehugger Robot      *  in this interface and initialize it to a copy, but this could be confusing;
195*16467b97STreehugger Robot      *  it just results in one more level of indirection and I think that with
196*16467b97STreehugger Robot      *  judicious use of 'const' later, the optimizer will do a decent job.
197*16467b97STreehugger Robot      */
198*16467b97STreehugger Robot     pANTLR3_TOKEN_STREAM    tstream;
199*16467b97STreehugger Robot 
200*16467b97STreehugger Robot     /** Whatever is supplying the COMMON_TOKEN_STREAM needs a pointer to itself
201*16467b97STreehugger Robot      *  so that this can be accessed by any of the API functions which it implements.
202*16467b97STreehugger Robot      */
203*16467b97STreehugger Robot     void		    * super;
204*16467b97STreehugger Robot 
205*16467b97STreehugger Robot     /** Records every single token pulled from the source indexed by the token index.
206*16467b97STreehugger Robot      *  There might be more efficient ways to do this, such as referencing directly in to
207*16467b97STreehugger Robot      *  the token factory pools, but for now this is convenient and the ANTLR3_VECTOR is not
208*16467b97STreehugger Robot      *  a huge overhead as it only stores pointers anyway, but allows for iterations and
209*16467b97STreehugger Robot      *  so on.
210*16467b97STreehugger Robot      */
211*16467b97STreehugger Robot     pANTLR3_VECTOR	    tokens;
212*16467b97STreehugger Robot 
213*16467b97STreehugger Robot     /** Override map of tokens. If a token type has an entry in here, then
214*16467b97STreehugger Robot      *  the pointer in the table points to an int, being the override channel number
215*16467b97STreehugger Robot      *  that should always be used for this token type.
216*16467b97STreehugger Robot      */
217*16467b97STreehugger Robot     pANTLR3_LIST	    channelOverrides;
218*16467b97STreehugger Robot 
219*16467b97STreehugger Robot     /** Discard set. If a token has an entry in this table, then it is thrown
220*16467b97STreehugger Robot      *  away (data pointer is always NULL).
221*16467b97STreehugger Robot      */
222*16467b97STreehugger Robot     pANTLR3_LIST	    discardSet;
223*16467b97STreehugger Robot 
224*16467b97STreehugger Robot     /** The channel number that this token stream is tuned to. For instance, whitespace
225*16467b97STreehugger Robot      * is usually tuned to channel 99, which no token stream would normally tune to and
226*16467b97STreehugger Robot      * so it is thrown away.
227*16467b97STreehugger Robot      */
228*16467b97STreehugger Robot     ANTLR3_UINT32	    channel;
229*16467b97STreehugger Robot 
230*16467b97STreehugger Robot     /** If this flag is set to ANTLR3_TRUE, then tokens that the stream sees that are not
231*16467b97STreehugger Robot      *  in the channel that this stream is tuned to, are not tracked in the
232*16467b97STreehugger Robot      *  tokens table. When set to false, ALL tokens are added to the tracking.
233*16467b97STreehugger Robot      */
234*16467b97STreehugger Robot     ANTLR3_BOOLEAN	    discardOffChannel;
235*16467b97STreehugger Robot 
236*16467b97STreehugger Robot     /** The index into the tokens list of the current token (the next one that will be
237*16467b97STreehugger Robot      *  consumed). p = -1 indicates that the token list is empty.
238*16467b97STreehugger Robot      */
239*16467b97STreehugger Robot     ANTLR3_INT32	    p;
240*16467b97STreehugger Robot 
241*16467b97STreehugger Robot     /** A simple filter mechanism whereby you can tell this token stream
242*16467b97STreehugger Robot      *  to force all tokens of type ttype to be on channel.  For example,
243*16467b97STreehugger Robot      *  when interpreting, we cannot exec actions so we need to tell
244*16467b97STreehugger Robot      *  the stream to force all WS and NEWLINE to be a different, ignored
245*16467b97STreehugger Robot      *  channel.
246*16467b97STreehugger Robot      */
247*16467b97STreehugger Robot     void		    (*setTokenTypeChannel)  (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream,
248*16467b97STreehugger Robot 							ANTLR3_UINT32 ttype, ANTLR3_UINT32 channel);
249*16467b97STreehugger Robot 
250*16467b97STreehugger Robot     /** Add a particular token type to the discard set. If a token is found to belong
251*16467b97STreehugger Robot      *  to this set, then it is skipped/thrown away
252*16467b97STreehugger Robot      */
253*16467b97STreehugger Robot     void		    (*discardTokenType)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_INT32 ttype);
254*16467b97STreehugger Robot 
255*16467b97STreehugger Robot     /** Signal to discard off channel tokens from here on in.
256*16467b97STreehugger Robot      */
257*16467b97STreehugger Robot     void		    (*discardOffChannelToks)(struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_BOOLEAN discard);
258*16467b97STreehugger Robot 
259*16467b97STreehugger Robot     /** Function that returns a pointer to the ANTLR3_VECTOR of all tokens
260*16467b97STreehugger Robot      *  in the stream (this causes the buffer to fill if we have not got any yet)
261*16467b97STreehugger Robot      */
262*16467b97STreehugger Robot     pANTLR3_VECTOR	    (*getTokens)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream);
263*16467b97STreehugger Robot 
264*16467b97STreehugger Robot     /** Function that returns all the tokens between a start and a stop index.
265*16467b97STreehugger Robot      *  TODO: This is a new list (Ack! Maybe this is a reason to have factories for LISTS and HASHTABLES etc :-( come back to this)
266*16467b97STreehugger Robot      */
267*16467b97STreehugger Robot     pANTLR3_LIST	    (*getTokenRange)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream, ANTLR3_UINT32 start, ANTLR3_UINT32 stop);
268*16467b97STreehugger Robot 
269*16467b97STreehugger Robot     /** Function that returns all the tokens indicated by the specified bitset, within a range of tokens
270*16467b97STreehugger Robot      */
271*16467b97STreehugger Robot     pANTLR3_LIST	    (*getTokensSet)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream,
272*16467b97STreehugger Robot 							ANTLR3_UINT32 start, ANTLR3_UINT32 stop, pANTLR3_BITSET types);
273*16467b97STreehugger Robot 
274*16467b97STreehugger Robot     /** Function that returns all the tokens indicated by being a member of the supplied List
275*16467b97STreehugger Robot      */
276*16467b97STreehugger Robot     pANTLR3_LIST	    (*getTokensList)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream,
277*16467b97STreehugger Robot 							ANTLR3_UINT32 start, ANTLR3_UINT32 stop, pANTLR3_LIST list);
278*16467b97STreehugger Robot 
279*16467b97STreehugger Robot     /** Function that returns all tokens of a certain type within a range.
280*16467b97STreehugger Robot      */
281*16467b97STreehugger Robot     pANTLR3_LIST	    (*getTokensType)	    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream,
282*16467b97STreehugger Robot 							ANTLR3_UINT32 start, ANTLR3_UINT32 stop, ANTLR3_UINT32 type);
283*16467b97STreehugger Robot 
284*16467b97STreehugger Robot     /** Function that resets the token stream so that it can be reused,
285*16467b97STreehugger Robot      *  but that does not free up any resources, such as the token factory
286*16467b97STreehugger Robot      *  the factory pool and so on. This prevents the need to keep freeing
287*16467b97STreehugger Robot      *  and reallocating the token pools if the thing you are building is
288*16467b97STreehugger Robot      *  a multi-shot daemon or something like that. It is much faster to
289*16467b97STreehugger Robot      *  just reuse all the vectors.
290*16467b97STreehugger Robot      */
291*16467b97STreehugger Robot     void                    (*reset)            (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream);
292*16467b97STreehugger Robot 
293*16467b97STreehugger Robot     /** Function that knows how to free an ANTLR3_COMMON_TOKEN_STREAM
294*16467b97STreehugger Robot      */
295*16467b97STreehugger Robot     void		    (*free)		    (struct ANTLR3_COMMON_TOKEN_STREAM_struct * tokenStream);
296*16467b97STreehugger Robot }
297*16467b97STreehugger Robot     ANTLR3_COMMON_TOKEN_STREAM;
298*16467b97STreehugger Robot 
299*16467b97STreehugger Robot #ifdef __cplusplus
300*16467b97STreehugger Robot }
301*16467b97STreehugger Robot #endif
302*16467b97STreehugger Robot 
303*16467b97STreehugger Robot #endif
304