1 /*
2  * Copyright (c) 2009-2022, Google LLC
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Google LLC nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 // Tokenizer for text read from a flat array and/or a ZeroCopyInputStream.
29 
30 #ifndef UPB_IO_TOKENIZER_H_
31 #define UPB_IO_TOKENIZER_H_
32 
33 #include "upb/base/status.h"
34 #include "upb/base/string_view.h"
35 #include "upb/io/zero_copy_input_stream.h"
36 #include "upb/mem/arena.h"
37 
38 // Must be included last.
39 #include "upb/port/def.inc"
40 
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44 
// Kinds of token the tokenizer can report; upb_Tokenizer_Type() returns one of
// these values.
45 typedef enum {
46   kUpb_TokenType_Start,  // upb_Tokenizer_Next() has not yet been called.
47   kUpb_TokenType_End,    // End of input reached. "text" is empty.
48 
49   // A sequence of letters, digits, and underscores, not starting with a digit.
50   // It is an error for a number to be followed by an identifier with no space
51   // in between.
52   kUpb_TokenType_Identifier,
53 
54   // A sequence of digits representing an integer. Normally the digits are
55   // decimal, but a prefix of "0x" indicates a hex number and a leading zero
56   // indicates octal, just like with C numeric literals. A leading negative
57   // sign is NOT included in the token; it's up to the parser to interpret the
58   // unary minus operator on its own.
59   kUpb_TokenType_Integer,
60 
61   // A floating point literal, with a fractional part and/or an exponent.
62   // Always in decimal. Again, never negative.
63   kUpb_TokenType_Float,
64 
65   // A quoted sequence of escaped characters.
66   // Either single or double quotes can be used, but they must match.
67   // A string literal cannot cross a line break.
68   kUpb_TokenType_String,
69 
70   // Any other printable character, like '!' or '+'.
71   // Symbols are always a single character, so "!+$%" is four tokens.
72   kUpb_TokenType_Symbol,
73 
74   // A sequence of whitespace. Only produced when whitespace reporting is
75   // enabled (see kUpb_TokenizerOption_ReportWhitespace).
76   // It is not reported for whitespace within comments or strings.
77   kUpb_TokenType_Whitespace,
78 
79   // A newline ('\n'). Only produced when both whitespace and newline reporting
80   // are enabled (see kUpb_TokenizerOption_ReportNewlines).
81   // It is not reported for newlines in comments or strings.
82   kUpb_TokenType_Newline,
83 } upb_TokenType;
84 
// Option flags for the tokenizer. Each value occupies a distinct bit, so
// several can be combined with bitwise OR (presumably into the `options`
// argument of upb_Tokenizer_New() -- confirm against the implementation).
85 typedef enum {
86   // Set to allow floats to be suffixed with the letter 'f'. Tokens which would
87   // otherwise be integers but which have the 'f' suffix will be forced to be
88   // interpreted as floats. For all other purposes, the 'f' is ignored.
89   kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0,
90 
91   // If set, whitespace tokens are reported by upb_Tokenizer_Next().
92   kUpb_TokenizerOption_ReportWhitespace = 1 << 1,
93 
94   // If set, newline tokens are reported by upb_Tokenizer_Next().
95   // This is a superset of ReportWhitespace.
96   kUpb_TokenizerOption_ReportNewlines = 1 << 2,
97 
98   // By default the tokenizer expects C-style (/* */) comments.
99   // If set, it expects shell-style (#) comments instead.
100   kUpb_TokenizerOption_CommentStyleShell = 1 << 3,
101 } upb_Tokenizer_Option;
102 
// Opaque tokenizer handle: only this forward declaration is visible in this
// header, so callers operate on it through the upb_Tokenizer_* functions.
103 typedef struct upb_Tokenizer upb_Tokenizer;
104 
105 // Can be passed a flat array and/or a upb_ZeroCopyInputStream (ZCIS) as input.
106 // The array will be read first (if non-NULL), then the stream (if non-NULL).
// NOTE(review): `options` is presumably a bitwise OR of upb_Tokenizer_Option
// flags and `arena` presumably supplies the tokenizer's memory -- confirm both
// against the implementation.
107 upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size,
108                                  upb_ZeroCopyInputStream* input, int options,
109                                  upb_Arena* arena);
110 
// Finalizes tokenizer `t`.
// NOTE(review): what exactly is released is not visible from this header --
// confirm against the implementation before relying on it.
111 void upb_Tokenizer_Fini(upb_Tokenizer* t);
112 
113 // Advances the tokenizer to the next input token. Returns true on success.
114 // Returns false at EOF (clearing *status) or on error (setting *status).
115 bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status);
116 
117 // Accessors for inspecting current/previous parse tokens,
118 // which are opaque to the tokenizer (to reduce copying).
// NOTE(review): Line/Column/EndColumn presumably give the token's position and
// TextData/TextSize its text as pointer + length; whether positions are 0- or
// 1-based and whether the text is NUL-terminated is not visible here.
119 
120 upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t);
121 int upb_Tokenizer_Column(const upb_Tokenizer* t);
122 int upb_Tokenizer_EndColumn(const upb_Tokenizer* t);
123 int upb_Tokenizer_Line(const upb_Tokenizer* t);
124 int upb_Tokenizer_TextSize(const upb_Tokenizer* t);
125 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t);
126 
127 // External helper: validate an identifier.
// NOTE(review): presumably checks the `size` bytes at `data` against the
// kUpb_TokenType_Identifier rules (letters, digits and underscores, not
// starting with a digit) -- confirm against the implementation.
128 bool upb_Tokenizer_IsIdentifier(const char* data, int size);
129 
130 // Parses a kUpb_TokenType_Integer token. Returns false if the result would be
131 // greater than max_value. Otherwise, returns true and sets *output to the
132 // result. If the text is not from a token of type kUpb_TokenType_Integer
133 // originally parsed by a tokenizer, the result is undefined (possibly an
134 // assert failure).
135 bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output);
136 
137 // Parses a kUpb_TokenType_Float token. This never fails, so long as the text
138 // actually comes from a kUpb_TokenType_Float token parsed by a tokenizer. If
139 // it doesn't, the result is undefined (possibly an assert failure).
140 double upb_Parse_Float(const char* text);
141 
142 // Parses a kUpb_TokenType_String token. This never fails, so long as the text
143 // actually comes from a kUpb_TokenType_String token parsed by a tokenizer. If
144 // it doesn't, the result is undefined (possibly an assert failure).
// NOTE(review): the returned view's storage presumably comes from `arena` --
// confirm against the implementation.
145 upb_StringView upb_Parse_String(const char* text, upb_Arena* arena);
146 
147 #ifdef __cplusplus
148 } /* extern "C" */
149 #endif
150 
151 #include "upb/port/undef.inc"
152 
153 #endif  // UPB_IO_TOKENIZER_H_
154