1 /* 2 * Copyright (c) 2009-2022, Google LLC 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of Google LLC nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT, 20 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 // Class for parsing tokenized text from a ZeroCopyInputStream. 29 30 #ifndef UPB_IO_TOKENIZER_H_ 31 #define UPB_IO_TOKENIZER_H_ 32 33 #include "upb/base/status.h" 34 #include "upb/base/string_view.h" 35 #include "upb/io/zero_copy_input_stream.h" 36 #include "upb/mem/arena.h" 37 38 // Must be included last. 39 #include "upb/port/def.inc" 40 41 #ifdef __cplusplus 42 extern "C" { 43 #endif 44 45 typedef enum { 46 kUpb_TokenType_Start, // Next() has not yet been called. 47 kUpb_TokenType_End, // End of input reached. "text" is empty. 48 49 // A sequence of letters, digits, and underscores, not starting with a digit. 50 // It is an error for a number to be followed by an identifier with no space 51 // in between. 52 kUpb_TokenType_Identifier, 53 54 // A sequence of digits representing an integer. Normally the digits are 55 // decimal, but a prefix of "0x" indicates a hex number and a leading zero 56 // indicates octal, just like with C numeric literals. A leading negative 57 // sign is NOT included in the token; it's up to the parser to interpret the 58 // unary minus operator on its own. 59 kUpb_TokenType_Integer, 60 61 // A floating point literal, with a fractional part and/or an exponent. 62 // Always in decimal. Again, never negative. 63 kUpb_TokenType_Float, 64 65 // A quoted sequence of escaped characters. 66 // Either single or double quotes can be used, but they must match. 67 // A string literal cannot cross a line break. 68 kUpb_TokenType_String, 69 70 // Any other printable character, like '!' or '+'. 71 // Symbols are always a single character, so "!+$%" is four tokens. 72 kUpb_TokenType_Symbol, 73 74 // A sequence of whitespace. 75 // This token type is only produced if report_whitespace() is true. 76 // It is not reported for whitespace within comments or strings. 77 kUpb_TokenType_Whitespace, 78 79 // A newline ('\n'). This token type is only produced if report_whitespace() 80 // is true and report_newlines() is also true. 81 // It is not reported for newlines in comments or strings. 82 kUpb_TokenType_Newline, 83 } upb_TokenType; 84 85 typedef enum { 86 // Set to allow floats to be suffixed with the letter 'f'. Tokens which would 87 // otherwise be integers but which have the 'f' suffix will be forced to be 88 // interpreted as floats. For all other purposes, the 'f' is ignored. 89 kUpb_TokenizerOption_AllowFAfterFloat = 1 << 0, 90 91 // If set, whitespace tokens are reported by Next(). 92 kUpb_TokenizerOption_ReportWhitespace = 1 << 1, 93 94 // If set, newline tokens are reported by Next(). 95 // This is a superset of ReportWhitespace. 96 kUpb_TokenizerOption_ReportNewlines = 1 << 2, 97 98 // By default the tokenizer expects C-style (/* */) comments. 99 // If set, it expects shell-style (#) comments instead. 100 kUpb_TokenizerOption_CommentStyleShell = 1 << 3, 101 } upb_Tokenizer_Option; 102 103 typedef struct upb_Tokenizer upb_Tokenizer; 104 105 // Can be passed a flat array and/or a ZCIS as input. 106 // The array will be read first (if non-NULL), then the stream (if non-NULL). 107 upb_Tokenizer* upb_Tokenizer_New(const void* data, size_t size, 108 upb_ZeroCopyInputStream* input, int options, 109 upb_Arena* arena); 110 111 void upb_Tokenizer_Fini(upb_Tokenizer* t); 112 113 // Advance the tokenizer to the next input token. Returns True on success. 114 // Returns False and (clears *status on EOF, sets *status on error). 115 bool upb_Tokenizer_Next(upb_Tokenizer* t, upb_Status* status); 116 117 // Accessors for inspecting current/previous parse tokens, 118 // which are opaque to the tokenizer (to reduce copying). 119 120 upb_TokenType upb_Tokenizer_Type(const upb_Tokenizer* t); 121 int upb_Tokenizer_Column(const upb_Tokenizer* t); 122 int upb_Tokenizer_EndColumn(const upb_Tokenizer* t); 123 int upb_Tokenizer_Line(const upb_Tokenizer* t); 124 int upb_Tokenizer_TextSize(const upb_Tokenizer* t); 125 const char* upb_Tokenizer_TextData(const upb_Tokenizer* t); 126 127 // External helper: validate an identifier. 128 bool upb_Tokenizer_IsIdentifier(const char* data, int size); 129 130 // Parses a TYPE_INTEGER token. Returns false if the result would be 131 // greater than max_value. Otherwise, returns true and sets *output to the 132 // result. If the text is not from a Token of type TYPE_INTEGER originally 133 // parsed by a Tokenizer, the result is undefined (possibly an assert 134 // failure). 135 bool upb_Parse_Integer(const char* text, uint64_t max_value, uint64_t* output); 136 137 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 138 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 139 // result is undefined (possibly an assert failure). 140 double upb_Parse_Float(const char* text); 141 142 // Parses a TYPE_STRING token. This never fails, so long as the text actually 143 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 144 // result is undefined (possibly an assert failure). 145 upb_StringView upb_Parse_String(const char* text, upb_Arena* arena); 146 147 #ifdef __cplusplus 148 } /* extern "C" */ 149 #endif 150 151 #include "upb/port/undef.inc" 152 153 #endif // UPB_IO_TOKENIZER_H_ 154