// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_JSON_JSON_PARSER_H_
#define BASE_JSON_JSON_PARSER_H_

#include <stddef.h>
#include <stdint.h>

#include <memory>
#include <optional>
#include <string>
#include <string_view>

#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "base/gtest_prod_util.h"
#include "base/json/json_common.h"
#include "base/third_party/icu/icu_utf.h"
#include "base/values.h"

namespace base {

class Value;

namespace internal {

class JSONParserTest;

// The implementation behind the JSONReader interface. This class is not meant
// to be used directly; it encapsulates logic that need not be exposed publicly.
//
// This parser guarantees O(n) time through the input string. Iteration happens
// on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
// conversion from byte to JSON token happens without advancing the parser in
// GetNextToken/ParseToken; that is, tokenization operates on the current
// parser position without advancing.
//
// Built on top of these are a family of Consume functions that iterate
// internally. Invariant: on entry of a Consume function, the parser is wound
// to the first byte of a valid JSON token. On exit, it is on the first byte
// after the token that was just consumed, which would likely be the first byte
// of the next token.
class BASE_EXPORT JSONParser {
 public:
  // Error codes during parsing. The first two enumerators alias the generic
  // base::ValueDeserializer error codes; the remaining enumerators are
  // numbered consecutively after JSON_SYNTAX_ERROR.
  enum JsonParseError {
    JSON_NO_ERROR = base::ValueDeserializer::kErrorCodeNoError,
    JSON_SYNTAX_ERROR = base::ValueDeserializer::kErrorCodeInvalidFormat,
    JSON_INVALID_ESCAPE,
    JSON_UNEXPECTED_TOKEN,
    JSON_TRAILING_COMMA,
    JSON_TOO_MUCH_NESTING,
    JSON_UNEXPECTED_DATA_AFTER_ROOT,
    JSON_UNSUPPORTED_ENCODING,
    JSON_UNQUOTED_DICTIONARY_KEY,
    JSON_UNREPRESENTABLE_NUMBER,
    // NOTE(review): presumably a sentinel that must stay last - confirm
    // against its users before adding new codes.
    JSON_PARSE_ERROR_COUNT
  };

  // String versions of parse error codes.
  static const char kSyntaxError[];
  static const char kInvalidEscape[];
  static const char kUnexpectedToken[];
  static const char kTrailingComma[];
  static const char kTooMuchNesting[];
  static const char kUnexpectedDataAfterRoot[];
  static const char kUnsupportedEncoding[];
  static const char kUnquotedDictionaryKey[];
  static const char kUnrepresentableNumber[];

  // |options| holds the base::JSONParserOptions that control parsing.
  // |max_depth| is the maximum depth to parse; it defaults to
  // kAbsoluteMaxDepth.
  explicit JSONParser(int options, size_t max_depth = kAbsoluteMaxDepth);

  // Copying is disabled.
  JSONParser(const JSONParser&) = delete;
  JSONParser& operator=(const JSONParser&) = delete;

  ~JSONParser();

  // Parses the input string according to the set options and returns the
  // result as a Value.
  // Wrap this in base::FooValue::From() to check the Value is of type Foo and
  // convert to a FooValue at the same time.
  std::optional<Value> Parse(std::string_view input);

  // Returns the error code.
  JsonParseError error_code() const;

  // Returns the human-friendly error message.
  std::string GetErrorMessage() const;

  // Returns the error line number if a parse error happened. Otherwise always
  // returns 0.
  int error_line() const;

  // Returns the error column number if a parse error happened. Otherwise
  // always returns 0.
  int error_column() const;

 private:
  // The kinds of token that GetNextToken() can report.
  enum Token {
    T_OBJECT_BEGIN,           // {
    T_OBJECT_END,             // }
    T_ARRAY_BEGIN,            // [
    T_ARRAY_END,              // ]
    T_STRING,
    T_NUMBER,
    T_BOOL_TRUE,              // true
    T_BOOL_FALSE,             // false
    T_NULL,                   // null
    T_LIST_SEPARATOR,         // ,
    T_OBJECT_PAIR_SEPARATOR,  // :
    T_END_OF_INPUT,
    T_INVALID_TOKEN,
  };

  // A helper class used for parsing strings. One optimization performed is to
  // create base::Value with a std::string_view to avoid unnecessary std::string
  // copies. This is not possible if the input string needs to be decoded from
  // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
  // This class centralizes that logic.
  class StringBuilder {
   public:
    // Empty constructor. Used for creating a builder with which to assign to.
    StringBuilder();

    // |pos| is the beginning of an input string, excluding the |"|.
    explicit StringBuilder(const char* pos);

    ~StringBuilder();

    // Move assignment; pairs with the empty constructor so that a
    // default-constructed builder can be assigned to later.
    StringBuilder& operator=(StringBuilder&& other);

    // Appends the Unicode code point |point| to the string, either by
    // increasing the |length_| of the string if the string has not been
    // converted, or by appending the UTF8 bytes for the code point.
    void Append(base_icu::UChar32 point);

    // Converts the builder from its default std::string_view to a full
    // std::string, performing a copy. Once a builder is converted, it cannot be
    // made a std::string_view again.
    void Convert();

    // Returns the builder as a string, invalidating all state. This allows
    // the internal string buffer representation to be destructively moved
    // in cases where the builder will not be needed any more.
    std::string DestructiveAsString();

   private:
    // The beginning of the input string.
    const char* pos_;

    // Number of bytes in |pos_| that make up the string being built.
    size_t length_;

    // The copied string representation. Will be unset until Convert() is
    // called.
    std::optional<std::string> string_;
  };

  // Returns the next |count| bytes of the input stream, or nullopt if fewer
  // than |count| bytes remain.
  std::optional<std::string_view> PeekChars(size_t count);

  // Calls PeekChars() with a |count| of 1.
  std::optional<char> PeekChar();

  // Returns the next |count| bytes of the input stream, or nullopt if fewer
  // than |count| bytes remain, and advances the parser position by |count|.
  std::optional<std::string_view> ConsumeChars(size_t count);

  // Calls ConsumeChars() with a |count| of 1.
  std::optional<char> ConsumeChar();

  // Returns a pointer to the current character position.
  const char* pos();

  // Skips over whitespace and comments to find the next token in the stream.
  // This does not advance the parser for non-whitespace or comment chars.
  Token GetNextToken();

  // Consumes whitespace characters and comments until a character that is
  // neither whitespace nor part of a comment is encountered.
  void EatWhitespaceAndComments();
  // Helper function that consumes a comment, assuming that the parser is
  // currently wound to a '/'.
  bool EatComment();

  // Calls GetNextToken() and then ParseToken().
  std::optional<Value> ParseNextToken();

  // Takes a token that represents the start of a Value ("a structural token"
  // in RFC terms) and consumes it, returning the result as a Value.
  std::optional<Value> ParseToken(Token token);

  // Assuming that the parser is currently wound to '{', this parses a JSON
  // object into a Value.
  std::optional<Value> ConsumeDictionary();

  // Assuming that the parser is wound to '[', this parses a JSON list into a
  // Value.
  std::optional<Value> ConsumeList();

  // Calls through ConsumeStringRaw and wraps it in a value.
  std::optional<Value> ConsumeString();

  // Assuming that the parser is wound to a double quote, this parses a string,
  // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
  // success and places result into |out|. Returns false on failure with
  // error information set.
  bool ConsumeStringRaw(StringBuilder* out);
  // Helper function for ConsumeStringRaw() that consumes the next four or 10
  // bytes (parser is wound to the first character of a HEX sequence, with the
  // potential for consuming another \uXXXX for a surrogate). Returns true on
  // success and places the code point in |out_code_point|, and false on
  // failure.
  bool DecodeUTF16(base_icu::UChar32* out_code_point);

  // Assuming that the parser is wound to the start of a valid JSON number,
  // this parses and converts it to either an int or double value.
  std::optional<Value> ConsumeNumber();
  // Helper that reads characters that are ints. Returns true if a number was
  // read and false on error.
  bool ReadInt(bool allow_leading_zeros);

  // Consumes the literal values of |true|, |false|, and |null|, assuming the
  // parser is wound to the first character of any of those.
  std::optional<Value> ConsumeLiteral();

  // Helper function that returns true if the byte sequence |match| can be
  // consumed at the current parser position. Returns false if there are fewer
  // than |match|-length bytes or if the sequence does not match, and the
  // parser state is unchanged.
  bool ConsumeIfMatch(std::string_view match);

  // Sets the error information to |code| at the current column, based on
  // |index_| and |index_last_line_|, with an optional positive/negative
  // adjustment by |column_adjust|.
  void ReportError(JsonParseError code, int column_adjust);

  // Given the line and column number of an error, formats one of the error
  // message constants from json_reader.h for human display.
  static std::string FormatErrorMessage(int line, int column,
                                        const std::string& description);

  // base::JSONParserOptions that control parsing.
  const int options_;

  // Maximum depth to parse.
  const size_t max_depth_;

  // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
  std::string_view input_;

  // The index in the input stream to which the parser is wound.
  size_t index_;

  // The number of times the parser has recursed (current stack depth).
  size_t stack_depth_;

  // The line number that the parser is at currently.
  int line_number_;

  // The last value of |index_| on the previous line.
  size_t index_last_line_;

  // Error information.
  JsonParseError error_code_;
  int error_line_;
  int error_column_;

  // The test fixture and the individual test cases below are befriended so
  // that they can reach the private members of this class.
  friend class JSONParserTest;
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
};

// Used when decoding and an invalid utf-8 sequence is encountered.
BASE_EXPORT extern const char kUnicodeReplacementString[];

}  // namespace internal
}  // namespace base

#endif  // BASE_JSON_JSON_PARSER_H_