xref: /aosp_15_r20/external/cronet/third_party/protobuf/src/google/protobuf/io/tokenizer.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: [email protected] (Kenton Varda)
32 //  Based on original Protocol Buffers design by
33 //  Sanjay Ghemawat, Jeff Dean, and others.
34 //
35 // Here we have a hand-written lexer.  At first you might ask yourself,
36 // "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
37 // yes I am crazy, but that's beside the point.  There are actually reasons
38 // why I ended up writing this this way.
39 //
40 // The traditional approach to lexing is to use lex to generate a lexer for
41 // you.  Unfortunately, lex's output is ridiculously ugly and difficult to
42 // integrate cleanly with C++ code, especially abstract code or code meant
43 // as a library.  Better parser-generators exist but would add dependencies
44 // which most users won't already have, which we'd like to avoid.  (GNU flex
45 // has a C++ output option, but it's still ridiculously ugly, non-abstract,
46 // and not library-friendly.)
47 //
48 // The next approach that any good software engineer should look at is to
49 // use regular expressions.  And, indeed, I did.  I have code which
50 // implements this same class using regular expressions.  It's about 200
51 // lines shorter.  However:
52 // - Rather than error messages telling you "This string has an invalid
53 //   escape sequence at line 5, column 45", you get error messages like
54 //   "Parse error on line 5".  Giving more precise errors requires adding
55 //   a lot of code that ends up basically as complex as the hand-coded
56 //   version anyway.
57 // - The regular expression to match a string literal looks like this:
58 //     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
59 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
60 //                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
61 //                       "\'([^\'\\\\]|"        // Also support single-quotes.
62 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"
63 //                       "\\\\x[0-9a-fA-F])*\')");
64 //   Verifying the correctness of this line noise is actually harder than
65 //   verifying the correctness of ConsumeString(), defined below.  I'm not
66 //   even confident that the above is correct, after staring at it for some
67 //   time.
68 // - PCRE is fast, but there's still more overhead involved than the code
69 //   below.
70 // - Sadly, regular expressions are not part of the C standard library, so
71 //   using them would require depending on some other library.  For the
72 //   open source release, this could be really annoying.  Nobody likes
73 //   downloading one piece of software just to find that they need to
74 //   download something else to make it work, and in all likelihood
75 //   people downloading Protocol Buffers will already be doing so just
76 //   to make something else work.  We could include a copy of PCRE with
77 //   our code, but that obligates us to keep it up-to-date and just seems
78 //   like a big waste just to save 200 lines of code.
79 //
80 // On a similar but unrelated note, I'm even scared to use ctype.h.
81 // Apparently functions like isalpha() are locale-dependent.  So, if we used
82 // that, then if this code is being called from some program that doesn't
83 // have its locale set to "C", it would behave strangely.  We can't just set
84 // the locale to "C" ourselves since we might break the calling program that
85 // way, particularly if it is multi-threaded.  WTF?  Someone please let me
86 // (Kenton) know if I'm missing something here...
87 //
88 // I'd love to hear about other alternatives, though, as this code isn't
89 // exactly pretty.
90 
91 #include <google/protobuf/io/tokenizer.h>
92 
93 #include <google/protobuf/stubs/common.h>
94 #include <google/protobuf/stubs/logging.h>
95 #include <google/protobuf/stubs/strutil.h>
96 #include <google/protobuf/stubs/stringprintf.h>
97 #include <google/protobuf/io/strtod.h>
98 #include <google/protobuf/io/zero_copy_stream.h>
99 #include <google/protobuf/stubs/stl_util.h>
100 
101 // Must be included last.
102 #include <google/protobuf/port_def.inc>
103 
104 namespace google {
105 namespace protobuf {
106 namespace io {
107 namespace {
108 
109 // As mentioned above, I don't trust ctype.h due to the presence of "locales".
110 // So, I have written replacement functions here.  Someone please smack me if
111 // this is a bad idea or if there is some way around this.
112 //
113 // These "character classes" are designed to be used in template methods.
114 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
115 // whitespace.
116 
117 // Note:  No class is allowed to contain '\0', since this is used to mark end-
118 //   of-input and is handled specially.
119 
// Each character class is a tiny type exposing a single static predicate,
// InClass(c), so that it can be passed as a template argument to the
// Tokenizer's consume helpers.  None of them accepts '\0'.

// Space, newline, tab, carriage return, vertical tab and form feed.
struct Whitespace {
  static bool InClass(char c) {
    switch (c) {
      case ' ':
      case '\n':
      case '\t':
      case '\r':
      case '\v':
      case '\f':
        return true;
      default:
        return false;
    }
  }
};

// Same as Whitespace, except that '\n' is not a member.
struct WhitespaceNoNewline {
  static bool InClass(char c) {
    switch (c) {
      case ' ':
      case '\t':
      case '\r':
      case '\v':
      case '\f':
        return true;
      default:
        return false;
    }
  }
};

// Characters strictly between '\0' and ' '.
struct Unprintable {
  static bool InClass(char c) { return '\0' < c && c < ' '; }
};

// Decimal digits.
struct Digit {
  static bool InClass(char c) { return c >= '0' && c <= '9'; }
};

// Octal digits.
struct OctalDigit {
  static bool InClass(char c) { return c >= '0' && c <= '7'; }
};

// Hexadecimal digits, in either case.
struct HexDigit {
  static bool InClass(char c) {
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
  }
};

// ASCII letters plus the underscore.
struct Letter {
  static bool InClass(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  }
};

// ASCII letters, decimal digits and the underscore.
struct Alphanumeric {
  static bool InClass(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') || c == '_';
  }
};

// Characters that form a single-character escape when they follow a
// backslash inside a string literal.
struct Escape {
  static bool InClass(char c) {
    switch (c) {
      case 'a':
      case 'b':
      case 'f':
      case 'n':
      case 'r':
      case 't':
      case 'v':
      case '\\':
      case '?':
      case '\'':
      case '\"':
        return true;
      default:
        return false;
    }
  }
};
150 
// Lookup table, indexed by byte value, giving the numeric value of the digit
// that byte represents: '0'-'9' map to 0-9 and the letters 'a'-'z' / 'A'-'Z'
// map to 10-35, which covers every number base up to 36.  Any byte that is
// not such a digit maps to 36, a value that is out of range for all supported
// bases.  Each row below covers sixteen consecutive byte values.
static const int8_t kAsciiToInt[256] = {
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0x00
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0x10
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0x20
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  36, 36, 36, 36, 36, 36,  // 0x30
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  // 0x40
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,  // 0x50
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  // 0x60
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,  // 0x70
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0x80
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0x90
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xA0
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xB0
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xC0
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xD0
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xE0
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 0xF0
};

// Interprets `digit` as a numeric digit and returns its value, or 36 if it is
// not a digit in any base up to 36.  The mask keeps the table index within
// [0, 255] even when `char` is signed and `digit` is negative.
inline int DigitValue(char digit) {
  const int index = digit & 0xFF;
  return kAsciiToInt[index];
}
179 
180 // Inline because it's only used in one place.
// Maps the character that follows a backslash in a simple escape sequence to
// the character the escape stands for (e.g. 'n' -> '\n').  Escape sequences
// are expected to have been validated separately, so any other input simply
// yields '?'.
inline char TranslateEscape(char c) {
  // Parallel arrays: kEscapeNames[i] is the escape letter and kEscapeValues[i]
  // is the character it translates to.
  static const char kEscapeNames[] = "abfnrtv\\?'\"";
  static const char kEscapeValues[] = "\a\b\f\n\r\t\v\\\?'\"";

  for (int i = 0; kEscapeNames[i] != '\0'; ++i) {
    if (kEscapeNames[i] == c) {
      return kEscapeValues[i];
    }
  }
  return '?';
}
211 
212 }  // anonymous namespace
213 
~ErrorCollector()214 ErrorCollector::~ErrorCollector() {}
215 
216 // ===================================================================
217 
Tokenizer(ZeroCopyInputStream * input,ErrorCollector * error_collector)218 Tokenizer::Tokenizer(ZeroCopyInputStream* input,
219                      ErrorCollector* error_collector)
220     : input_(input),
221       error_collector_(error_collector),
222       buffer_(NULL),
223       buffer_size_(0),
224       buffer_pos_(0),
225       read_error_(false),
226       line_(0),
227       column_(0),
228       record_target_(NULL),
229       record_start_(-1),
230       allow_f_after_float_(false),
231       comment_style_(CPP_COMMENT_STYLE),
232       require_space_after_number_(true),
233       allow_multiline_strings_(false) {
234   current_.line = 0;
235   current_.column = 0;
236   current_.end_column = 0;
237   current_.type = TYPE_START;
238 
239   Refresh();
240 }
241 
~Tokenizer()242 Tokenizer::~Tokenizer() {
243   // If we had any buffer left unread, return it to the underlying stream
244   // so that someone else can read it.
245   if (buffer_size_ > buffer_pos_) {
246     input_->BackUp(buffer_size_ - buffer_pos_);
247   }
248 }
249 
report_whitespace() const250 bool Tokenizer::report_whitespace() const { return report_whitespace_; }
251 // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
set_report_whitespace(bool report)252 void Tokenizer::set_report_whitespace(bool report) {
253   report_whitespace_ = report;
254   report_newlines_ &= report;
255 }
256 
257 // If true, newline tokens are reported by Next().
report_newlines() const258 bool Tokenizer::report_newlines() const { return report_newlines_; }
259 // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
set_report_newlines(bool report)260 void Tokenizer::set_report_newlines(bool report) {
261   report_newlines_ = report;
262   report_whitespace_ |= report;  // enable report_whitespace if necessary
263 }
264 
265 // -------------------------------------------------------------------
266 // Internal helpers.
267 
NextChar()268 void Tokenizer::NextChar() {
269   // Update our line and column counters based on the character being
270   // consumed.
271   if (current_char_ == '\n') {
272     ++line_;
273     column_ = 0;
274   } else if (current_char_ == '\t') {
275     column_ += kTabWidth - column_ % kTabWidth;
276   } else {
277     ++column_;
278   }
279 
280   // Advance to the next character.
281   ++buffer_pos_;
282   if (buffer_pos_ < buffer_size_) {
283     current_char_ = buffer_[buffer_pos_];
284   } else {
285     Refresh();
286   }
287 }
288 
Refresh()289 void Tokenizer::Refresh() {
290   if (read_error_) {
291     current_char_ = '\0';
292     return;
293   }
294 
295   // If we're in a token, append the rest of the buffer to it.
296   if (record_target_ != NULL && record_start_ < buffer_size_) {
297     record_target_->append(buffer_ + record_start_,
298                            buffer_size_ - record_start_);
299     record_start_ = 0;
300   }
301 
302   const void* data = NULL;
303   buffer_ = NULL;
304   buffer_pos_ = 0;
305   do {
306     if (!input_->Next(&data, &buffer_size_)) {
307       // end of stream (or read error)
308       buffer_size_ = 0;
309       read_error_ = true;
310       current_char_ = '\0';
311       return;
312     }
313   } while (buffer_size_ == 0);
314 
315   buffer_ = static_cast<const char*>(data);
316 
317   current_char_ = buffer_[0];
318 }
319 
RecordTo(std::string * target)320 inline void Tokenizer::RecordTo(std::string* target) {
321   record_target_ = target;
322   record_start_ = buffer_pos_;
323 }
324 
StopRecording()325 inline void Tokenizer::StopRecording() {
326   // Note:  The if() is necessary because some STL implementations crash when
327   //   you call string::append(NULL, 0), presumably because they are trying to
328   //   be helpful by detecting the NULL pointer, even though there's nothing
329   //   wrong with reading zero bytes from NULL.
330   if (buffer_pos_ != record_start_) {
331     record_target_->append(buffer_ + record_start_,
332                            buffer_pos_ - record_start_);
333   }
334   record_target_ = NULL;
335   record_start_ = -1;
336 }
337 
StartToken()338 inline void Tokenizer::StartToken() {
339   current_.type = TYPE_START;  // Just for the sake of initializing it.
340   current_.text.clear();
341   current_.line = line_;
342   current_.column = column_;
343   RecordTo(&current_.text);
344 }
345 
EndToken()346 inline void Tokenizer::EndToken() {
347   StopRecording();
348   current_.end_column = column_;
349 }
350 
351 // -------------------------------------------------------------------
352 // Helper methods that consume characters.
353 
354 template <typename CharacterClass>
LookingAt()355 inline bool Tokenizer::LookingAt() {
356   return CharacterClass::InClass(current_char_);
357 }
358 
359 template <typename CharacterClass>
TryConsumeOne()360 inline bool Tokenizer::TryConsumeOne() {
361   if (CharacterClass::InClass(current_char_)) {
362     NextChar();
363     return true;
364   } else {
365     return false;
366   }
367 }
368 
TryConsume(char c)369 inline bool Tokenizer::TryConsume(char c) {
370   if (current_char_ == c) {
371     NextChar();
372     return true;
373   } else {
374     return false;
375   }
376 }
377 
378 template <typename CharacterClass>
ConsumeZeroOrMore()379 inline void Tokenizer::ConsumeZeroOrMore() {
380   while (CharacterClass::InClass(current_char_)) {
381     NextChar();
382   }
383 }
384 
385 template <typename CharacterClass>
ConsumeOneOrMore(const char * error)386 inline void Tokenizer::ConsumeOneOrMore(const char* error) {
387   if (!CharacterClass::InClass(current_char_)) {
388     AddError(error);
389   } else {
390     do {
391       NextChar();
392     } while (CharacterClass::InClass(current_char_));
393   }
394 }
395 
396 // -------------------------------------------------------------------
397 // Methods that read whole patterns matching certain kinds of tokens
398 // or comments.
399 
ConsumeString(char delimiter)400 void Tokenizer::ConsumeString(char delimiter) {
401   while (true) {
402     switch (current_char_) {
403       case '\0':
404         AddError("Unexpected end of string.");
405         return;
406 
407       case '\n': {
408         if (!allow_multiline_strings_) {
409           AddError("String literals cannot cross line boundaries.");
410           return;
411         }
412         NextChar();
413         break;
414       }
415 
416       case '\\': {
417         // An escape sequence.
418         NextChar();
419         if (TryConsumeOne<Escape>()) {
420           // Valid escape sequence.
421         } else if (TryConsumeOne<OctalDigit>()) {
422           // Possibly followed by two more octal digits, but these will
423           // just be consumed by the main loop anyway so we don't need
424           // to do so explicitly here.
425         } else if (TryConsume('x')) {
426           if (!TryConsumeOne<HexDigit>()) {
427             AddError("Expected hex digits for escape sequence.");
428           }
429           // Possibly followed by another hex digit, but again we don't care.
430         } else if (TryConsume('u')) {
431           if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
432               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
433             AddError("Expected four hex digits for \\u escape sequence.");
434           }
435         } else if (TryConsume('U')) {
436           // We expect 8 hex digits; but only the range up to 0x10ffff is
437           // legal.
438           if (!TryConsume('0') || !TryConsume('0') ||
439               !(TryConsume('0') || TryConsume('1')) ||
440               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
441               !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
442               !TryConsumeOne<HexDigit>()) {
443             AddError(
444                 "Expected eight hex digits up to 10ffff for \\U escape "
445                 "sequence");
446           }
447         } else {
448           AddError("Invalid escape sequence in string literal.");
449         }
450         break;
451       }
452 
453       default: {
454         if (current_char_ == delimiter) {
455           NextChar();
456           return;
457         }
458         NextChar();
459         break;
460       }
461     }
462   }
463 }
464 
ConsumeNumber(bool started_with_zero,bool started_with_dot)465 Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
466                                               bool started_with_dot) {
467   bool is_float = false;
468 
469   if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
470     // A hex number (started with "0x").
471     ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
472 
473   } else if (started_with_zero && LookingAt<Digit>()) {
474     // An octal number (had a leading zero).
475     ConsumeZeroOrMore<OctalDigit>();
476     if (LookingAt<Digit>()) {
477       AddError("Numbers starting with leading zero must be in octal.");
478       ConsumeZeroOrMore<Digit>();
479     }
480 
481   } else {
482     // A decimal number.
483     if (started_with_dot) {
484       is_float = true;
485       ConsumeZeroOrMore<Digit>();
486     } else {
487       ConsumeZeroOrMore<Digit>();
488 
489       if (TryConsume('.')) {
490         is_float = true;
491         ConsumeZeroOrMore<Digit>();
492       }
493     }
494 
495     if (TryConsume('e') || TryConsume('E')) {
496       is_float = true;
497       TryConsume('-') || TryConsume('+');
498       ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
499     }
500 
501     if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
502       is_float = true;
503     }
504   }
505 
506   if (LookingAt<Letter>() && require_space_after_number_) {
507     AddError("Need space between number and identifier.");
508   } else if (current_char_ == '.') {
509     if (is_float) {
510       AddError(
511           "Already saw decimal point or exponent; can't have another one.");
512     } else {
513       AddError("Hex and octal numbers must be integers.");
514     }
515   }
516 
517   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
518 }
519 
ConsumeLineComment(std::string * content)520 void Tokenizer::ConsumeLineComment(std::string* content) {
521   if (content != NULL) RecordTo(content);
522 
523   while (current_char_ != '\0' && current_char_ != '\n') {
524     NextChar();
525   }
526   TryConsume('\n');
527 
528   if (content != NULL) StopRecording();
529 }
530 
ConsumeBlockComment(std::string * content)531 void Tokenizer::ConsumeBlockComment(std::string* content) {
532   int start_line = line_;
533   int start_column = column_ - 2;
534 
535   if (content != NULL) RecordTo(content);
536 
537   while (true) {
538     while (current_char_ != '\0' && current_char_ != '*' &&
539            current_char_ != '/' && current_char_ != '\n') {
540       NextChar();
541     }
542 
543     if (TryConsume('\n')) {
544       if (content != NULL) StopRecording();
545 
546       // Consume leading whitespace and asterisk;
547       ConsumeZeroOrMore<WhitespaceNoNewline>();
548       if (TryConsume('*')) {
549         if (TryConsume('/')) {
550           // End of comment.
551           break;
552         }
553       }
554 
555       if (content != NULL) RecordTo(content);
556     } else if (TryConsume('*') && TryConsume('/')) {
557       // End of comment.
558       if (content != NULL) {
559         StopRecording();
560         // Strip trailing "*/".
561         content->erase(content->size() - 2);
562       }
563       break;
564     } else if (TryConsume('/') && current_char_ == '*') {
565       // Note:  We didn't consume the '*' because if there is a '/' after it
566       //   we want to interpret that as the end of the comment.
567       AddError(
568           "\"/*\" inside block comment.  Block comments cannot be nested.");
569     } else if (current_char_ == '\0') {
570       AddError("End-of-file inside block comment.");
571       error_collector_->AddError(start_line, start_column,
572                                  "  Comment started here.");
573       if (content != NULL) StopRecording();
574       break;
575     }
576   }
577 }
578 
TryConsumeCommentStart()579 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
580   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
581     if (TryConsume('/')) {
582       return LINE_COMMENT;
583     } else if (TryConsume('*')) {
584       return BLOCK_COMMENT;
585     } else {
586       // Oops, it was just a slash.  Return it.
587       current_.type = TYPE_SYMBOL;
588       current_.text = "/";
589       current_.line = line_;
590       current_.column = column_ - 1;
591       current_.end_column = column_;
592       return SLASH_NOT_COMMENT;
593     }
594   } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
595     return LINE_COMMENT;
596   } else {
597     return NO_COMMENT;
598   }
599 }
600 
TryConsumeWhitespace()601 bool Tokenizer::TryConsumeWhitespace() {
602   if (report_newlines_) {
603     if (TryConsumeOne<WhitespaceNoNewline>()) {
604       ConsumeZeroOrMore<WhitespaceNoNewline>();
605       current_.type = TYPE_WHITESPACE;
606       return true;
607     }
608     return false;
609   }
610   if (TryConsumeOne<Whitespace>()) {
611     ConsumeZeroOrMore<Whitespace>();
612     current_.type = TYPE_WHITESPACE;
613     return report_whitespace_;
614   }
615   return false;
616 }
617 
TryConsumeNewline()618 bool Tokenizer::TryConsumeNewline() {
619   if (!report_whitespace_ || !report_newlines_) {
620     return false;
621   }
622   if (TryConsume('\n')) {
623     current_.type = TYPE_NEWLINE;
624     return true;
625   }
626   return false;
627 }
628 
629 // -------------------------------------------------------------------
630 
Next()631 bool Tokenizer::Next() {
632   previous_ = current_;
633 
634   while (!read_error_) {
635     StartToken();
636     bool report_token = TryConsumeWhitespace() || TryConsumeNewline();
637     EndToken();
638     if (report_token) {
639       return true;
640     }
641 
642     switch (TryConsumeCommentStart()) {
643       case LINE_COMMENT:
644         ConsumeLineComment(NULL);
645         continue;
646       case BLOCK_COMMENT:
647         ConsumeBlockComment(NULL);
648         continue;
649       case SLASH_NOT_COMMENT:
650         return true;
651       case NO_COMMENT:
652         break;
653     }
654 
655     // Check for EOF before continuing.
656     if (read_error_) break;
657 
658     if (LookingAt<Unprintable>() || current_char_ == '\0') {
659       AddError("Invalid control characters encountered in text.");
660       NextChar();
661       // Skip more unprintable characters, too.  But, remember that '\0' is
662       // also what current_char_ is set to after EOF / read error.  We have
663       // to be careful not to go into an infinite loop of trying to consume
664       // it, so make sure to check read_error_ explicitly before consuming
665       // '\0'.
666       while (TryConsumeOne<Unprintable>() ||
667              (!read_error_ && TryConsume('\0'))) {
668         // Ignore.
669       }
670 
671     } else {
672       // Reading some sort of token.
673       StartToken();
674 
675       if (TryConsumeOne<Letter>()) {
676         ConsumeZeroOrMore<Alphanumeric>();
677         current_.type = TYPE_IDENTIFIER;
678       } else if (TryConsume('0')) {
679         current_.type = ConsumeNumber(true, false);
680       } else if (TryConsume('.')) {
681         // This could be the beginning of a floating-point number, or it could
682         // just be a '.' symbol.
683 
684         if (TryConsumeOne<Digit>()) {
685           // It's a floating-point number.
686           if (previous_.type == TYPE_IDENTIFIER &&
687               current_.line == previous_.line &&
688               current_.column == previous_.end_column) {
689             // We don't accept syntax like "blah.123".
690             error_collector_->AddError(
691                 line_, column_ - 2,
692                 "Need space between identifier and decimal point.");
693           }
694           current_.type = ConsumeNumber(false, true);
695         } else {
696           current_.type = TYPE_SYMBOL;
697         }
698       } else if (TryConsumeOne<Digit>()) {
699         current_.type = ConsumeNumber(false, false);
700       } else if (TryConsume('\"')) {
701         ConsumeString('\"');
702         current_.type = TYPE_STRING;
703       } else if (TryConsume('\'')) {
704         ConsumeString('\'');
705         current_.type = TYPE_STRING;
706       } else {
707         // Check if the high order bit is set.
708         if (current_char_ & 0x80) {
709           error_collector_->AddError(
710               line_, column_,
711               StringPrintf("Interpreting non ascii codepoint %d.",
712                               static_cast<unsigned char>(current_char_)));
713         }
714         NextChar();
715         current_.type = TYPE_SYMBOL;
716       }
717 
718       EndToken();
719       return true;
720     }
721   }
722 
723   // EOF
724   current_.type = TYPE_END;
725   current_.text.clear();
726   current_.line = line_;
727   current_.column = column_;
728   current_.end_column = column_;
729   return false;
730 }
731 
732 namespace {
733 
734 // Helper class for collecting comments and putting them in the right places.
735 //
736 // This basically just buffers the most recent comment until it can be decided
737 // exactly where that comment should be placed.  When Flush() is called, the
738 // current comment goes into either prev_trailing_comments or detached_comments.
739 // When the CommentCollector is destroyed, the last buffered comment goes into
740 // next_leading_comments.
class CommentCollector {
 public:
  // Each destination may be NULL, in which case comments bound for it are
  // dropped.  All non-NULL destinations are emptied up front.
  CommentCollector(std::string* prev_trailing_comments,
                   std::vector<std::string>* detached_comments,
                   std::string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments_ != NULL) {
      prev_trailing_comments_->clear();
    }
    if (detached_comments_ != NULL) {
      detached_comments_->clear();
    }
    if (next_leading_comments_ != NULL) {
      next_leading_comments_->clear();
    }
  }

  // A comment that is still buffered at destruction time is the leading
  // comment of the next token.
  ~CommentCollector() {
    if (has_comment_ && next_leading_comments_ != NULL) {
      next_leading_comments_->swap(comment_buffer_);
    }
  }

  // Returns the buffer into which the upcoming line comment should be read.
  // Consecutive line comments accumulate in the same buffer, whereas a
  // buffered block comment is flushed first.
  std::string* GetBufferForLineComment() {
    const bool holds_block_comment = has_comment_ && !is_line_comment_;
    if (holds_block_comment) {
      Flush();
    }
    is_line_comment_ = true;
    has_comment_ = true;
    return &comment_buffer_;
  }

  // Returns the buffer into which the upcoming block comment should be read.
  // A block comment never merges with an earlier comment, so whatever is
  // buffered gets flushed first.
  std::string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    is_line_comment_ = false;
    has_comment_ = true;
    return &comment_buffer_;
  }

  // Discards the buffered comment, if any.
  void ClearBuffer() {
    has_comment_ = false;
    comment_buffer_.clear();
  }

  // Called once the buffered comment is known to be complete and *not*
  // attached to the next token.  The first comment flushed while attaching to
  // the previous token is still possible becomes its trailing comment; every
  // other one becomes a detached comment.
  void Flush() {
    if (!has_comment_) {
      return;
    }
    if (can_attach_to_prev_) {
      if (prev_trailing_comments_ != NULL) {
        prev_trailing_comments_->append(comment_buffer_);
      }
      can_attach_to_prev_ = false;
    } else if (detached_comments_ != NULL) {
      detached_comments_->push_back(comment_buffer_);
    }
    ClearBuffer();
  }

  // From now on, no comment may be attached to the previous token.
  void DetachFromPrev() { can_attach_to_prev_ = false; }

 private:
  // Destinations supplied by the caller; any of them may be NULL.
  std::string* prev_trailing_comments_;
  std::vector<std::string>* detached_comments_;
  std::string* next_leading_comments_;

  // The most recent comment, kept until its placement can be decided.
  std::string comment_buffer_;

  // Whether a comment has been read into comment_buffer_.  This can be true
  // while comment_buffer_ is empty, namely when the comment was "/**/".
  bool has_comment_;

  // Whether the buffered comment is a line comment.
  bool is_line_comment_;

  // Whether the comment being read could still belong to the previous token.
  bool can_attach_to_prev_;
};
830 
831 }  // namespace
832 
NextWithComments(std::string * prev_trailing_comments,std::vector<std::string> * detached_comments,std::string * next_leading_comments)833 bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
834                                  std::vector<std::string>* detached_comments,
835                                  std::string* next_leading_comments) {
836   CommentCollector collector(prev_trailing_comments, detached_comments,
837                              next_leading_comments);
838 
839   if (current_.type == TYPE_START) {
840     // Ignore unicode byte order mark(BOM) if it appears at the file
841     // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
842     if (TryConsume(static_cast<char>(0xEF))) {
843       if (!TryConsume(static_cast<char>(0xBB)) ||
844           !TryConsume(static_cast<char>(0xBF))) {
845         AddError(
846             "Proto file starts with 0xEF but not UTF-8 BOM. "
847             "Only UTF-8 is accepted for proto file.");
848         return false;
849       }
850     }
851     collector.DetachFromPrev();
852   } else {
853     // A comment appearing on the same line must be attached to the previous
854     // declaration.
855     ConsumeZeroOrMore<WhitespaceNoNewline>();
856     switch (TryConsumeCommentStart()) {
857       case LINE_COMMENT:
858         ConsumeLineComment(collector.GetBufferForLineComment());
859 
860         // Don't allow comments on subsequent lines to be attached to a trailing
861         // comment.
862         collector.Flush();
863         break;
864       case BLOCK_COMMENT:
865         ConsumeBlockComment(collector.GetBufferForBlockComment());
866 
867         ConsumeZeroOrMore<WhitespaceNoNewline>();
868         if (!TryConsume('\n')) {
869           // Oops, the next token is on the same line.  If we recorded a comment
870           // we really have no idea which token it should be attached to.
871           collector.ClearBuffer();
872           return Next();
873         }
874 
875         // Don't allow comments on subsequent lines to be attached to a trailing
876         // comment.
877         collector.Flush();
878         break;
879       case SLASH_NOT_COMMENT:
880         return true;
881       case NO_COMMENT:
882         if (!TryConsume('\n')) {
883           // The next token is on the same line.  There are no comments.
884           return Next();
885         }
886         break;
887     }
888   }
889 
890   // OK, we are now on the line *after* the previous token.
891   while (true) {
892     ConsumeZeroOrMore<WhitespaceNoNewline>();
893 
894     switch (TryConsumeCommentStart()) {
895       case LINE_COMMENT:
896         ConsumeLineComment(collector.GetBufferForLineComment());
897         break;
898       case BLOCK_COMMENT:
899         ConsumeBlockComment(collector.GetBufferForBlockComment());
900 
901         // Consume the rest of the line so that we don't interpret it as a
902         // blank line the next time around the loop.
903         ConsumeZeroOrMore<WhitespaceNoNewline>();
904         TryConsume('\n');
905         break;
906       case SLASH_NOT_COMMENT:
907         return true;
908       case NO_COMMENT:
909         if (TryConsume('\n')) {
910           // Completely blank line.
911           collector.Flush();
912           collector.DetachFromPrev();
913         } else {
914           bool result = Next();
915           if (!result || current_.text == "}" || current_.text == "]" ||
916               current_.text == ")") {
917             // It looks like we're at the end of a scope.  In this case it
918             // makes no sense to attach a comment to the following token.
919             collector.Flush();
920           }
921           return result;
922         }
923         break;
924     }
925   }
926 }
927 
928 // -------------------------------------------------------------------
929 // Token-parsing helpers.  Remember that these don't need to report
930 // errors since any errors should already have been reported while
931 // tokenizing.  Also, these can assume that whatever text they
932 // are given is text that the tokenizer actually parsed as a token
933 // of the given type.
934 
ParseInteger(const std::string & text,uint64_t max_value,uint64_t * output)935 bool Tokenizer::ParseInteger(const std::string& text, uint64_t max_value,
936                              uint64_t* output) {
937   // We can't just use strtoull() because (a) it accepts negative numbers,
938   // (b) We want additional range checks, (c) it reports overflows via errno.
939 
940 #if 0
941   const char *str_begin = text.c_str();
942   if (*str_begin == '-') return false;
943   char *str_end = nullptr;
944   errno = 0;
945   *output = std::strtoull(str_begin, &str_end, 0);
946   return (errno == 0 && str_end && *str_end == '\0' && *output <= max_value);
947 #endif
948 
949   const char* ptr = text.c_str();
950   int base = 10;
951   uint64_t overflow_if_mul_base = (kuint64max / 10) + 1;
952   if (ptr[0] == '0') {
953     if (ptr[1] == 'x' || ptr[1] == 'X') {
954       // This is hex.
955       base = 16;
956       overflow_if_mul_base = (kuint64max / 16) + 1;
957       ptr += 2;
958     } else {
959       // This is octal.
960       base = 8;
961       overflow_if_mul_base = (kuint64max / 8) + 1;
962     }
963   }
964 
965   uint64_t result = 0;
966   // For all the leading '0's, and also the first non-zero character, we
967   // don't need to multiply.
968   while (*ptr != '\0') {
969     int digit = DigitValue(*ptr++);
970     if (digit >= base) {
971       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
972       // token, but Tokenizer still think it's integer.
973       return false;
974     }
975     if (digit != 0) {
976       result = digit;
977       break;
978     }
979   }
980   for (; *ptr != '\0'; ptr++) {
981     int digit = DigitValue(*ptr);
982     if (digit < 0 || digit >= base) {
983       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
984       // token, but Tokenizer still think it's integer.
985       return false;
986     }
987     if (result >= overflow_if_mul_base) {
988       // We know the multiply we're about to do will overflow, so exit now.
989       return false;
990     }
991     // We know that result * base won't overflow, but adding digit might...
992     result = result * base + digit;
993     // C++ guarantees defined "wrap" semantics when unsigned integer
994     // operations overflow, making this a fast way to check if adding
995     // digit made result overflow, and thus, wrap around.
996     if (result < static_cast<uint64_t>(base)) return false;
997   }
998   if (result > max_value) return false;
999 
1000   *output = result;
1001   return true;
1002 }
1003 
ParseFloat(const std::string & text)1004 double Tokenizer::ParseFloat(const std::string& text) {
1005   const char* start = text.c_str();
1006   char* end;
1007   double result = NoLocaleStrtod(start, &end);
1008 
1009   // "1e" is not a valid float, but if the tokenizer reads it, it will
1010   // report an error but still return it as a valid token.  We need to
1011   // accept anything the tokenizer could possibly return, error or not.
1012   if (*end == 'e' || *end == 'E') {
1013     ++end;
1014     if (*end == '-' || *end == '+') ++end;
1015   }
1016 
1017   // If the Tokenizer had allow_f_after_float_ enabled, the float may be
1018   // suffixed with the letter 'f'.
1019   if (*end == 'f' || *end == 'F') {
1020     ++end;
1021   }
1022 
1023   GOOGLE_LOG_IF(DFATAL,
1024          static_cast<size_t>(end - start) != text.size() || *start == '-')
1025       << " Tokenizer::ParseFloat() passed text that could not have been"
1026          " tokenized as a float: "
1027       << CEscape(text);
1028   return result;
1029 }
1030 
1031 // Helper to append a Unicode code point to a string as UTF8, without bringing
1032 // in any external dependencies.
AppendUTF8(uint32_t code_point,std::string * output)1033 static void AppendUTF8(uint32_t code_point, std::string* output) {
1034   uint32_t tmp = 0;
1035   int len = 0;
1036   if (code_point <= 0x7f) {
1037     tmp = code_point;
1038     len = 1;
1039   } else if (code_point <= 0x07ff) {
1040     tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
1041     len = 2;
1042   } else if (code_point <= 0xffff) {
1043     tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
1044           ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
1045     len = 3;
1046   } else if (code_point <= 0x10ffff) {
1047     tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
1048           ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
1049           (code_point & 0x003f);
1050     len = 4;
1051   } else {
1052     // Unicode code points end at 0x10FFFF, so this is out-of-range.
1053     // ConsumeString permits hex values up to 0x1FFFFF, and FetchUnicodePoint
1054     // doesn't perform a range check.
1055     StringAppendF(output, "\\U%08x", code_point);
1056     return;
1057   }
1058   tmp = ghtonl(tmp);
1059   output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
1060 }
1061 
1062 // Try to read <len> hex digits from ptr, and stuff the numeric result into
1063 // *result. Returns true if that many digits were successfully consumed.
ReadHexDigits(const char * ptr,int len,uint32_t * result)1064 static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
1065   *result = 0;
1066   if (len == 0) return false;
1067   for (const char* end = ptr + len; ptr < end; ++ptr) {
1068     if (*ptr == '\0') return false;
1069     *result = (*result << 4) + DigitValue(*ptr);
1070   }
1071   return true;
1072 }
1073 
// UTF-16 surrogate pairs.  UTF-16 represents the code points 0x10000 through
// 0x10ffff as two 16-bit units: a head surrogate followed by a trail
// surrogate.  Both come from a reserved range of Unicode code points, so a
// pair found in an escape sequence can be recognised and folded into the
// single code point it stands for.
//
// The "Min" bounds are inclusive and the "Max" bounds are exclusive.
static constexpr uint32_t kMinHeadSurrogate = 0xd800;
static constexpr uint32_t kMaxHeadSurrogate = 0xdc00;
static constexpr uint32_t kMinTrailSurrogate = 0xdc00;
static constexpr uint32_t kMaxTrailSurrogate = 0xe000;
1083 
IsHeadSurrogate(uint32_t code_point)1084 static inline bool IsHeadSurrogate(uint32_t code_point) {
1085   return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
1086 }
1087 
IsTrailSurrogate(uint32_t code_point)1088 static inline bool IsTrailSurrogate(uint32_t code_point) {
1089   return (code_point >= kMinTrailSurrogate) &&
1090          (code_point < kMaxTrailSurrogate);
1091 }
1092 
1093 // Combine a head and trail surrogate into a single Unicode code point.
AssembleUTF16(uint32_t head_surrogate,uint32_t trail_surrogate)1094 static uint32_t AssembleUTF16(uint32_t head_surrogate,
1095                               uint32_t trail_surrogate) {
1096   GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
1097   GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
1098   return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
1099                     (trail_surrogate - kMinTrailSurrogate));
1100 }
1101 
// Maps the letter that introduces a Unicode escape to the number of hex
// digits expected after it: 4 for 'u', 8 for 'U', and 0 for anything else.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1108 
1109 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1110 // to parse that sequence. On success, returns a pointer to the first char
1111 // beyond that sequence, and fills in *code_point. On failure, returns ptr
1112 // itself.
FetchUnicodePoint(const char * ptr,uint32_t * code_point)1113 static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
1114   const char* p = ptr;
1115   // Fetch the code point.
1116   const int len = UnicodeLength(*p++);
1117   if (!ReadHexDigits(p, len, code_point)) return ptr;
1118   p += len;
1119 
1120   // Check if the code point we read is a "head surrogate." If so, then we
1121   // expect it to be immediately followed by another code point which is a valid
1122   // "trail surrogate," and together they form a UTF-16 pair which decodes into
1123   // a single Unicode point. Trail surrogates may only use \u, not \U.
1124   if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
1125     uint32_t trail_surrogate;
1126     if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
1127         IsTrailSurrogate(trail_surrogate)) {
1128       *code_point = AssembleUTF16(*code_point, trail_surrogate);
1129       p += 6;
1130     }
1131     // If this failed, then we just emit the head surrogate as a code point.
1132     // It's bogus, but so is the string.
1133   }
1134 
1135   return p;
1136 }
1137 
1138 // The text string must begin and end with single or double quote
1139 // characters.
ParseStringAppend(const std::string & text,std::string * output)1140 void Tokenizer::ParseStringAppend(const std::string& text,
1141                                   std::string* output) {
1142   // Reminder: text[0] is always a quote character.  (If text is
1143   // empty, it's invalid, so we'll just return).
1144   const size_t text_size = text.size();
1145   if (text_size == 0) {
1146     GOOGLE_LOG(DFATAL) << " Tokenizer::ParseStringAppend() passed text that could not"
1147                    " have been tokenized as a string: "
1148                 << CEscape(text);
1149     return;
1150   }
1151 
1152   // Reserve room for new string. The branch is necessary because if
1153   // there is already space available the reserve() call might
1154   // downsize the output.
1155   const size_t new_len = text_size + output->size();
1156   if (new_len > output->capacity()) {
1157     output->reserve(new_len);
1158   }
1159 
1160   // Loop through the string copying characters to "output" and
1161   // interpreting escape sequences.  Note that any invalid escape
1162   // sequences or other errors were already reported while tokenizing.
1163   // In this case we do not need to produce valid results.
1164   for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1165     if (*ptr == '\\' && ptr[1] != '\0') {
1166       // An escape sequence.
1167       ++ptr;
1168 
1169       if (OctalDigit::InClass(*ptr)) {
1170         // An octal escape.  May one, two, or three digits.
1171         int code = DigitValue(*ptr);
1172         if (OctalDigit::InClass(ptr[1])) {
1173           ++ptr;
1174           code = code * 8 + DigitValue(*ptr);
1175         }
1176         if (OctalDigit::InClass(ptr[1])) {
1177           ++ptr;
1178           code = code * 8 + DigitValue(*ptr);
1179         }
1180         output->push_back(static_cast<char>(code));
1181 
1182       } else if (*ptr == 'x') {
1183         // A hex escape.  May zero, one, or two digits.  (The zero case
1184         // will have been caught as an error earlier.)
1185         int code = 0;
1186         if (HexDigit::InClass(ptr[1])) {
1187           ++ptr;
1188           code = DigitValue(*ptr);
1189         }
1190         if (HexDigit::InClass(ptr[1])) {
1191           ++ptr;
1192           code = code * 16 + DigitValue(*ptr);
1193         }
1194         output->push_back(static_cast<char>(code));
1195 
1196       } else if (*ptr == 'u' || *ptr == 'U') {
1197         uint32_t unicode;
1198         const char* end = FetchUnicodePoint(ptr, &unicode);
1199         if (end == ptr) {
1200           // Failure: Just dump out what we saw, don't try to parse it.
1201           output->push_back(*ptr);
1202         } else {
1203           AppendUTF8(unicode, output);
1204           ptr = end - 1;  // Because we're about to ++ptr.
1205         }
1206       } else {
1207         // Some other escape code.
1208         output->push_back(TranslateEscape(*ptr));
1209       }
1210 
1211     } else if (*ptr == text[0] && ptr[1] == '\0') {
1212       // Ignore final quote matching the starting quote.
1213     } else {
1214       output->push_back(*ptr);
1215     }
1216   }
1217 }
1218 
// Returns true if CharacterClass::InClass() accepts every character of s.
// An empty string trivially satisfies this.
template <typename CharacterClass>
static bool AllInClass(const std::string& s) {
  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
    if (!CharacterClass::InClass(*it)) return false;
  }
  return true;
}
1226 
IsIdentifier(const std::string & text)1227 bool Tokenizer::IsIdentifier(const std::string& text) {
1228   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1229   if (text.size() == 0) return false;
1230   if (!Letter::InClass(text.at(0))) return false;
1231   if (!AllInClass<Alphanumeric>(text.substr(1))) return false;
1232   return true;
1233 }
1234 
1235 }  // namespace io
1236 }  // namespace protobuf
1237 }  // namespace google
1238 
1239 #include <google/protobuf/port_undef.inc>
1240