1 // Copyright 2014 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // A streaming validator for UTF-8. Validation is based on the definition in 6 // RFC-3629. In particular, it does not reject the invalid characters rejected 7 // by base::IsStringUTF8(). 8 // 9 // The implementation detects errors on the first possible byte. 10 11 #ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 12 #define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 13 14 #include <stddef.h> 15 #include <stdint.h> 16 17 #include <string> 18 19 #include "base/containers/span.h" 20 #include "base/i18n/base_i18n_export.h" 21 22 namespace base { 23 24 class BASE_I18N_EXPORT StreamingUtf8Validator { 25 public: 26 // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it 27 // processes characters it alternates between VALID_ENDPOINT and 28 // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the 29 // state changes permanently to INVALID. 30 enum State { 31 VALID_ENDPOINT, 32 VALID_MIDPOINT, 33 INVALID 34 }; 35 StreamingUtf8Validator()36 StreamingUtf8Validator() : state_(0u) {} 37 38 // This type could be made copyable but there is currently no use-case for 39 // it. 40 StreamingUtf8Validator(const StreamingUtf8Validator&) = delete; 41 StreamingUtf8Validator& operator=(const StreamingUtf8Validator&) = delete; 42 43 // Trivial destructor intentionally omitted. 44 45 // Validate bytes described by |data|. If the concatenation of all calls 46 // to AddBytes() since this object was constructed or reset is a valid UTF-8 47 // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8 48 // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was 49 // present, returns INVALID. 50 State AddBytes(base::span<const uint8_t> data); 51 52 // Return the object to a freshly-constructed state so that it can be re-used. 53 void Reset(); 54 55 // Validate a complete string using the same criteria. Returns true if the 56 // string only contains complete, valid UTF-8 codepoints. 57 static bool Validate(const std::string& string); 58 59 private: 60 // The current state of the validator. Value 0 is the initial/valid state. 61 // The state is stored as an offset into |kUtf8ValidatorTables|. The special 62 // state |kUtf8InvalidState| is invalid. 63 uint8_t state_; 64 }; 65 66 } // namespace base 67 68 #endif // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ 69