// xref: /aosp_15_r20/external/cronet/base/i18n/streaming_utf8_validator.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // A streaming validator for UTF-8. Validation is based on the definition in
6 // RFC-3629. In particular, it does not reject the invalid characters rejected
7 // by base::IsStringUTF8().
8 //
9 // The implementation detects errors on the first possible byte.
10 
11 #ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
12 #define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
13 
14 #include <stddef.h>
15 #include <stdint.h>
16 
17 #include <string>
18 
19 #include "base/containers/span.h"
20 #include "base/i18n/base_i18n_export.h"
21 
22 namespace base {
23 
24 class BASE_I18N_EXPORT StreamingUtf8Validator {
25  public:
26   // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
27   // processes characters it alternates between VALID_ENDPOINT and
28   // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
29   // state changes permanently to INVALID.
30   enum State {
31     VALID_ENDPOINT,
32     VALID_MIDPOINT,
33     INVALID
34   };
35 
StreamingUtf8Validator()36   StreamingUtf8Validator() : state_(0u) {}
37 
38   // This type could be made copyable but there is currently no use-case for
39   // it.
40   StreamingUtf8Validator(const StreamingUtf8Validator&) = delete;
41   StreamingUtf8Validator& operator=(const StreamingUtf8Validator&) = delete;
42 
43   // Trivial destructor intentionally omitted.
44 
45   // Validate bytes described by |data|. If the concatenation of all calls
46   // to AddBytes() since this object was constructed or reset is a valid UTF-8
47   // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
48   // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
49   // present, returns INVALID.
50   State AddBytes(base::span<const uint8_t> data);
51 
52   // Return the object to a freshly-constructed state so that it can be re-used.
53   void Reset();
54 
55   // Validate a complete string using the same criteria. Returns true if the
56   // string only contains complete, valid UTF-8 codepoints.
57   static bool Validate(const std::string& string);
58 
59  private:
60   // The current state of the validator. Value 0 is the initial/valid state.
61   // The state is stored as an offset into |kUtf8ValidatorTables|. The special
62   // state |kUtf8InvalidState| is invalid.
63   uint8_t state_;
64 };
65 
66 }  // namespace base
67 
68 #endif  // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
69