xref: /aosp_15_r20/external/cronet/base/i18n/char_iterator.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_I18N_CHAR_ITERATOR_H_
6 #define BASE_I18N_CHAR_ITERATOR_H_
7 
8 #include <stdint.h>
9 
10 #include <string_view>
11 
12 #include "base/i18n/base_i18n_export.h"
13 
14 // The CharIterator classes iterate through the characters in UTF8 and
15 // UTF16 strings.  Example usage:
16 //
17 //   for (UTF8CharIterator iter(str); !iter.end(); iter.Advance()) {
18 //     VLOG(1) << iter.get();
19 //   }
20 
21 namespace base {
22 namespace i18n {
23 
24 class BASE_I18N_EXPORT UTF8CharIterator {
25  public:
26   // Requires |str| to live as long as the UTF8CharIterator does.
27   explicit UTF8CharIterator(std::string_view str);
28   UTF8CharIterator(const UTF8CharIterator&) = delete;
29   UTF8CharIterator& operator=(const UTF8CharIterator&) = delete;
30   ~UTF8CharIterator();
31 
32   // Return the starting array index of the current character within the
33   // string.
array_pos()34   size_t array_pos() const { return array_pos_; }
35 
36   // Return the logical index of the current character, independent of the
37   // number of bytes each character takes.
char_pos()38   size_t char_pos() const { return char_pos_; }
39 
40   // Return the current char.
get()41   int32_t get() const { return char_; }
42 
43   // Returns true if we're at the end of the string.
end()44   bool end() const { return array_pos_ == str_.length(); }
45 
46   // Advance to the next actual character.  Returns false if we're at the
47   // end of the string.
48   bool Advance();
49 
50  private:
51   // The string we're iterating over.
52   std::string_view str_;
53 
54   // Array index.
55   size_t array_pos_;
56 
57   // The next array index.
58   size_t next_pos_;
59 
60   // Character index.
61   size_t char_pos_;
62 
63   // The current character.
64   int32_t char_;
65 };
66 
67 class BASE_I18N_EXPORT UTF16CharIterator {
68  public:
69   // Requires |str| to live as long as the UTF16CharIterator does.
70   explicit UTF16CharIterator(std::u16string_view str);
71   UTF16CharIterator(UTF16CharIterator&& to_move);
72   UTF16CharIterator& operator=(UTF16CharIterator&& to_move);
73 
74   UTF16CharIterator(const UTF16CharIterator&) = delete;
75   UTF16CharIterator operator=(const UTF16CharIterator&) = delete;
76 
77   ~UTF16CharIterator();
78 
79   // Returns an iterator starting on the unicode character at offset
80   // |array_index| into the string, or the previous array offset if
81   // |array_index| is the second half of a surrogate pair.
82   static UTF16CharIterator LowerBound(std::u16string_view str,
83                                       size_t array_index);
84 
85   // Returns an iterator starting on the unicode character at offset
86   // |array_index| into the string, or the next offset if |array_index| is the
87   // second half of a surrogate pair.
88   static UTF16CharIterator UpperBound(std::u16string_view str,
89                                       size_t array_index);
90 
91   // Return the starting array index of the current character within the
92   // string.
array_pos()93   size_t array_pos() const { return array_pos_; }
94 
95   // Returns the offset in code points from the initial iterator position, which
96   // could be negative if Rewind() is called. The initial value is always zero,
97   // regardless of how the iterator is constructed.
char_offset()98   int32_t char_offset() const { return char_offset_; }
99 
100   // Returns the code point at the current position.
get()101   int32_t get() const { return char_; }
102 
103   // Returns the code point (i.e. the full Unicode character, not half of a
104   // surrogate pair) following the current one. Should not be called if end() is
105   // true. If the current code point is the last one in the string, returns
106   // zero.
107   int32_t NextCodePoint() const;
108 
109   // Returns the code point (i.e. the full Unicode character, not half of a
110   // surrogate pair) preceding the current one. Should not be called if start()
111   // is true.
112   int32_t PreviousCodePoint() const;
113 
114   // Returns true if we're at the start of the string.
start()115   bool start() const { return array_pos_ == 0; }
116 
117   // Returns true if we're at the end of the string.
end()118   bool end() const { return array_pos_ == str_.length(); }
119 
120   // Advances to the next actual character.  Returns false if we're at the
121   // end of the string.
122   bool Advance();
123 
124   // Moves to the previous actual character. Returns false if we're at the start
125   // of the string.
126   bool Rewind();
127 
128  private:
129   UTF16CharIterator(std::u16string_view str, size_t initial_pos);
130 
131   // Fills in the current character we found and advances to the next
132   // character, updating all flags as necessary.
133   void ReadChar();
134 
135   // The string we're iterating over.
136   std::u16string_view str_;
137 
138   // Array index.
139   size_t array_pos_;
140 
141   // The next array index.
142   size_t next_pos_;
143 
144   // Character offset from the initial position of the iterator.
145   int32_t char_offset_;
146 
147   // The current character.
148   int32_t char_;
149 };
150 
151 }  // namespace i18n
152 }  // namespace base
153 
154 #endif  // BASE_I18N_CHAR_ITERATOR_H_
155