1*6777b538SAndroid Build Coastguard Worker // Copyright 2011 The Chromium Authors 2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be 3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file. 4*6777b538SAndroid Build Coastguard Worker 5*6777b538SAndroid Build Coastguard Worker #ifndef BASE_I18N_CHAR_ITERATOR_H_ 6*6777b538SAndroid Build Coastguard Worker #define BASE_I18N_CHAR_ITERATOR_H_ 7*6777b538SAndroid Build Coastguard Worker 8*6777b538SAndroid Build Coastguard Worker #include <stdint.h> 9*6777b538SAndroid Build Coastguard Worker 10*6777b538SAndroid Build Coastguard Worker #include <string_view> 11*6777b538SAndroid Build Coastguard Worker 12*6777b538SAndroid Build Coastguard Worker #include "base/i18n/base_i18n_export.h" 13*6777b538SAndroid Build Coastguard Worker 14*6777b538SAndroid Build Coastguard Worker // The CharIterator classes iterate through the characters in UTF8 and 15*6777b538SAndroid Build Coastguard Worker // UTF16 strings. Example usage: 16*6777b538SAndroid Build Coastguard Worker // 17*6777b538SAndroid Build Coastguard Worker // for (UTF8CharIterator iter(str); !iter.end(); iter.Advance()) { 18*6777b538SAndroid Build Coastguard Worker // VLOG(1) << iter.get(); 19*6777b538SAndroid Build Coastguard Worker // } 20*6777b538SAndroid Build Coastguard Worker 21*6777b538SAndroid Build Coastguard Worker namespace base { 22*6777b538SAndroid Build Coastguard Worker namespace i18n { 23*6777b538SAndroid Build Coastguard Worker 24*6777b538SAndroid Build Coastguard Worker class BASE_I18N_EXPORT UTF8CharIterator { 25*6777b538SAndroid Build Coastguard Worker public: 26*6777b538SAndroid Build Coastguard Worker // Requires |str| to live as long as the UTF8CharIterator does. 27*6777b538SAndroid Build Coastguard Worker explicit UTF8CharIterator(std::string_view str); 28*6777b538SAndroid Build Coastguard Worker UTF8CharIterator(const UTF8CharIterator&) = delete; 29*6777b538SAndroid Build Coastguard Worker UTF8CharIterator& operator=(const UTF8CharIterator&) = delete; 30*6777b538SAndroid Build Coastguard Worker ~UTF8CharIterator(); 31*6777b538SAndroid Build Coastguard Worker 32*6777b538SAndroid Build Coastguard Worker // Return the starting array index of the current character within the 33*6777b538SAndroid Build Coastguard Worker // string. array_pos()34*6777b538SAndroid Build Coastguard Worker size_t array_pos() const { return array_pos_; } 35*6777b538SAndroid Build Coastguard Worker 36*6777b538SAndroid Build Coastguard Worker // Return the logical index of the current character, independent of the 37*6777b538SAndroid Build Coastguard Worker // number of bytes each character takes. char_pos()38*6777b538SAndroid Build Coastguard Worker size_t char_pos() const { return char_pos_; } 39*6777b538SAndroid Build Coastguard Worker 40*6777b538SAndroid Build Coastguard Worker // Return the current char. get()41*6777b538SAndroid Build Coastguard Worker int32_t get() const { return char_; } 42*6777b538SAndroid Build Coastguard Worker 43*6777b538SAndroid Build Coastguard Worker // Returns true if we're at the end of the string. end()44*6777b538SAndroid Build Coastguard Worker bool end() const { return array_pos_ == str_.length(); } 45*6777b538SAndroid Build Coastguard Worker 46*6777b538SAndroid Build Coastguard Worker // Advance to the next actual character. Returns false if we're at the 47*6777b538SAndroid Build Coastguard Worker // end of the string. 48*6777b538SAndroid Build Coastguard Worker bool Advance(); 49*6777b538SAndroid Build Coastguard Worker 50*6777b538SAndroid Build Coastguard Worker private: 51*6777b538SAndroid Build Coastguard Worker // The string we're iterating over. 52*6777b538SAndroid Build Coastguard Worker std::string_view str_; 53*6777b538SAndroid Build Coastguard Worker 54*6777b538SAndroid Build Coastguard Worker // Array index. 55*6777b538SAndroid Build Coastguard Worker size_t array_pos_; 56*6777b538SAndroid Build Coastguard Worker 57*6777b538SAndroid Build Coastguard Worker // The next array index. 58*6777b538SAndroid Build Coastguard Worker size_t next_pos_; 59*6777b538SAndroid Build Coastguard Worker 60*6777b538SAndroid Build Coastguard Worker // Character index. 61*6777b538SAndroid Build Coastguard Worker size_t char_pos_; 62*6777b538SAndroid Build Coastguard Worker 63*6777b538SAndroid Build Coastguard Worker // The current character. 64*6777b538SAndroid Build Coastguard Worker int32_t char_; 65*6777b538SAndroid Build Coastguard Worker }; 66*6777b538SAndroid Build Coastguard Worker 67*6777b538SAndroid Build Coastguard Worker class BASE_I18N_EXPORT UTF16CharIterator { 68*6777b538SAndroid Build Coastguard Worker public: 69*6777b538SAndroid Build Coastguard Worker // Requires |str| to live as long as the UTF16CharIterator does. 70*6777b538SAndroid Build Coastguard Worker explicit UTF16CharIterator(std::u16string_view str); 71*6777b538SAndroid Build Coastguard Worker UTF16CharIterator(UTF16CharIterator&& to_move); 72*6777b538SAndroid Build Coastguard Worker UTF16CharIterator& operator=(UTF16CharIterator&& to_move); 73*6777b538SAndroid Build Coastguard Worker 74*6777b538SAndroid Build Coastguard Worker UTF16CharIterator(const UTF16CharIterator&) = delete; 75*6777b538SAndroid Build Coastguard Worker UTF16CharIterator operator=(const UTF16CharIterator&) = delete; 76*6777b538SAndroid Build Coastguard Worker 77*6777b538SAndroid Build Coastguard Worker ~UTF16CharIterator(); 78*6777b538SAndroid Build Coastguard Worker 79*6777b538SAndroid Build Coastguard Worker // Returns an iterator starting on the unicode character at offset 80*6777b538SAndroid Build Coastguard Worker // |array_index| into the string, or the previous array offset if 81*6777b538SAndroid Build Coastguard Worker // |array_index| is the second half of a surrogate pair. 82*6777b538SAndroid Build Coastguard Worker static UTF16CharIterator LowerBound(std::u16string_view str, 83*6777b538SAndroid Build Coastguard Worker size_t array_index); 84*6777b538SAndroid Build Coastguard Worker 85*6777b538SAndroid Build Coastguard Worker // Returns an iterator starting on the unicode character at offset 86*6777b538SAndroid Build Coastguard Worker // |array_index| into the string, or the next offset if |array_index| is the 87*6777b538SAndroid Build Coastguard Worker // second half of a surrogate pair. 88*6777b538SAndroid Build Coastguard Worker static UTF16CharIterator UpperBound(std::u16string_view str, 89*6777b538SAndroid Build Coastguard Worker size_t array_index); 90*6777b538SAndroid Build Coastguard Worker 91*6777b538SAndroid Build Coastguard Worker // Return the starting array index of the current character within the 92*6777b538SAndroid Build Coastguard Worker // string. array_pos()93*6777b538SAndroid Build Coastguard Worker size_t array_pos() const { return array_pos_; } 94*6777b538SAndroid Build Coastguard Worker 95*6777b538SAndroid Build Coastguard Worker // Returns the offset in code points from the initial iterator position, which 96*6777b538SAndroid Build Coastguard Worker // could be negative if Rewind() is called. The initial value is always zero, 97*6777b538SAndroid Build Coastguard Worker // regardless of how the iterator is constructed. char_offset()98*6777b538SAndroid Build Coastguard Worker int32_t char_offset() const { return char_offset_; } 99*6777b538SAndroid Build Coastguard Worker 100*6777b538SAndroid Build Coastguard Worker // Returns the code point at the current position. get()101*6777b538SAndroid Build Coastguard Worker int32_t get() const { return char_; } 102*6777b538SAndroid Build Coastguard Worker 103*6777b538SAndroid Build Coastguard Worker // Returns the code point (i.e. the full Unicode character, not half of a 104*6777b538SAndroid Build Coastguard Worker // surrogate pair) following the current one. Should not be called if end() is 105*6777b538SAndroid Build Coastguard Worker // true. If the current code point is the last one in the string, returns 106*6777b538SAndroid Build Coastguard Worker // zero. 107*6777b538SAndroid Build Coastguard Worker int32_t NextCodePoint() const; 108*6777b538SAndroid Build Coastguard Worker 109*6777b538SAndroid Build Coastguard Worker // Returns the code point (i.e. the full Unicode character, not half of a 110*6777b538SAndroid Build Coastguard Worker // surrogate pair) preceding the current one. Should not be called if start() 111*6777b538SAndroid Build Coastguard Worker // is true. 112*6777b538SAndroid Build Coastguard Worker int32_t PreviousCodePoint() const; 113*6777b538SAndroid Build Coastguard Worker 114*6777b538SAndroid Build Coastguard Worker // Returns true if we're at the start of the string. start()115*6777b538SAndroid Build Coastguard Worker bool start() const { return array_pos_ == 0; } 116*6777b538SAndroid Build Coastguard Worker 117*6777b538SAndroid Build Coastguard Worker // Returns true if we're at the end of the string. end()118*6777b538SAndroid Build Coastguard Worker bool end() const { return array_pos_ == str_.length(); } 119*6777b538SAndroid Build Coastguard Worker 120*6777b538SAndroid Build Coastguard Worker // Advances to the next actual character. Returns false if we're at the 121*6777b538SAndroid Build Coastguard Worker // end of the string. 122*6777b538SAndroid Build Coastguard Worker bool Advance(); 123*6777b538SAndroid Build Coastguard Worker 124*6777b538SAndroid Build Coastguard Worker // Moves to the previous actual character. Returns false if we're at the start 125*6777b538SAndroid Build Coastguard Worker // of the string. 126*6777b538SAndroid Build Coastguard Worker bool Rewind(); 127*6777b538SAndroid Build Coastguard Worker 128*6777b538SAndroid Build Coastguard Worker private: 129*6777b538SAndroid Build Coastguard Worker UTF16CharIterator(std::u16string_view str, size_t initial_pos); 130*6777b538SAndroid Build Coastguard Worker 131*6777b538SAndroid Build Coastguard Worker // Fills in the current character we found and advances to the next 132*6777b538SAndroid Build Coastguard Worker // character, updating all flags as necessary. 133*6777b538SAndroid Build Coastguard Worker void ReadChar(); 134*6777b538SAndroid Build Coastguard Worker 135*6777b538SAndroid Build Coastguard Worker // The string we're iterating over. 136*6777b538SAndroid Build Coastguard Worker std::u16string_view str_; 137*6777b538SAndroid Build Coastguard Worker 138*6777b538SAndroid Build Coastguard Worker // Array index. 139*6777b538SAndroid Build Coastguard Worker size_t array_pos_; 140*6777b538SAndroid Build Coastguard Worker 141*6777b538SAndroid Build Coastguard Worker // The next array index. 142*6777b538SAndroid Build Coastguard Worker size_t next_pos_; 143*6777b538SAndroid Build Coastguard Worker 144*6777b538SAndroid Build Coastguard Worker // Character offset from the initial position of the iterator. 145*6777b538SAndroid Build Coastguard Worker int32_t char_offset_; 146*6777b538SAndroid Build Coastguard Worker 147*6777b538SAndroid Build Coastguard Worker // The current character. 148*6777b538SAndroid Build Coastguard Worker int32_t char_; 149*6777b538SAndroid Build Coastguard Worker }; 150*6777b538SAndroid Build Coastguard Worker 151*6777b538SAndroid Build Coastguard Worker } // namespace i18n 152*6777b538SAndroid Build Coastguard Worker } // namespace base 153*6777b538SAndroid Build Coastguard Worker 154*6777b538SAndroid Build Coastguard Worker #endif // BASE_I18N_CHAR_ITERATOR_H_ 155