xref: /aosp_15_r20/external/libtextclassifier/native/utils/utf8/unicodetext.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <string.h>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include <algorithm>
22*993b0882SAndroid Build Coastguard Worker 
23*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
25*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h"
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker // *************** Data representation **********
30*993b0882SAndroid Build Coastguard Worker // Note: the copy constructor is undefined.
31*993b0882SAndroid Build Coastguard Worker 
operator =(Repr && src)32*993b0882SAndroid Build Coastguard Worker UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
33*993b0882SAndroid Build Coastguard Worker   if (ours_ && data_) delete[] data_;
34*993b0882SAndroid Build Coastguard Worker   data_ = src.data_;
35*993b0882SAndroid Build Coastguard Worker   size_ = src.size_;
36*993b0882SAndroid Build Coastguard Worker   capacity_ = src.capacity_;
37*993b0882SAndroid Build Coastguard Worker   ours_ = src.ours_;
38*993b0882SAndroid Build Coastguard Worker   src.ours_ = false;
39*993b0882SAndroid Build Coastguard Worker   return *this;
40*993b0882SAndroid Build Coastguard Worker }
41*993b0882SAndroid Build Coastguard Worker 
PointTo(const char * data,int size)42*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::PointTo(const char* data, int size) {
43*993b0882SAndroid Build Coastguard Worker   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
44*993b0882SAndroid Build Coastguard Worker   data_ = const_cast<char*>(data);
45*993b0882SAndroid Build Coastguard Worker   size_ = size;
46*993b0882SAndroid Build Coastguard Worker   capacity_ = size;
47*993b0882SAndroid Build Coastguard Worker   ours_ = false;
48*993b0882SAndroid Build Coastguard Worker }
49*993b0882SAndroid Build Coastguard Worker 
Copy(const char * data,int size)50*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::Copy(const char* data, int size) {
51*993b0882SAndroid Build Coastguard Worker   resize(size);
52*993b0882SAndroid Build Coastguard Worker   memcpy(data_, data, size);
53*993b0882SAndroid Build Coastguard Worker }
54*993b0882SAndroid Build Coastguard Worker 
resize(int new_size)55*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::resize(int new_size) {
56*993b0882SAndroid Build Coastguard Worker   if (new_size == 0) {
57*993b0882SAndroid Build Coastguard Worker     clear();
58*993b0882SAndroid Build Coastguard Worker   } else {
59*993b0882SAndroid Build Coastguard Worker     if (!ours_ || new_size > capacity_) reserve(new_size);
60*993b0882SAndroid Build Coastguard Worker     // Clear the memory in the expanded part.
61*993b0882SAndroid Build Coastguard Worker     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
62*993b0882SAndroid Build Coastguard Worker     size_ = new_size;
63*993b0882SAndroid Build Coastguard Worker     ours_ = true;
64*993b0882SAndroid Build Coastguard Worker   }
65*993b0882SAndroid Build Coastguard Worker }
66*993b0882SAndroid Build Coastguard Worker 
reserve(int new_capacity)67*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::reserve(int new_capacity) {
68*993b0882SAndroid Build Coastguard Worker   // If there's already enough capacity, and we're an owner, do nothing.
69*993b0882SAndroid Build Coastguard Worker   if (capacity_ >= new_capacity && ours_) return;
70*993b0882SAndroid Build Coastguard Worker 
71*993b0882SAndroid Build Coastguard Worker   // Otherwise, allocate a new buffer.
72*993b0882SAndroid Build Coastguard Worker   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
73*993b0882SAndroid Build Coastguard Worker   char* new_data = new char[capacity_];
74*993b0882SAndroid Build Coastguard Worker 
75*993b0882SAndroid Build Coastguard Worker   // If there is an old buffer, copy it into the new buffer.
76*993b0882SAndroid Build Coastguard Worker   if (data_) {
77*993b0882SAndroid Build Coastguard Worker     memcpy(new_data, data_, size_);
78*993b0882SAndroid Build Coastguard Worker     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
79*993b0882SAndroid Build Coastguard Worker   }
80*993b0882SAndroid Build Coastguard Worker   data_ = new_data;
81*993b0882SAndroid Build Coastguard Worker   ours_ = true;  // We own the new buffer.
82*993b0882SAndroid Build Coastguard Worker   // size_ is unchanged.
83*993b0882SAndroid Build Coastguard Worker }
84*993b0882SAndroid Build Coastguard Worker 
append(const char * bytes,int byte_length)85*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::append(const char* bytes, int byte_length) {
86*993b0882SAndroid Build Coastguard Worker   reserve(size_ + byte_length);
87*993b0882SAndroid Build Coastguard Worker   memcpy(data_ + size_, bytes, byte_length);
88*993b0882SAndroid Build Coastguard Worker   size_ += byte_length;
89*993b0882SAndroid Build Coastguard Worker }
90*993b0882SAndroid Build Coastguard Worker 
clear()91*993b0882SAndroid Build Coastguard Worker void UnicodeText::Repr::clear() {
92*993b0882SAndroid Build Coastguard Worker   if (ours_) delete[] data_;
93*993b0882SAndroid Build Coastguard Worker   data_ = nullptr;
94*993b0882SAndroid Build Coastguard Worker   size_ = capacity_ = 0;
95*993b0882SAndroid Build Coastguard Worker   ours_ = true;
96*993b0882SAndroid Build Coastguard Worker }
97*993b0882SAndroid Build Coastguard Worker 
98*993b0882SAndroid Build Coastguard Worker // *************** UnicodeText ******************
99*993b0882SAndroid Build Coastguard Worker 
UnicodeText()100*993b0882SAndroid Build Coastguard Worker UnicodeText::UnicodeText() {}
101*993b0882SAndroid Build Coastguard Worker 
// Builds a text from `src`. With `do_copy` set the bytes are duplicated via
// Copy(); otherwise the new object merely points at `src`'s buffer, which the
// caller must keep alive for as long as this object refers to it.
UnicodeText::UnicodeText(const UnicodeText& src, bool do_copy) {
  if (!do_copy) {
    repr_.PointTo(src.repr_.data_, src.repr_.size_);
    return;
  }
  Copy(src);
}
109*993b0882SAndroid Build Coastguard Worker 
operator =(UnicodeText && src)110*993b0882SAndroid Build Coastguard Worker UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
111*993b0882SAndroid Build Coastguard Worker   this->repr_ = std::move(src.repr_);
112*993b0882SAndroid Build Coastguard Worker   return *this;
113*993b0882SAndroid Build Coastguard Worker }
114*993b0882SAndroid Build Coastguard Worker 
// Replaces the contents of this text with a private copy of `src`'s bytes.
// Returns *this to allow chaining.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}
119*993b0882SAndroid Build Coastguard Worker 
PointToUTF8(const char * buffer,int byte_length)120*993b0882SAndroid Build Coastguard Worker UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
121*993b0882SAndroid Build Coastguard Worker   repr_.PointTo(buffer, byte_length);
122*993b0882SAndroid Build Coastguard Worker   return *this;
123*993b0882SAndroid Build Coastguard Worker }
124*993b0882SAndroid Build Coastguard Worker 
CopyUTF8(const char * buffer,int byte_length)125*993b0882SAndroid Build Coastguard Worker UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
126*993b0882SAndroid Build Coastguard Worker   repr_.Copy(buffer, byte_length);
127*993b0882SAndroid Build Coastguard Worker   return *this;
128*993b0882SAndroid Build Coastguard Worker }
129*993b0882SAndroid Build Coastguard Worker 
AppendUTF8(const char * utf8,int len)130*993b0882SAndroid Build Coastguard Worker UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
131*993b0882SAndroid Build Coastguard Worker   repr_.append(utf8, len);
132*993b0882SAndroid Build Coastguard Worker   return *this;
133*993b0882SAndroid Build Coastguard Worker }
134*993b0882SAndroid Build Coastguard Worker 
data() const135*993b0882SAndroid Build Coastguard Worker const char* UnicodeText::data() const { return repr_.data_; }
136*993b0882SAndroid Build Coastguard Worker 
size_bytes() const137*993b0882SAndroid Build Coastguard Worker int UnicodeText::size_bytes() const { return repr_.size_; }
138*993b0882SAndroid Build Coastguard Worker 
139*993b0882SAndroid Build Coastguard Worker namespace {
140*993b0882SAndroid Build Coastguard Worker 
141*993b0882SAndroid Build Coastguard Worker enum {
142*993b0882SAndroid Build Coastguard Worker   RuneError = 0xFFFD,  // Decoding error in UTF.
143*993b0882SAndroid Build Coastguard Worker   RuneMax = 0x10FFFF,  // Maximum rune value.
144*993b0882SAndroid Build Coastguard Worker };
145*993b0882SAndroid Build Coastguard Worker 
runetochar(const char32 rune,char * dest)146*993b0882SAndroid Build Coastguard Worker int runetochar(const char32 rune, char* dest) {
147*993b0882SAndroid Build Coastguard Worker   // Convert to unsigned for range check.
148*993b0882SAndroid Build Coastguard Worker   uint32 c;
149*993b0882SAndroid Build Coastguard Worker 
150*993b0882SAndroid Build Coastguard Worker   // 1 char 00-7F
151*993b0882SAndroid Build Coastguard Worker   c = rune;
152*993b0882SAndroid Build Coastguard Worker   if (c <= 0x7F) {
153*993b0882SAndroid Build Coastguard Worker     dest[0] = static_cast<char>(c);
154*993b0882SAndroid Build Coastguard Worker     return 1;
155*993b0882SAndroid Build Coastguard Worker   }
156*993b0882SAndroid Build Coastguard Worker 
157*993b0882SAndroid Build Coastguard Worker   // 2 char 0080-07FF
158*993b0882SAndroid Build Coastguard Worker   if (c <= 0x07FF) {
159*993b0882SAndroid Build Coastguard Worker     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
160*993b0882SAndroid Build Coastguard Worker     dest[1] = 0x80 | (c & 0x3F);
161*993b0882SAndroid Build Coastguard Worker     return 2;
162*993b0882SAndroid Build Coastguard Worker   }
163*993b0882SAndroid Build Coastguard Worker 
164*993b0882SAndroid Build Coastguard Worker   // Range check
165*993b0882SAndroid Build Coastguard Worker   if (c > RuneMax) {
166*993b0882SAndroid Build Coastguard Worker     c = RuneError;
167*993b0882SAndroid Build Coastguard Worker   }
168*993b0882SAndroid Build Coastguard Worker 
169*993b0882SAndroid Build Coastguard Worker   // 3 char 0800-FFFF
170*993b0882SAndroid Build Coastguard Worker   if (c <= 0xFFFF) {
171*993b0882SAndroid Build Coastguard Worker     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
172*993b0882SAndroid Build Coastguard Worker     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
173*993b0882SAndroid Build Coastguard Worker     dest[2] = 0x80 | (c & 0x3F);
174*993b0882SAndroid Build Coastguard Worker     return 3;
175*993b0882SAndroid Build Coastguard Worker   }
176*993b0882SAndroid Build Coastguard Worker 
177*993b0882SAndroid Build Coastguard Worker   // 4 char 10000-1FFFFF
178*993b0882SAndroid Build Coastguard Worker   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
179*993b0882SAndroid Build Coastguard Worker   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
180*993b0882SAndroid Build Coastguard Worker   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
181*993b0882SAndroid Build Coastguard Worker   dest[3] = 0x80 | (c & 0x3F);
182*993b0882SAndroid Build Coastguard Worker   return 4;
183*993b0882SAndroid Build Coastguard Worker }
184*993b0882SAndroid Build Coastguard Worker 
185*993b0882SAndroid Build Coastguard Worker }  // namespace
186*993b0882SAndroid Build Coastguard Worker 
push_back(char32 ch)187*993b0882SAndroid Build Coastguard Worker UnicodeText& UnicodeText::push_back(char32 ch) {
188*993b0882SAndroid Build Coastguard Worker   char str[4];
189*993b0882SAndroid Build Coastguard Worker   int char_len = runetochar(ch, str);
190*993b0882SAndroid Build Coastguard Worker   repr_.append(str, char_len);
191*993b0882SAndroid Build Coastguard Worker   return *this;
192*993b0882SAndroid Build Coastguard Worker }
193*993b0882SAndroid Build Coastguard Worker 
clear()194*993b0882SAndroid Build Coastguard Worker void UnicodeText::clear() { repr_.clear(); }
195*993b0882SAndroid Build Coastguard Worker 
size_codepoints() const196*993b0882SAndroid Build Coastguard Worker int UnicodeText::size_codepoints() const {
197*993b0882SAndroid Build Coastguard Worker   return std::distance(begin(), end());
198*993b0882SAndroid Build Coastguard Worker }
199*993b0882SAndroid Build Coastguard Worker 
empty() const200*993b0882SAndroid Build Coastguard Worker bool UnicodeText::empty() const { return size_bytes() == 0; }
201*993b0882SAndroid Build Coastguard Worker 
is_valid() const202*993b0882SAndroid Build Coastguard Worker bool UnicodeText::is_valid() const {
203*993b0882SAndroid Build Coastguard Worker   return IsValidUTF8(repr_.data_, repr_.size_);
204*993b0882SAndroid Build Coastguard Worker }
205*993b0882SAndroid Build Coastguard Worker 
Codepoints() const206*993b0882SAndroid Build Coastguard Worker std::vector<UnicodeText::const_iterator> UnicodeText::Codepoints() const {
207*993b0882SAndroid Build Coastguard Worker   std::vector<UnicodeText::const_iterator> codepoints;
208*993b0882SAndroid Build Coastguard Worker   for (auto it = begin(); it != end(); it++) {
209*993b0882SAndroid Build Coastguard Worker     codepoints.push_back(it);
210*993b0882SAndroid Build Coastguard Worker   }
211*993b0882SAndroid Build Coastguard Worker   return codepoints;
212*993b0882SAndroid Build Coastguard Worker }
213*993b0882SAndroid Build Coastguard Worker 
CodepointsChar32() const214*993b0882SAndroid Build Coastguard Worker std::vector<char32> UnicodeText::CodepointsChar32() const {
215*993b0882SAndroid Build Coastguard Worker   std::vector<char32> codepoints;
216*993b0882SAndroid Build Coastguard Worker   for (auto it = begin(); it != end(); it++) {
217*993b0882SAndroid Build Coastguard Worker     codepoints.push_back(*it);
218*993b0882SAndroid Build Coastguard Worker   }
219*993b0882SAndroid Build Coastguard Worker   return codepoints;
220*993b0882SAndroid Build Coastguard Worker }
221*993b0882SAndroid Build Coastguard Worker 
operator ==(const UnicodeText & other) const222*993b0882SAndroid Build Coastguard Worker bool UnicodeText::operator==(const UnicodeText& other) const {
223*993b0882SAndroid Build Coastguard Worker   if (repr_.size_ != other.repr_.size_) {
224*993b0882SAndroid Build Coastguard Worker     return false;
225*993b0882SAndroid Build Coastguard Worker   }
226*993b0882SAndroid Build Coastguard Worker   return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
227*993b0882SAndroid Build Coastguard Worker }
228*993b0882SAndroid Build Coastguard Worker 
ToUTF8String() const229*993b0882SAndroid Build Coastguard Worker std::string UnicodeText::ToUTF8String() const {
230*993b0882SAndroid Build Coastguard Worker   return UTF8Substring(begin(), end());
231*993b0882SAndroid Build Coastguard Worker }
232*993b0882SAndroid Build Coastguard Worker 
UTF8Substring(int begin_codepoint,int end_codepoint) const233*993b0882SAndroid Build Coastguard Worker std::string UnicodeText::UTF8Substring(int begin_codepoint,
234*993b0882SAndroid Build Coastguard Worker                                        int end_codepoint) const {
235*993b0882SAndroid Build Coastguard Worker   auto span_begin = begin();
236*993b0882SAndroid Build Coastguard Worker   std::advance(span_begin, begin_codepoint);
237*993b0882SAndroid Build Coastguard Worker   auto span_end = span_begin;
238*993b0882SAndroid Build Coastguard Worker   std::advance(span_end, end_codepoint - begin_codepoint);
239*993b0882SAndroid Build Coastguard Worker   return UTF8Substring(span_begin, span_end);
240*993b0882SAndroid Build Coastguard Worker }
241*993b0882SAndroid Build Coastguard Worker 
UTF8Substring(const const_iterator & it_begin,const const_iterator & it_end)242*993b0882SAndroid Build Coastguard Worker std::string UnicodeText::UTF8Substring(const const_iterator& it_begin,
243*993b0882SAndroid Build Coastguard Worker                                        const const_iterator& it_end) {
244*993b0882SAndroid Build Coastguard Worker   return std::string(it_begin.it_, it_end.it_ - it_begin.it_);
245*993b0882SAndroid Build Coastguard Worker }
246*993b0882SAndroid Build Coastguard Worker 
// Returns the sub-text of `text` covering codepoints
// [begin_codepoint, end_codepoint). With `do_copy` the bytes are duplicated;
// otherwise the result points into `text`'s buffer. The indices are not
// range-checked here.
//
// The end iterator is reached by continuing from the begin iterator rather
// than walking again from the start of `text`, so the prefix is scanned only
// once. This matches what UTF8Substring(int, int) does.
UnicodeText UnicodeText::Substring(const UnicodeText& text, int begin_codepoint,
                                   int end_codepoint, bool do_copy) {
  auto it_begin = text.begin();
  std::advance(it_begin, begin_codepoint);
  auto it_end = it_begin;
  std::advance(it_end, end_codepoint - begin_codepoint);

  return Substring(it_begin, it_end, do_copy);
}
256*993b0882SAndroid Build Coastguard Worker 
Substring(const const_iterator & it_begin,const const_iterator & it_end,bool do_copy)257*993b0882SAndroid Build Coastguard Worker UnicodeText UnicodeText::Substring(const const_iterator& it_begin,
258*993b0882SAndroid Build Coastguard Worker                                    const const_iterator& it_end, bool do_copy) {
259*993b0882SAndroid Build Coastguard Worker   if (do_copy) {
260*993b0882SAndroid Build Coastguard Worker     UnicodeText result;
261*993b0882SAndroid Build Coastguard Worker     result.repr_.Copy(it_begin.it_, it_end.it_ - it_begin.it_);
262*993b0882SAndroid Build Coastguard Worker     return result;
263*993b0882SAndroid Build Coastguard Worker   } else {
264*993b0882SAndroid Build Coastguard Worker     UnicodeText result;
265*993b0882SAndroid Build Coastguard Worker     result.repr_.PointTo(it_begin.it_, it_end.it_ - it_begin.it_);
266*993b0882SAndroid Build Coastguard Worker     return result;
267*993b0882SAndroid Build Coastguard Worker   }
268*993b0882SAndroid Build Coastguard Worker }
269*993b0882SAndroid Build Coastguard Worker 
~UnicodeText()270*993b0882SAndroid Build Coastguard Worker UnicodeText::~UnicodeText() {}
271*993b0882SAndroid Build Coastguard Worker 
272*993b0882SAndroid Build Coastguard Worker // ******************* UnicodeText::const_iterator *********************
273*993b0882SAndroid Build Coastguard Worker 
274*993b0882SAndroid Build Coastguard Worker // The implementation of const_iterator would be nicer if it
275*993b0882SAndroid Build Coastguard Worker // inherited from boost::iterator_facade
276*993b0882SAndroid Build Coastguard Worker // (http://boost.org/libs/iterator/doc/iterator_facade.html).
277*993b0882SAndroid Build Coastguard Worker 
const_iterator()278*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
279*993b0882SAndroid Build Coastguard Worker 
begin() const280*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator UnicodeText::begin() const {
281*993b0882SAndroid Build Coastguard Worker   return const_iterator(repr_.data_);
282*993b0882SAndroid Build Coastguard Worker }
283*993b0882SAndroid Build Coastguard Worker 
end() const284*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator UnicodeText::end() const {
285*993b0882SAndroid Build Coastguard Worker   return const_iterator(repr_.data_ + repr_.size_);
286*993b0882SAndroid Build Coastguard Worker }
287*993b0882SAndroid Build Coastguard Worker 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)288*993b0882SAndroid Build Coastguard Worker bool operator<(const UnicodeText::const_iterator& lhs,
289*993b0882SAndroid Build Coastguard Worker                const UnicodeText::const_iterator& rhs) {
290*993b0882SAndroid Build Coastguard Worker   return lhs.it_ < rhs.it_;
291*993b0882SAndroid Build Coastguard Worker }
292*993b0882SAndroid Build Coastguard Worker 
operator *() const293*993b0882SAndroid Build Coastguard Worker char32 UnicodeText::const_iterator::operator*() const {
294*993b0882SAndroid Build Coastguard Worker   // (We could call chartorune here, but that does some
295*993b0882SAndroid Build Coastguard Worker   // error-checking, and we're guaranteed that our data is valid
296*993b0882SAndroid Build Coastguard Worker   // UTF-8. Also, we expect this routine to be called very often. So
297*993b0882SAndroid Build Coastguard Worker   // for speed, we do the calculation ourselves.)
298*993b0882SAndroid Build Coastguard Worker   return ValidCharToRune(it_);
299*993b0882SAndroid Build Coastguard Worker }
300*993b0882SAndroid Build Coastguard Worker 
operator ++()301*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
302*993b0882SAndroid Build Coastguard Worker   it_ += GetNumBytesForUTF8Char(it_);
303*993b0882SAndroid Build Coastguard Worker   return *this;
304*993b0882SAndroid Build Coastguard Worker }
305*993b0882SAndroid Build Coastguard Worker 
operator --()306*993b0882SAndroid Build Coastguard Worker UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
307*993b0882SAndroid Build Coastguard Worker   while (IsTrailByte(*--it_)) {
308*993b0882SAndroid Build Coastguard Worker   }
309*993b0882SAndroid Build Coastguard Worker   return *this;
310*993b0882SAndroid Build Coastguard Worker }
311*993b0882SAndroid Build Coastguard Worker 
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)312*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
313*993b0882SAndroid Build Coastguard Worker   UnicodeText t;
314*993b0882SAndroid Build Coastguard Worker   if (do_copy) {
315*993b0882SAndroid Build Coastguard Worker     t.CopyUTF8(utf8_buf, len);
316*993b0882SAndroid Build Coastguard Worker   } else {
317*993b0882SAndroid Build Coastguard Worker     t.PointToUTF8(utf8_buf, len);
318*993b0882SAndroid Build Coastguard Worker   }
319*993b0882SAndroid Build Coastguard Worker   return t;
320*993b0882SAndroid Build Coastguard Worker }
321*993b0882SAndroid Build Coastguard Worker 
UTF8ToUnicodeText(const char * utf8_buf,bool do_copy)322*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
323*993b0882SAndroid Build Coastguard Worker   return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
324*993b0882SAndroid Build Coastguard Worker }
325*993b0882SAndroid Build Coastguard Worker 
UTF8ToUnicodeText(const std::string & str,bool do_copy)326*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
327*993b0882SAndroid Build Coastguard Worker   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
328*993b0882SAndroid Build Coastguard Worker }
329*993b0882SAndroid Build Coastguard Worker 
UTF8ToUnicodeText(StringPiece str,bool do_copy)330*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy) {
331*993b0882SAndroid Build Coastguard Worker   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
332*993b0882SAndroid Build Coastguard Worker }
333*993b0882SAndroid Build Coastguard Worker 
UTF8ToUnicodeText(absl::string_view str,bool do_copy)334*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy) {
335*993b0882SAndroid Build Coastguard Worker   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
336*993b0882SAndroid Build Coastguard Worker }
337*993b0882SAndroid Build Coastguard Worker 
338*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
339