xref: /aosp_15_r20/external/libtextclassifier/native/utils/utf8/unicodetext.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <iterator>
21*993b0882SAndroid Build Coastguard Worker #include <string>
22*993b0882SAndroid Build Coastguard Worker #include <utility>
23*993b0882SAndroid Build Coastguard Worker #include <vector>
24*993b0882SAndroid Build Coastguard Worker 
25*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
27*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
28*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h"
29*993b0882SAndroid Build Coastguard Worker 
30*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
31*993b0882SAndroid Build Coastguard Worker 
32*993b0882SAndroid Build Coastguard Worker // ***************************** UnicodeText **************************
33*993b0882SAndroid Build Coastguard Worker //
34*993b0882SAndroid Build Coastguard Worker // A UnicodeText object is a wrapper around a sequence of Unicode
35*993b0882SAndroid Build Coastguard Worker // codepoint values that allows iteration over these values.
36*993b0882SAndroid Build Coastguard Worker //
37*993b0882SAndroid Build Coastguard Worker // The internal representation of the text is UTF-8. Since UTF-8 is a
38*993b0882SAndroid Build Coastguard Worker // variable-width format, UnicodeText does not provide random access
39*993b0882SAndroid Build Coastguard Worker // to the text, and changes to the text are permitted only at the end.
40*993b0882SAndroid Build Coastguard Worker //
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// read-only iterator. It becomes invalid if the text is changed.
44*993b0882SAndroid Build Coastguard Worker //
45*993b0882SAndroid Build Coastguard Worker // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
46*993b0882SAndroid Build Coastguard Worker // 0x10FFFF], but UnicodeText has the additional restriction that it
47*993b0882SAndroid Build Coastguard Worker // can contain only those characters that are valid for interchange on
48*993b0882SAndroid Build Coastguard Worker // the Web. This excludes all of the control codes except for carriage
49*993b0882SAndroid Build Coastguard Worker // return, line feed, and horizontal tab.  It also excludes
50*993b0882SAndroid Build Coastguard Worker // non-characters, but codepoints that are in the Private Use regions
51*993b0882SAndroid Build Coastguard Worker // are allowed, as are codepoints that are unassigned. (See the
52*993b0882SAndroid Build Coastguard Worker // Unicode reference for details.)
53*993b0882SAndroid Build Coastguard Worker //
54*993b0882SAndroid Build Coastguard Worker // MEMORY MANAGEMENT:
55*993b0882SAndroid Build Coastguard Worker //
56*993b0882SAndroid Build Coastguard Worker // PointToUTF8(buffer, size) creates an alias pointing to buffer.
57*993b0882SAndroid Build Coastguard Worker //
58*993b0882SAndroid Build Coastguard Worker // The purpose of an alias is to avoid making an unnecessary copy of a
59*993b0882SAndroid Build Coastguard Worker // UTF-8 buffer while still providing access to the Unicode values
60*993b0882SAndroid Build Coastguard Worker // within that text through iterators. The lifetime of an alias must not
61*993b0882SAndroid Build Coastguard Worker // exceed the lifetime of the buffer from which it was constructed.
62*993b0882SAndroid Build Coastguard Worker //
63*993b0882SAndroid Build Coastguard Worker // Aliases should be used with care. If the source from which an alias
64*993b0882SAndroid Build Coastguard Worker // was created is freed, or if the contents are changed, while the
65*993b0882SAndroid Build Coastguard Worker // alias is still in use, fatal errors could result. But it can be
66*993b0882SAndroid Build Coastguard Worker // quite useful to have a UnicodeText "window" through which to see a
67*993b0882SAndroid Build Coastguard Worker // UTF-8 buffer without having to pay the price of making a copy.
68*993b0882SAndroid Build Coastguard Worker 
69*993b0882SAndroid Build Coastguard Worker class UnicodeText {
70*993b0882SAndroid Build Coastguard Worker  public:
71*993b0882SAndroid Build Coastguard Worker   class const_iterator;
72*993b0882SAndroid Build Coastguard Worker 
73*993b0882SAndroid Build Coastguard Worker   UnicodeText();  // Create an empty text.
74*993b0882SAndroid Build Coastguard Worker   UnicodeText(const UnicodeText& src, bool do_copy = true);
75*993b0882SAndroid Build Coastguard Worker   UnicodeText& operator=(UnicodeText&& src);
76*993b0882SAndroid Build Coastguard Worker   ~UnicodeText();
77*993b0882SAndroid Build Coastguard Worker 
78*993b0882SAndroid Build Coastguard Worker   class const_iterator {
79*993b0882SAndroid Build Coastguard Worker     typedef const_iterator CI;
80*993b0882SAndroid Build Coastguard Worker 
81*993b0882SAndroid Build Coastguard Worker    public:
82*993b0882SAndroid Build Coastguard Worker     typedef std::bidirectional_iterator_tag iterator_category;
83*993b0882SAndroid Build Coastguard Worker     typedef char32 value_type;
84*993b0882SAndroid Build Coastguard Worker     typedef int difference_type;
85*993b0882SAndroid Build Coastguard Worker     typedef void pointer;            // (Not needed.)
86*993b0882SAndroid Build Coastguard Worker     typedef const char32 reference;  // (Needed for const_reverse_iterator)
87*993b0882SAndroid Build Coastguard Worker 
88*993b0882SAndroid Build Coastguard Worker     // Iterators are default-constructible.
89*993b0882SAndroid Build Coastguard Worker     const_iterator();
90*993b0882SAndroid Build Coastguard Worker 
91*993b0882SAndroid Build Coastguard Worker     // It's safe to make multiple passes over a UnicodeText.
92*993b0882SAndroid Build Coastguard Worker     const_iterator(const const_iterator&) = default;
93*993b0882SAndroid Build Coastguard Worker     const_iterator& operator=(const const_iterator&) = default;
94*993b0882SAndroid Build Coastguard Worker 
95*993b0882SAndroid Build Coastguard Worker     char32 operator*() const;  // Dereference
96*993b0882SAndroid Build Coastguard Worker 
97*993b0882SAndroid Build Coastguard Worker     const_iterator& operator++();     // Advance (++iter)
98*993b0882SAndroid Build Coastguard Worker     const_iterator operator++(int) {  // (iter++)
99*993b0882SAndroid Build Coastguard Worker       const_iterator result(*this);
100*993b0882SAndroid Build Coastguard Worker       ++*this;
101*993b0882SAndroid Build Coastguard Worker       return result;
102*993b0882SAndroid Build Coastguard Worker     }
103*993b0882SAndroid Build Coastguard Worker 
104*993b0882SAndroid Build Coastguard Worker     const_iterator& operator--();     // Retreat (--iter)
105*993b0882SAndroid Build Coastguard Worker     const_iterator operator--(int) {  // (iter--)
106*993b0882SAndroid Build Coastguard Worker       const_iterator result(*this);
107*993b0882SAndroid Build Coastguard Worker       --*this;
108*993b0882SAndroid Build Coastguard Worker       return result;
109*993b0882SAndroid Build Coastguard Worker     }
110*993b0882SAndroid Build Coastguard Worker 
111*993b0882SAndroid Build Coastguard Worker     friend bool operator==(const CI& lhs, const CI& rhs) {
112*993b0882SAndroid Build Coastguard Worker       return lhs.it_ == rhs.it_;
113*993b0882SAndroid Build Coastguard Worker     }
114*993b0882SAndroid Build Coastguard Worker     friend bool operator!=(const CI& lhs, const CI& rhs) {
115*993b0882SAndroid Build Coastguard Worker       return !(lhs == rhs);
116*993b0882SAndroid Build Coastguard Worker     }
117*993b0882SAndroid Build Coastguard Worker     friend bool operator<(const CI& lhs, const CI& rhs);
118*993b0882SAndroid Build Coastguard Worker     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
119*993b0882SAndroid Build Coastguard Worker     friend bool operator<=(const CI& lhs, const CI& rhs) {
120*993b0882SAndroid Build Coastguard Worker       return !(rhs < lhs);
121*993b0882SAndroid Build Coastguard Worker     }
122*993b0882SAndroid Build Coastguard Worker     friend bool operator>=(const CI& lhs, const CI& rhs) {
123*993b0882SAndroid Build Coastguard Worker       return !(lhs < rhs);
124*993b0882SAndroid Build Coastguard Worker     }
125*993b0882SAndroid Build Coastguard Worker 
utf8_length()126*993b0882SAndroid Build Coastguard Worker     int utf8_length() const {
127*993b0882SAndroid Build Coastguard Worker       const unsigned char byte = static_cast<unsigned char>(it_[0]);
128*993b0882SAndroid Build Coastguard Worker       if (byte < 0x80) {
129*993b0882SAndroid Build Coastguard Worker         return 1;
130*993b0882SAndroid Build Coastguard Worker       } else if (byte < 0xE0) {
131*993b0882SAndroid Build Coastguard Worker         return 2;
132*993b0882SAndroid Build Coastguard Worker       } else if (byte < 0xF0) {
133*993b0882SAndroid Build Coastguard Worker         return 3;
134*993b0882SAndroid Build Coastguard Worker       } else {
135*993b0882SAndroid Build Coastguard Worker         return 4;
136*993b0882SAndroid Build Coastguard Worker       }
137*993b0882SAndroid Build Coastguard Worker     }
utf8_data()138*993b0882SAndroid Build Coastguard Worker     const char* utf8_data() const { return it_; }
139*993b0882SAndroid Build Coastguard Worker 
140*993b0882SAndroid Build Coastguard Worker    private:
141*993b0882SAndroid Build Coastguard Worker     friend class UnicodeText;
const_iterator(const char * it)142*993b0882SAndroid Build Coastguard Worker     explicit const_iterator(const char* it) : it_(it) {}
143*993b0882SAndroid Build Coastguard Worker 
144*993b0882SAndroid Build Coastguard Worker     const char* it_;
145*993b0882SAndroid Build Coastguard Worker   };
146*993b0882SAndroid Build Coastguard Worker 
147*993b0882SAndroid Build Coastguard Worker   const_iterator begin() const;
148*993b0882SAndroid Build Coastguard Worker   const_iterator end() const;
149*993b0882SAndroid Build Coastguard Worker 
150*993b0882SAndroid Build Coastguard Worker   // Gets pointer to the underlying utf8 data.
151*993b0882SAndroid Build Coastguard Worker   const char* data() const;
152*993b0882SAndroid Build Coastguard Worker 
153*993b0882SAndroid Build Coastguard Worker   // Gets length (in bytes) of the underlying utf8 data.
154*993b0882SAndroid Build Coastguard Worker   int size_bytes() const;
155*993b0882SAndroid Build Coastguard Worker 
156*993b0882SAndroid Build Coastguard Worker   // Computes length (in number of Unicode codepoints) of the underlying utf8
157*993b0882SAndroid Build Coastguard Worker   // data.
158*993b0882SAndroid Build Coastguard Worker   // NOTE: Complexity O(n).
159*993b0882SAndroid Build Coastguard Worker   int size_codepoints() const;
160*993b0882SAndroid Build Coastguard Worker 
161*993b0882SAndroid Build Coastguard Worker   bool empty() const;
162*993b0882SAndroid Build Coastguard Worker 
163*993b0882SAndroid Build Coastguard Worker   // Checks whether the underlying data is valid utf8 data.
164*993b0882SAndroid Build Coastguard Worker   bool is_valid() const;
165*993b0882SAndroid Build Coastguard Worker 
166*993b0882SAndroid Build Coastguard Worker   bool operator==(const UnicodeText& other) const;
167*993b0882SAndroid Build Coastguard Worker 
168*993b0882SAndroid Build Coastguard Worker   // x.PointToUTF8(buf,len) changes x so that it points to buf
169*993b0882SAndroid Build Coastguard Worker   // ("becomes an alias"). It does not take ownership or copy buf.
170*993b0882SAndroid Build Coastguard Worker   // This function assumes that the input is interchange valid UTF8.
171*993b0882SAndroid Build Coastguard Worker   UnicodeText& Copy(const UnicodeText& src);
172*993b0882SAndroid Build Coastguard Worker   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
173*993b0882SAndroid Build Coastguard Worker   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
174*993b0882SAndroid Build Coastguard Worker 
175*993b0882SAndroid Build Coastguard Worker   // Calling this may invalidate pointers to underlying data.
176*993b0882SAndroid Build Coastguard Worker   UnicodeText& AppendUTF8(const char* utf8, int len);
177*993b0882SAndroid Build Coastguard Worker   UnicodeText& push_back(char32 ch);
178*993b0882SAndroid Build Coastguard Worker   void clear();
179*993b0882SAndroid Build Coastguard Worker 
180*993b0882SAndroid Build Coastguard Worker   // Returns an iterator for each codepoint.
181*993b0882SAndroid Build Coastguard Worker   std::vector<const_iterator> Codepoints() const;
182*993b0882SAndroid Build Coastguard Worker 
183*993b0882SAndroid Build Coastguard Worker   // Returns the list of codepoints of the UnicodeText.
184*993b0882SAndroid Build Coastguard Worker   std::vector<char32> CodepointsChar32() const;
185*993b0882SAndroid Build Coastguard Worker 
186*993b0882SAndroid Build Coastguard Worker   std::string ToUTF8String() const;
187*993b0882SAndroid Build Coastguard Worker   std::string UTF8Substring(int begin_codepoint, int end_codepoint) const;
188*993b0882SAndroid Build Coastguard Worker   static std::string UTF8Substring(const const_iterator& it_begin,
189*993b0882SAndroid Build Coastguard Worker                                    const const_iterator& it_end);
190*993b0882SAndroid Build Coastguard Worker   static UnicodeText Substring(const UnicodeText& text, int begin_codepoint,
191*993b0882SAndroid Build Coastguard Worker                                int end_codepoint, bool do_copy = true);
192*993b0882SAndroid Build Coastguard Worker   static UnicodeText Substring(const const_iterator& it_begin,
193*993b0882SAndroid Build Coastguard Worker                                const const_iterator& it_end,
194*993b0882SAndroid Build Coastguard Worker                                bool do_copy = true);
195*993b0882SAndroid Build Coastguard Worker 
196*993b0882SAndroid Build Coastguard Worker  private:
197*993b0882SAndroid Build Coastguard Worker   friend class const_iterator;
198*993b0882SAndroid Build Coastguard Worker 
199*993b0882SAndroid Build Coastguard Worker   class Repr {  // A byte-string.
200*993b0882SAndroid Build Coastguard Worker    public:
201*993b0882SAndroid Build Coastguard Worker     char* data_;
202*993b0882SAndroid Build Coastguard Worker     int size_;
203*993b0882SAndroid Build Coastguard Worker     int capacity_;
204*993b0882SAndroid Build Coastguard Worker     bool ours_;  // Do we own data_?
205*993b0882SAndroid Build Coastguard Worker 
Repr()206*993b0882SAndroid Build Coastguard Worker     Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
207*993b0882SAndroid Build Coastguard Worker     Repr& operator=(Repr&& src);
~Repr()208*993b0882SAndroid Build Coastguard Worker     ~Repr() {
209*993b0882SAndroid Build Coastguard Worker       if (ours_) delete[] data_;
210*993b0882SAndroid Build Coastguard Worker     }
211*993b0882SAndroid Build Coastguard Worker 
212*993b0882SAndroid Build Coastguard Worker     void clear();
213*993b0882SAndroid Build Coastguard Worker     void reserve(int capacity);
214*993b0882SAndroid Build Coastguard Worker     void resize(int size);
215*993b0882SAndroid Build Coastguard Worker 
216*993b0882SAndroid Build Coastguard Worker     void append(const char* bytes, int byte_length);
217*993b0882SAndroid Build Coastguard Worker     void Copy(const char* data, int size);
218*993b0882SAndroid Build Coastguard Worker     void PointTo(const char* data, int size);
219*993b0882SAndroid Build Coastguard Worker 
220*993b0882SAndroid Build Coastguard Worker    private:
221*993b0882SAndroid Build Coastguard Worker     Repr& operator=(const Repr&);
222*993b0882SAndroid Build Coastguard Worker     Repr(const Repr& other);
223*993b0882SAndroid Build Coastguard Worker   };
224*993b0882SAndroid Build Coastguard Worker 
225*993b0882SAndroid Build Coastguard Worker   Repr repr_;
226*993b0882SAndroid Build Coastguard Worker };
227*993b0882SAndroid Build Coastguard Worker 
228*993b0882SAndroid Build Coastguard Worker typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
229*993b0882SAndroid Build Coastguard Worker     UnicodeTextRange;
230*993b0882SAndroid Build Coastguard Worker 
231*993b0882SAndroid Build Coastguard Worker // NOTE: The following are needed to avoid implicit conversion from char* to
232*993b0882SAndroid Build Coastguard Worker // std::string, or from ::string to std::string, because if this happens it
233*993b0882SAndroid Build Coastguard Worker // often results in invalid memory access to a temporary object created during
234*993b0882SAndroid Build Coastguard Worker // such conversion (if do_copy == false).
235*993b0882SAndroid Build Coastguard Worker // NOTE: These methods don't check if the input string is UTF8 well formed, for
236*993b0882SAndroid Build Coastguard Worker // efficiency reasons. Use UnicodeText::is_valid() when explicitly needed.
237*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
238*993b0882SAndroid Build Coastguard Worker                               bool do_copy = true);
239*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true);
240*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true);
241*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true);
242*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true);
243*993b0882SAndroid Build Coastguard Worker 
244*993b0882SAndroid Build Coastguard Worker inline logging::LoggingStringStream& operator<<(
245*993b0882SAndroid Build Coastguard Worker     logging::LoggingStringStream& stream, const UnicodeText& message) {
246*993b0882SAndroid Build Coastguard Worker   stream.message.append(message.data(), message.size_bytes());
247*993b0882SAndroid Build Coastguard Worker   return stream;
248*993b0882SAndroid Build Coastguard Worker }
249*993b0882SAndroid Build Coastguard Worker 
250*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
251*993b0882SAndroid Build Coastguard Worker 
252*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
253