1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <iterator> 21*993b0882SAndroid Build Coastguard Worker #include <string> 22*993b0882SAndroid Build Coastguard Worker #include <utility> 23*993b0882SAndroid Build Coastguard Worker #include <vector> 24*993b0882SAndroid Build Coastguard Worker 25*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h" 26*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h" 27*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h" 28*993b0882SAndroid Build Coastguard Worker #include "absl/strings/string_view.h" 29*993b0882SAndroid Build Coastguard Worker 30*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 31*993b0882SAndroid Build Coastguard Worker 32*993b0882SAndroid Build Coastguard Worker // ***************************** UnicodeText ************************** 33*993b0882SAndroid Build Coastguard Worker // 34*993b0882SAndroid Build Coastguard Worker // A UnicodeText object is a wrapper around a sequence of Unicode 35*993b0882SAndroid Build Coastguard Worker // codepoint values that allows iteration over these values. 36*993b0882SAndroid Build Coastguard Worker // 37*993b0882SAndroid Build Coastguard Worker // The internal representation of the text is UTF-8. Since UTF-8 is a 38*993b0882SAndroid Build Coastguard Worker // variable-width format, UnicodeText does not provide random access 39*993b0882SAndroid Build Coastguard Worker // to the text, and changes to the text are permitted only at the end. 40*993b0882SAndroid Build Coastguard Worker // 41*993b0882SAndroid Build Coastguard Worker // The UnicodeText class defines a const_iterator. The dereferencing 42*993b0882SAndroid Build Coastguard Worker // operator (*) returns a codepoint (int32). The iterator is a 43*993b0882SAndroid Build Coastguard Worker // read-only iterator. It becomes invalid if the text is changed. 44*993b0882SAndroid Build Coastguard Worker // 45*993b0882SAndroid Build Coastguard Worker // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, 46*993b0882SAndroid Build Coastguard Worker // 0x10FFFF], but UnicodeText has the additional restriction that it 47*993b0882SAndroid Build Coastguard Worker // can contain only those characters that are valid for interchange on 48*993b0882SAndroid Build Coastguard Worker // the Web. This excludes all of the control codes except for carriage 49*993b0882SAndroid Build Coastguard Worker // return, line feed, and horizontal tab. It also excludes 50*993b0882SAndroid Build Coastguard Worker // non-characters, but codepoints that are in the Private Use regions 51*993b0882SAndroid Build Coastguard Worker // are allowed, as are codepoints that are unassigned. (See the 52*993b0882SAndroid Build Coastguard Worker // Unicode reference for details.) 53*993b0882SAndroid Build Coastguard Worker // 54*993b0882SAndroid Build Coastguard Worker // MEMORY MANAGEMENT: 55*993b0882SAndroid Build Coastguard Worker // 56*993b0882SAndroid Build Coastguard Worker // PointToUTF8(buffer, size) creates an alias pointing to buffer. 57*993b0882SAndroid Build Coastguard Worker // 58*993b0882SAndroid Build Coastguard Worker // The purpose of an alias is to avoid making an unnecessary copy of a 59*993b0882SAndroid Build Coastguard Worker // UTF-8 buffer while still providing access to the Unicode values 60*993b0882SAndroid Build Coastguard Worker // within that text through iterators. The lifetime of an alias must not 61*993b0882SAndroid Build Coastguard Worker // exceed the lifetime of the buffer from which it was constructed. 62*993b0882SAndroid Build Coastguard Worker // 63*993b0882SAndroid Build Coastguard Worker // Aliases should be used with care. If the source from which an alias 64*993b0882SAndroid Build Coastguard Worker // was created is freed, or if the contents are changed, while the 65*993b0882SAndroid Build Coastguard Worker // alias is still in use, fatal errors could result. But it can be 66*993b0882SAndroid Build Coastguard Worker // quite useful to have a UnicodeText "window" through which to see a 67*993b0882SAndroid Build Coastguard Worker // UTF-8 buffer without having to pay the price of making a copy. 68*993b0882SAndroid Build Coastguard Worker 69*993b0882SAndroid Build Coastguard Worker class UnicodeText { 70*993b0882SAndroid Build Coastguard Worker public: 71*993b0882SAndroid Build Coastguard Worker class const_iterator; 72*993b0882SAndroid Build Coastguard Worker 73*993b0882SAndroid Build Coastguard Worker UnicodeText(); // Create an empty text. 74*993b0882SAndroid Build Coastguard Worker UnicodeText(const UnicodeText& src, bool do_copy = true); 75*993b0882SAndroid Build Coastguard Worker UnicodeText& operator=(UnicodeText&& src); 76*993b0882SAndroid Build Coastguard Worker ~UnicodeText(); 77*993b0882SAndroid Build Coastguard Worker 78*993b0882SAndroid Build Coastguard Worker class const_iterator { 79*993b0882SAndroid Build Coastguard Worker typedef const_iterator CI; 80*993b0882SAndroid Build Coastguard Worker 81*993b0882SAndroid Build Coastguard Worker public: 82*993b0882SAndroid Build Coastguard Worker typedef std::bidirectional_iterator_tag iterator_category; 83*993b0882SAndroid Build Coastguard Worker typedef char32 value_type; 84*993b0882SAndroid Build Coastguard Worker typedef int difference_type; 85*993b0882SAndroid Build Coastguard Worker typedef void pointer; // (Not needed.) 86*993b0882SAndroid Build Coastguard Worker typedef const char32 reference; // (Needed for const_reverse_iterator) 87*993b0882SAndroid Build Coastguard Worker 88*993b0882SAndroid Build Coastguard Worker // Iterators are default-constructible. 89*993b0882SAndroid Build Coastguard Worker const_iterator(); 90*993b0882SAndroid Build Coastguard Worker 91*993b0882SAndroid Build Coastguard Worker // It's safe to make multiple passes over a UnicodeText. 92*993b0882SAndroid Build Coastguard Worker const_iterator(const const_iterator&) = default; 93*993b0882SAndroid Build Coastguard Worker const_iterator& operator=(const const_iterator&) = default; 94*993b0882SAndroid Build Coastguard Worker 95*993b0882SAndroid Build Coastguard Worker char32 operator*() const; // Dereference 96*993b0882SAndroid Build Coastguard Worker 97*993b0882SAndroid Build Coastguard Worker const_iterator& operator++(); // Advance (++iter) 98*993b0882SAndroid Build Coastguard Worker const_iterator operator++(int) { // (iter++) 99*993b0882SAndroid Build Coastguard Worker const_iterator result(*this); 100*993b0882SAndroid Build Coastguard Worker ++*this; 101*993b0882SAndroid Build Coastguard Worker return result; 102*993b0882SAndroid Build Coastguard Worker } 103*993b0882SAndroid Build Coastguard Worker 104*993b0882SAndroid Build Coastguard Worker const_iterator& operator--(); // Retreat (--iter) 105*993b0882SAndroid Build Coastguard Worker const_iterator operator--(int) { // (iter--) 106*993b0882SAndroid Build Coastguard Worker const_iterator result(*this); 107*993b0882SAndroid Build Coastguard Worker --*this; 108*993b0882SAndroid Build Coastguard Worker return result; 109*993b0882SAndroid Build Coastguard Worker } 110*993b0882SAndroid Build Coastguard Worker 111*993b0882SAndroid Build Coastguard Worker friend bool operator==(const CI& lhs, const CI& rhs) { 112*993b0882SAndroid Build Coastguard Worker return lhs.it_ == rhs.it_; 113*993b0882SAndroid Build Coastguard Worker } 114*993b0882SAndroid Build Coastguard Worker friend bool operator!=(const CI& lhs, const CI& rhs) { 115*993b0882SAndroid Build Coastguard Worker return !(lhs == rhs); 116*993b0882SAndroid Build Coastguard Worker } 117*993b0882SAndroid Build Coastguard Worker friend bool operator<(const CI& lhs, const CI& rhs); 118*993b0882SAndroid Build Coastguard Worker friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; } 119*993b0882SAndroid Build Coastguard Worker friend bool operator<=(const CI& lhs, const CI& rhs) { 120*993b0882SAndroid Build Coastguard Worker return !(rhs < lhs); 121*993b0882SAndroid Build Coastguard Worker } 122*993b0882SAndroid Build Coastguard Worker friend bool operator>=(const CI& lhs, const CI& rhs) { 123*993b0882SAndroid Build Coastguard Worker return !(lhs < rhs); 124*993b0882SAndroid Build Coastguard Worker } 125*993b0882SAndroid Build Coastguard Worker utf8_length()126*993b0882SAndroid Build Coastguard Worker int utf8_length() const { 127*993b0882SAndroid Build Coastguard Worker const unsigned char byte = static_cast<unsigned char>(it_[0]); 128*993b0882SAndroid Build Coastguard Worker if (byte < 0x80) { 129*993b0882SAndroid Build Coastguard Worker return 1; 130*993b0882SAndroid Build Coastguard Worker } else if (byte < 0xE0) { 131*993b0882SAndroid Build Coastguard Worker return 2; 132*993b0882SAndroid Build Coastguard Worker } else if (byte < 0xF0) { 133*993b0882SAndroid Build Coastguard Worker return 3; 134*993b0882SAndroid Build Coastguard Worker } else { 135*993b0882SAndroid Build Coastguard Worker return 4; 136*993b0882SAndroid Build Coastguard Worker } 137*993b0882SAndroid Build Coastguard Worker } utf8_data()138*993b0882SAndroid Build Coastguard Worker const char* utf8_data() const { return it_; } 139*993b0882SAndroid Build Coastguard Worker 140*993b0882SAndroid Build Coastguard Worker private: 141*993b0882SAndroid Build Coastguard Worker friend class UnicodeText; const_iterator(const char * it)142*993b0882SAndroid Build Coastguard Worker explicit const_iterator(const char* it) : it_(it) {} 143*993b0882SAndroid Build Coastguard Worker 144*993b0882SAndroid Build Coastguard Worker const char* it_; 145*993b0882SAndroid Build Coastguard Worker }; 146*993b0882SAndroid Build Coastguard Worker 147*993b0882SAndroid Build Coastguard Worker const_iterator begin() const; 148*993b0882SAndroid Build Coastguard Worker const_iterator end() const; 149*993b0882SAndroid Build Coastguard Worker 150*993b0882SAndroid Build Coastguard Worker // Gets pointer to the underlying utf8 data. 151*993b0882SAndroid Build Coastguard Worker const char* data() const; 152*993b0882SAndroid Build Coastguard Worker 153*993b0882SAndroid Build Coastguard Worker // Gets length (in bytes) of the underlying utf8 data. 154*993b0882SAndroid Build Coastguard Worker int size_bytes() const; 155*993b0882SAndroid Build Coastguard Worker 156*993b0882SAndroid Build Coastguard Worker // Computes length (in number of Unicode codepoints) of the underlying utf8 157*993b0882SAndroid Build Coastguard Worker // data. 158*993b0882SAndroid Build Coastguard Worker // NOTE: Complexity O(n). 159*993b0882SAndroid Build Coastguard Worker int size_codepoints() const; 160*993b0882SAndroid Build Coastguard Worker 161*993b0882SAndroid Build Coastguard Worker bool empty() const; 162*993b0882SAndroid Build Coastguard Worker 163*993b0882SAndroid Build Coastguard Worker // Checks whether the underlying data is valid utf8 data. 164*993b0882SAndroid Build Coastguard Worker bool is_valid() const; 165*993b0882SAndroid Build Coastguard Worker 166*993b0882SAndroid Build Coastguard Worker bool operator==(const UnicodeText& other) const; 167*993b0882SAndroid Build Coastguard Worker 168*993b0882SAndroid Build Coastguard Worker // x.PointToUTF8(buf,len) changes x so that it points to buf 169*993b0882SAndroid Build Coastguard Worker // ("becomes an alias"). It does not take ownership or copy buf. 170*993b0882SAndroid Build Coastguard Worker // This function assumes that the input is interchange valid UTF8. 171*993b0882SAndroid Build Coastguard Worker UnicodeText& Copy(const UnicodeText& src); 172*993b0882SAndroid Build Coastguard Worker UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 173*993b0882SAndroid Build Coastguard Worker UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 174*993b0882SAndroid Build Coastguard Worker 175*993b0882SAndroid Build Coastguard Worker // Calling this may invalidate pointers to underlying data. 176*993b0882SAndroid Build Coastguard Worker UnicodeText& AppendUTF8(const char* utf8, int len); 177*993b0882SAndroid Build Coastguard Worker UnicodeText& push_back(char32 ch); 178*993b0882SAndroid Build Coastguard Worker void clear(); 179*993b0882SAndroid Build Coastguard Worker 180*993b0882SAndroid Build Coastguard Worker // Returns an iterator for each codepoint. 181*993b0882SAndroid Build Coastguard Worker std::vector<const_iterator> Codepoints() const; 182*993b0882SAndroid Build Coastguard Worker 183*993b0882SAndroid Build Coastguard Worker // Returns the list of codepoints of the UnicodeText. 184*993b0882SAndroid Build Coastguard Worker std::vector<char32> CodepointsChar32() const; 185*993b0882SAndroid Build Coastguard Worker 186*993b0882SAndroid Build Coastguard Worker std::string ToUTF8String() const; 187*993b0882SAndroid Build Coastguard Worker std::string UTF8Substring(int begin_codepoint, int end_codepoint) const; 188*993b0882SAndroid Build Coastguard Worker static std::string UTF8Substring(const const_iterator& it_begin, 189*993b0882SAndroid Build Coastguard Worker const const_iterator& it_end); 190*993b0882SAndroid Build Coastguard Worker static UnicodeText Substring(const UnicodeText& text, int begin_codepoint, 191*993b0882SAndroid Build Coastguard Worker int end_codepoint, bool do_copy = true); 192*993b0882SAndroid Build Coastguard Worker static UnicodeText Substring(const const_iterator& it_begin, 193*993b0882SAndroid Build Coastguard Worker const const_iterator& it_end, 194*993b0882SAndroid Build Coastguard Worker bool do_copy = true); 195*993b0882SAndroid Build Coastguard Worker 196*993b0882SAndroid Build Coastguard Worker private: 197*993b0882SAndroid Build Coastguard Worker friend class const_iterator; 198*993b0882SAndroid Build Coastguard Worker 199*993b0882SAndroid Build Coastguard Worker class Repr { // A byte-string. 200*993b0882SAndroid Build Coastguard Worker public: 201*993b0882SAndroid Build Coastguard Worker char* data_; 202*993b0882SAndroid Build Coastguard Worker int size_; 203*993b0882SAndroid Build Coastguard Worker int capacity_; 204*993b0882SAndroid Build Coastguard Worker bool ours_; // Do we own data_? 205*993b0882SAndroid Build Coastguard Worker Repr()206*993b0882SAndroid Build Coastguard Worker Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} 207*993b0882SAndroid Build Coastguard Worker Repr& operator=(Repr&& src); ~Repr()208*993b0882SAndroid Build Coastguard Worker ~Repr() { 209*993b0882SAndroid Build Coastguard Worker if (ours_) delete[] data_; 210*993b0882SAndroid Build Coastguard Worker } 211*993b0882SAndroid Build Coastguard Worker 212*993b0882SAndroid Build Coastguard Worker void clear(); 213*993b0882SAndroid Build Coastguard Worker void reserve(int capacity); 214*993b0882SAndroid Build Coastguard Worker void resize(int size); 215*993b0882SAndroid Build Coastguard Worker 216*993b0882SAndroid Build Coastguard Worker void append(const char* bytes, int byte_length); 217*993b0882SAndroid Build Coastguard Worker void Copy(const char* data, int size); 218*993b0882SAndroid Build Coastguard Worker void PointTo(const char* data, int size); 219*993b0882SAndroid Build Coastguard Worker 220*993b0882SAndroid Build Coastguard Worker private: 221*993b0882SAndroid Build Coastguard Worker Repr& operator=(const Repr&); 222*993b0882SAndroid Build Coastguard Worker Repr(const Repr& other); 223*993b0882SAndroid Build Coastguard Worker }; 224*993b0882SAndroid Build Coastguard Worker 225*993b0882SAndroid Build Coastguard Worker Repr repr_; 226*993b0882SAndroid Build Coastguard Worker }; 227*993b0882SAndroid Build Coastguard Worker 228*993b0882SAndroid Build Coastguard Worker typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator> 229*993b0882SAndroid Build Coastguard Worker UnicodeTextRange; 230*993b0882SAndroid Build Coastguard Worker 231*993b0882SAndroid Build Coastguard Worker // NOTE: The following are needed to avoid implicit conversion from char* to 232*993b0882SAndroid Build Coastguard Worker // std::string, or from ::string to std::string, because if this happens it 233*993b0882SAndroid Build Coastguard Worker // often results in invalid memory access to a temporary object created during 234*993b0882SAndroid Build Coastguard Worker // such conversion (if do_copy == false). 235*993b0882SAndroid Build Coastguard Worker // NOTE: These methods don't check if the input string is UTF8 well formed, for 236*993b0882SAndroid Build Coastguard Worker // efficiency reasons. Use UnicodeText::is_valid() when explicitly needed. 237*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, 238*993b0882SAndroid Build Coastguard Worker bool do_copy = true); 239*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true); 240*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true); 241*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true); 242*993b0882SAndroid Build Coastguard Worker UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true); 243*993b0882SAndroid Build Coastguard Worker 244*993b0882SAndroid Build Coastguard Worker inline logging::LoggingStringStream& operator<<( 245*993b0882SAndroid Build Coastguard Worker logging::LoggingStringStream& stream, const UnicodeText& message) { 246*993b0882SAndroid Build Coastguard Worker stream.message.append(message.data(), message.size_bytes()); 247*993b0882SAndroid Build Coastguard Worker return stream; 248*993b0882SAndroid Build Coastguard Worker } 249*993b0882SAndroid Build Coastguard Worker 250*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 251*993b0882SAndroid Build Coastguard Worker 252*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 253