// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_I18N_BREAK_ITERATOR_H_
#define BASE_I18N_BREAK_ITERATOR_H_

#include <stddef.h>

#include <memory>
#include <string>
#include <string_view>

#include "base/i18n/base_i18n_export.h"
#include "base/memory/raw_ptr.h"

// The BreakIterator class iterates through the words, word breaks, and
// line breaks in a UTF-16 string.
//
// It provides several modes, BREAK_WORD, BREAK_LINE, BREAK_NEWLINE, and
// BREAK_SENTENCE, which modify how characters are aggregated into the returned
// string.
//
// Under BREAK_WORD mode, once a word is encountered any non-word
// characters are not included in the returned string (e.g. in the
// UTF-16 equivalent of the string " foo bar! ", the word breaks are at
// the periods in ". .foo. .bar.!. .").
// Note that Chinese/Japanese/Thai do not use spaces between words so that
// boundaries can fall in the middle of a continuous run of non-space /
// non-punctuation characters.
//
// Under BREAK_LINE mode, once a line breaking opportunity is encountered,
// any non-word characters are included in the returned string, breaking
// only when a space-equivalent character or a line breaking opportunity
// is encountered (e.g. in the UTF-16 equivalent of the string " foo bar! ",
// the breaks are at the periods in ". .foo .bar! .").
//
// Note that lines can be broken at any character/syllable/grapheme cluster
// boundary in Chinese/Japanese/Korean and at word boundaries in Thai
// (Thai does not use spaces between words). Therefore, this is NOT the same
// as breaking only at space-equivalent characters, as its former
// name (BREAK_SPACE) implied.
//
// Under BREAK_NEWLINE mode, all characters are included in the returned
// string, breaking only when a newline-equivalent character is encountered
// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
// breaks are at the periods in ".foo\n.bar!\n.\n.").
//
// Under BREAK_SENTENCE mode, all characters are included in the returned
// string, breaking only on sentence boundaries defined in "Unicode Standard
// Annex #29: Text Segmentation." Whitespace immediately following the sentence
// is also included. For example, in the UTF-16 equivalent of the string
// "foo bar! baz qux?" the breaks are at the periods in ".foo bar! .baz qux?."
//
// To extract the words from a string, move a BREAK_WORD BreakIterator
// through the string and test whether IsWord() is true. E.g.,
//   BreakIterator iter(str, BreakIterator::BREAK_WORD);
//   if (!iter.Init())
//     return false;
//   while (iter.Advance()) {
//     if (iter.IsWord()) {
//       // Region [iter.prev(), iter.pos()) contains a word.
//       VLOG(1) << "word: " << iter.GetString();
//     }
//   }

// ICU iterator type. It is forward declared to avoid transitively including
// the full ICU headers in every dependent file.
69*6777b538SAndroid Build Coastguard Worker struct UBreakIterator; 70*6777b538SAndroid Build Coastguard Worker 71*6777b538SAndroid Build Coastguard Worker namespace base { 72*6777b538SAndroid Build Coastguard Worker namespace i18n { 73*6777b538SAndroid Build Coastguard Worker 74*6777b538SAndroid Build Coastguard Worker struct UBreakIteratorDeleter { 75*6777b538SAndroid Build Coastguard Worker void operator()(UBreakIterator*); 76*6777b538SAndroid Build Coastguard Worker }; 77*6777b538SAndroid Build Coastguard Worker using UBreakIteratorPtr = 78*6777b538SAndroid Build Coastguard Worker std::unique_ptr<UBreakIterator, UBreakIteratorDeleter>; 79*6777b538SAndroid Build Coastguard Worker 80*6777b538SAndroid Build Coastguard Worker class BASE_I18N_EXPORT BreakIterator { 81*6777b538SAndroid Build Coastguard Worker public: 82*6777b538SAndroid Build Coastguard Worker enum BreakType { 83*6777b538SAndroid Build Coastguard Worker BREAK_WORD, 84*6777b538SAndroid Build Coastguard Worker BREAK_LINE, 85*6777b538SAndroid Build Coastguard Worker // TODO(jshin): Remove this after reviewing call sites. 86*6777b538SAndroid Build Coastguard Worker // If call sites really need break only on space-like characters 87*6777b538SAndroid Build Coastguard Worker // implement it separately. 88*6777b538SAndroid Build Coastguard Worker BREAK_SPACE = BREAK_LINE, 89*6777b538SAndroid Build Coastguard Worker BREAK_NEWLINE, 90*6777b538SAndroid Build Coastguard Worker BREAK_CHARACTER, 91*6777b538SAndroid Build Coastguard Worker // But don't remove this one! 92*6777b538SAndroid Build Coastguard Worker RULE_BASED, 93*6777b538SAndroid Build Coastguard Worker BREAK_SENTENCE, 94*6777b538SAndroid Build Coastguard Worker }; 95*6777b538SAndroid Build Coastguard Worker 96*6777b538SAndroid Build Coastguard Worker enum WordBreakStatus { 97*6777b538SAndroid Build Coastguard Worker // The end of text that the iterator recognizes as word characters. 
98*6777b538SAndroid Build Coastguard Worker // Non-word characters are things like punctuation and spaces. 99*6777b538SAndroid Build Coastguard Worker IS_WORD_BREAK, 100*6777b538SAndroid Build Coastguard Worker // Characters that the iterator can skip past, such as punctuation, 101*6777b538SAndroid Build Coastguard Worker // whitespace, and, if using RULE_BASED mode, characters from another 102*6777b538SAndroid Build Coastguard Worker // character set. 103*6777b538SAndroid Build Coastguard Worker IS_SKIPPABLE_WORD, 104*6777b538SAndroid Build Coastguard Worker // Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for 105*6777b538SAndroid Build Coastguard Worker // newlines, line breaks, and character breaks. 106*6777b538SAndroid Build Coastguard Worker IS_LINE_OR_CHAR_BREAK 107*6777b538SAndroid Build Coastguard Worker }; 108*6777b538SAndroid Build Coastguard Worker 109*6777b538SAndroid Build Coastguard Worker static constexpr size_t npos = static_cast<size_t>(-1); 110*6777b538SAndroid Build Coastguard Worker 111*6777b538SAndroid Build Coastguard Worker // Requires |str| to live as long as the BreakIterator does. 112*6777b538SAndroid Build Coastguard Worker BreakIterator(std::u16string_view str, BreakType break_type); 113*6777b538SAndroid Build Coastguard Worker // Make a rule-based iterator. BreakType == RULE_BASED is implied. 114*6777b538SAndroid Build Coastguard Worker // TODO(andrewhayden): This signature could easily be misinterpreted as 115*6777b538SAndroid Build Coastguard Worker // "(const std::u16string& str, const std::u16string& locale)". We should do 116*6777b538SAndroid Build Coastguard Worker // something better. 
117*6777b538SAndroid Build Coastguard Worker BreakIterator(std::u16string_view str, const std::u16string& rules); 118*6777b538SAndroid Build Coastguard Worker 119*6777b538SAndroid Build Coastguard Worker BreakIterator(const BreakIterator&) = delete; 120*6777b538SAndroid Build Coastguard Worker BreakIterator& operator=(const BreakIterator&) = delete; 121*6777b538SAndroid Build Coastguard Worker 122*6777b538SAndroid Build Coastguard Worker ~BreakIterator(); 123*6777b538SAndroid Build Coastguard Worker 124*6777b538SAndroid Build Coastguard Worker // Init() must be called before any of the iterators are valid. 125*6777b538SAndroid Build Coastguard Worker // Returns false if ICU failed to initialize. 126*6777b538SAndroid Build Coastguard Worker bool Init(); 127*6777b538SAndroid Build Coastguard Worker 128*6777b538SAndroid Build Coastguard Worker // Advance to the next break. Returns false if we've run past the end of 129*6777b538SAndroid Build Coastguard Worker // the string. (Note that the very last "break" is after the final 130*6777b538SAndroid Build Coastguard Worker // character in the string, and when we advance to that position it's the 131*6777b538SAndroid Build Coastguard Worker // last time Advance() returns true.) 132*6777b538SAndroid Build Coastguard Worker bool Advance(); 133*6777b538SAndroid Build Coastguard Worker 134*6777b538SAndroid Build Coastguard Worker // Updates the text used by the iterator, resetting the iterator as if 135*6777b538SAndroid Build Coastguard Worker // if Init() had been called again. Any old state is lost. Returns true 136*6777b538SAndroid Build Coastguard Worker // unless there is an error setting the text. 137*6777b538SAndroid Build Coastguard Worker bool SetText(std::u16string_view text); 138*6777b538SAndroid Build Coastguard Worker 139*6777b538SAndroid Build Coastguard Worker // Under BREAK_WORD mode, returns true if the break we just hit is the 140*6777b538SAndroid Build Coastguard Worker // end of a word. 
(Otherwise, the break iterator just skipped over e.g. 141*6777b538SAndroid Build Coastguard Worker // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes, 142*6777b538SAndroid Build Coastguard Worker // this distinction doesn't apply and it always returns false. 143*6777b538SAndroid Build Coastguard Worker bool IsWord() const; 144*6777b538SAndroid Build Coastguard Worker 145*6777b538SAndroid Build Coastguard Worker // Under BREAK_WORD mode: 146*6777b538SAndroid Build Coastguard Worker // - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or 147*6777b538SAndroid Build Coastguard Worker // spaces, are found. 148*6777b538SAndroid Build Coastguard Worker // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence 149*6777b538SAndroid Build Coastguard Worker // of word characters. 150*6777b538SAndroid Build Coastguard Worker // Under RULE_BASED mode: 151*6777b538SAndroid Build Coastguard Worker // - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set 152*6777b538SAndroid Build Coastguard Worker // or non-word characters, such as punctuation or spaces, are found. 153*6777b538SAndroid Build Coastguard Worker // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence 154*6777b538SAndroid Build Coastguard Worker // of word characters that are in the rules' character set. 155*6777b538SAndroid Build Coastguard Worker // Not under BREAK_WORD or RULE_BASED mode: 156*6777b538SAndroid Build Coastguard Worker // - Returns IS_LINE_OR_CHAR_BREAK. 157*6777b538SAndroid Build Coastguard Worker BreakIterator::WordBreakStatus GetWordBreakStatus() const; 158*6777b538SAndroid Build Coastguard Worker 159*6777b538SAndroid Build Coastguard Worker // Under BREAK_WORD mode, returns true if |position| is at the end of word or 160*6777b538SAndroid Build Coastguard Worker // at the start of word. 
It always returns false under modes that are not 161*6777b538SAndroid Build Coastguard Worker // BREAK_WORD or RULE_BASED. 162*6777b538SAndroid Build Coastguard Worker bool IsEndOfWord(size_t position) const; 163*6777b538SAndroid Build Coastguard Worker bool IsStartOfWord(size_t position) const; 164*6777b538SAndroid Build Coastguard Worker 165*6777b538SAndroid Build Coastguard Worker // Under BREAK_SENTENCE mode, returns true if |position| is at a sentence 166*6777b538SAndroid Build Coastguard Worker // boundary. It always returns false under modes that are not BREAK_SENTENCE 167*6777b538SAndroid Build Coastguard Worker // or RULE_BASED. 168*6777b538SAndroid Build Coastguard Worker bool IsSentenceBoundary(size_t position) const; 169*6777b538SAndroid Build Coastguard Worker 170*6777b538SAndroid Build Coastguard Worker // Under BREAK_CHARACTER mode, returns whether |position| is a Unicode 171*6777b538SAndroid Build Coastguard Worker // grapheme boundary. 172*6777b538SAndroid Build Coastguard Worker bool IsGraphemeBoundary(size_t position) const; 173*6777b538SAndroid Build Coastguard Worker 174*6777b538SAndroid Build Coastguard Worker // Returns the string between prev() and pos(). 175*6777b538SAndroid Build Coastguard Worker // Advance() must have been called successfully at least once for pos() to 176*6777b538SAndroid Build Coastguard Worker // have advanced to somewhere useful. 177*6777b538SAndroid Build Coastguard Worker std::u16string GetString() const; 178*6777b538SAndroid Build Coastguard Worker 179*6777b538SAndroid Build Coastguard Worker std::u16string_view GetStringPiece() const; 180*6777b538SAndroid Build Coastguard Worker 181*6777b538SAndroid Build Coastguard Worker // Returns the value of pos() returned before Advance() was last called. 
prev()182*6777b538SAndroid Build Coastguard Worker size_t prev() const { return prev_; } 183*6777b538SAndroid Build Coastguard Worker 184*6777b538SAndroid Build Coastguard Worker // Returns the current break position within the string, 185*6777b538SAndroid Build Coastguard Worker // or BreakIterator::npos when done. pos()186*6777b538SAndroid Build Coastguard Worker size_t pos() const { return pos_; } 187*6777b538SAndroid Build Coastguard Worker 188*6777b538SAndroid Build Coastguard Worker private: 189*6777b538SAndroid Build Coastguard Worker UBreakIteratorPtr iter_; 190*6777b538SAndroid Build Coastguard Worker 191*6777b538SAndroid Build Coastguard Worker // The string we're iterating over. Can be changed with SetText(...) 192*6777b538SAndroid Build Coastguard Worker std::u16string_view string_; 193*6777b538SAndroid Build Coastguard Worker 194*6777b538SAndroid Build Coastguard Worker // Rules for our iterator. Mutually exclusive with break_type_. 195*6777b538SAndroid Build Coastguard Worker const std::u16string rules_; 196*6777b538SAndroid Build Coastguard Worker 197*6777b538SAndroid Build Coastguard Worker // The breaking style (word/space/newline). Mutually exclusive with rules_ 198*6777b538SAndroid Build Coastguard Worker const BreakType break_type_; 199*6777b538SAndroid Build Coastguard Worker 200*6777b538SAndroid Build Coastguard Worker // Previous and current iterator positions. 201*6777b538SAndroid Build Coastguard Worker size_t prev_ = npos; 202*6777b538SAndroid Build Coastguard Worker size_t pos_ = 0; 203*6777b538SAndroid Build Coastguard Worker }; 204*6777b538SAndroid Build Coastguard Worker 205*6777b538SAndroid Build Coastguard Worker } // namespace i18n 206*6777b538SAndroid Build Coastguard Worker } // namespace base 207*6777b538SAndroid Build Coastguard Worker 208*6777b538SAndroid Build Coastguard Worker #endif // BASE_I18N_BREAK_ITERATOR_H_ 209