xref: /aosp_15_r20/external/cronet/base/i18n/break_iterator.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1*6777b538SAndroid Build Coastguard Worker // Copyright 2011 The Chromium Authors
2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker 
5*6777b538SAndroid Build Coastguard Worker #include "base/i18n/break_iterator.h"
6*6777b538SAndroid Build Coastguard Worker 
7*6777b538SAndroid Build Coastguard Worker #include <stdint.h>
8*6777b538SAndroid Build Coastguard Worker #include <ostream>
9*6777b538SAndroid Build Coastguard Worker #include <string_view>
10*6777b538SAndroid Build Coastguard Worker 
11*6777b538SAndroid Build Coastguard Worker #include "base/check.h"
12*6777b538SAndroid Build Coastguard Worker #include "base/lazy_instance.h"
13*6777b538SAndroid Build Coastguard Worker #include "base/memory/raw_ptr.h"
14*6777b538SAndroid Build Coastguard Worker #include "base/notreached.h"
15*6777b538SAndroid Build Coastguard Worker #include "base/synchronization/lock.h"
16*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ubrk.h"
17*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/uchar.h"
18*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ustring.h"
19*6777b538SAndroid Build Coastguard Worker 
20*6777b538SAndroid Build Coastguard Worker namespace base {
21*6777b538SAndroid Build Coastguard Worker namespace i18n {
22*6777b538SAndroid Build Coastguard Worker 
23*6777b538SAndroid Build Coastguard Worker namespace {
24*6777b538SAndroid Build Coastguard Worker 
// The typical usage pattern of a break iterator is: create, use, destroy.
// The cache below supports multiple break iterators in the same thread and
// also avoids creating a new break iterator on every use. For each kind of
// break iterator (character, word, line and sentence, but NOT rule-based), we
// keep one instance in |main_| and lease it out. If some other code requests a
// lease before |main_| has been returned, we create a new instance of the
// iterator. This keeps at most 4 break iterators (one for each kind)
// unreleased until program destruction time.
33*6777b538SAndroid Build Coastguard Worker template <UBreakIteratorType break_type>
34*6777b538SAndroid Build Coastguard Worker class DefaultLocaleBreakIteratorCache {
35*6777b538SAndroid Build Coastguard Worker  public:
DefaultLocaleBreakIteratorCache()36*6777b538SAndroid Build Coastguard Worker   DefaultLocaleBreakIteratorCache() {
37*6777b538SAndroid Build Coastguard Worker     main_ = UBreakIteratorPtr(
38*6777b538SAndroid Build Coastguard Worker         ubrk_open(break_type, nullptr, nullptr, 0, &main_status_));
39*6777b538SAndroid Build Coastguard Worker     if (U_FAILURE(main_status_)) {
40*6777b538SAndroid Build Coastguard Worker       NOTREACHED() << "ubrk_open failed for type " << break_type
41*6777b538SAndroid Build Coastguard Worker                    << " with error " << main_status_;
42*6777b538SAndroid Build Coastguard Worker     }
43*6777b538SAndroid Build Coastguard Worker   }
Lease(UErrorCode & status)44*6777b538SAndroid Build Coastguard Worker   UBreakIteratorPtr Lease(UErrorCode& status) {
45*6777b538SAndroid Build Coastguard Worker     if (U_FAILURE(status)) {
46*6777b538SAndroid Build Coastguard Worker       return nullptr;
47*6777b538SAndroid Build Coastguard Worker     }
48*6777b538SAndroid Build Coastguard Worker     if (U_FAILURE(main_status_)) {
49*6777b538SAndroid Build Coastguard Worker       status = main_status_;
50*6777b538SAndroid Build Coastguard Worker       return nullptr;
51*6777b538SAndroid Build Coastguard Worker     }
52*6777b538SAndroid Build Coastguard Worker     {
53*6777b538SAndroid Build Coastguard Worker       AutoLock scoped_lock(lock_);
54*6777b538SAndroid Build Coastguard Worker       if (main_) {
55*6777b538SAndroid Build Coastguard Worker         return std::move(main_);
56*6777b538SAndroid Build Coastguard Worker       }
57*6777b538SAndroid Build Coastguard Worker     }
58*6777b538SAndroid Build Coastguard Worker 
59*6777b538SAndroid Build Coastguard Worker     // The main_ is already leased out to some other places, return a new
60*6777b538SAndroid Build Coastguard Worker     // object instead.
61*6777b538SAndroid Build Coastguard Worker     UBreakIteratorPtr result(
62*6777b538SAndroid Build Coastguard Worker         ubrk_open(break_type, nullptr, nullptr, 0, &status));
63*6777b538SAndroid Build Coastguard Worker     if (U_FAILURE(status)) {
64*6777b538SAndroid Build Coastguard Worker       NOTREACHED() << "ubrk_open failed for type " << break_type
65*6777b538SAndroid Build Coastguard Worker                    << " with error " << status;
66*6777b538SAndroid Build Coastguard Worker     }
67*6777b538SAndroid Build Coastguard Worker     return result;
68*6777b538SAndroid Build Coastguard Worker   }
69*6777b538SAndroid Build Coastguard Worker 
Return(UBreakIteratorPtr item)70*6777b538SAndroid Build Coastguard Worker   void Return(UBreakIteratorPtr item) {
71*6777b538SAndroid Build Coastguard Worker     AutoLock scoped_lock(lock_);
72*6777b538SAndroid Build Coastguard Worker     if (!main_) {
73*6777b538SAndroid Build Coastguard Worker       main_ = std::move(item);
74*6777b538SAndroid Build Coastguard Worker     }
75*6777b538SAndroid Build Coastguard Worker   }
76*6777b538SAndroid Build Coastguard Worker 
77*6777b538SAndroid Build Coastguard Worker  private:
78*6777b538SAndroid Build Coastguard Worker   UErrorCode main_status_ = U_ZERO_ERROR;
79*6777b538SAndroid Build Coastguard Worker   UBreakIteratorPtr main_ GUARDED_BY(lock_);
80*6777b538SAndroid Build Coastguard Worker   Lock lock_;
81*6777b538SAndroid Build Coastguard Worker };
82*6777b538SAndroid Build Coastguard Worker 
83*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_CHARACTER>>::Leaky
84*6777b538SAndroid Build Coastguard Worker     char_break_cache = LAZY_INSTANCE_INITIALIZER;
85*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_WORD>>::Leaky
86*6777b538SAndroid Build Coastguard Worker     word_break_cache = LAZY_INSTANCE_INITIALIZER;
87*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_SENTENCE>>::Leaky
88*6777b538SAndroid Build Coastguard Worker     sentence_break_cache = LAZY_INSTANCE_INITIALIZER;
89*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_LINE>>::Leaky
90*6777b538SAndroid Build Coastguard Worker     line_break_cache = LAZY_INSTANCE_INITIALIZER;
91*6777b538SAndroid Build Coastguard Worker 
92*6777b538SAndroid Build Coastguard Worker }  // namespace
93*6777b538SAndroid Build Coastguard Worker 
operator ()(UBreakIterator * ptr)94*6777b538SAndroid Build Coastguard Worker void UBreakIteratorDeleter::operator()(UBreakIterator* ptr) {
95*6777b538SAndroid Build Coastguard Worker   if (ptr) {
96*6777b538SAndroid Build Coastguard Worker     ubrk_close(ptr);
97*6777b538SAndroid Build Coastguard Worker   }
98*6777b538SAndroid Build Coastguard Worker }
99*6777b538SAndroid Build Coastguard Worker 
BreakIterator(std::u16string_view str,BreakType break_type)100*6777b538SAndroid Build Coastguard Worker BreakIterator::BreakIterator(std::u16string_view str, BreakType break_type)
101*6777b538SAndroid Build Coastguard Worker     : string_(str), break_type_(break_type) {}
102*6777b538SAndroid Build Coastguard Worker 
BreakIterator(std::u16string_view str,const std::u16string & rules)103*6777b538SAndroid Build Coastguard Worker BreakIterator::BreakIterator(std::u16string_view str,
104*6777b538SAndroid Build Coastguard Worker                              const std::u16string& rules)
105*6777b538SAndroid Build Coastguard Worker     : string_(str), rules_(rules), break_type_(RULE_BASED) {}
106*6777b538SAndroid Build Coastguard Worker 
~BreakIterator()107*6777b538SAndroid Build Coastguard Worker BreakIterator::~BreakIterator() {
108*6777b538SAndroid Build Coastguard Worker   switch (break_type_) {
109*6777b538SAndroid Build Coastguard Worker     case RULE_BASED:
110*6777b538SAndroid Build Coastguard Worker       return;
111*6777b538SAndroid Build Coastguard Worker     case BREAK_CHARACTER:
112*6777b538SAndroid Build Coastguard Worker       char_break_cache.Pointer()->Return(std::move(iter_));
113*6777b538SAndroid Build Coastguard Worker       return;
114*6777b538SAndroid Build Coastguard Worker     case BREAK_WORD:
115*6777b538SAndroid Build Coastguard Worker       word_break_cache.Pointer()->Return(std::move(iter_));
116*6777b538SAndroid Build Coastguard Worker       return;
117*6777b538SAndroid Build Coastguard Worker     case BREAK_SENTENCE:
118*6777b538SAndroid Build Coastguard Worker       sentence_break_cache.Pointer()->Return(std::move(iter_));
119*6777b538SAndroid Build Coastguard Worker       return;
120*6777b538SAndroid Build Coastguard Worker     case BREAK_LINE:
121*6777b538SAndroid Build Coastguard Worker     case BREAK_NEWLINE:
122*6777b538SAndroid Build Coastguard Worker       line_break_cache.Pointer()->Return(std::move(iter_));
123*6777b538SAndroid Build Coastguard Worker       return;
124*6777b538SAndroid Build Coastguard Worker   }
125*6777b538SAndroid Build Coastguard Worker }
126*6777b538SAndroid Build Coastguard Worker 
Init()127*6777b538SAndroid Build Coastguard Worker bool BreakIterator::Init() {
128*6777b538SAndroid Build Coastguard Worker   UErrorCode status = U_ZERO_ERROR;
129*6777b538SAndroid Build Coastguard Worker   UParseError parse_error;
130*6777b538SAndroid Build Coastguard Worker   switch (break_type_) {
131*6777b538SAndroid Build Coastguard Worker     case BREAK_CHARACTER:
132*6777b538SAndroid Build Coastguard Worker       iter_ = char_break_cache.Pointer()->Lease(status);
133*6777b538SAndroid Build Coastguard Worker       break;
134*6777b538SAndroid Build Coastguard Worker     case BREAK_WORD:
135*6777b538SAndroid Build Coastguard Worker       iter_ = word_break_cache.Pointer()->Lease(status);
136*6777b538SAndroid Build Coastguard Worker       break;
137*6777b538SAndroid Build Coastguard Worker     case BREAK_SENTENCE:
138*6777b538SAndroid Build Coastguard Worker       iter_ = sentence_break_cache.Pointer()->Lease(status);
139*6777b538SAndroid Build Coastguard Worker       break;
140*6777b538SAndroid Build Coastguard Worker     case BREAK_LINE:
141*6777b538SAndroid Build Coastguard Worker     case BREAK_NEWLINE:
142*6777b538SAndroid Build Coastguard Worker       iter_ = line_break_cache.Pointer()->Lease(status);
143*6777b538SAndroid Build Coastguard Worker       break;
144*6777b538SAndroid Build Coastguard Worker     case RULE_BASED:
145*6777b538SAndroid Build Coastguard Worker       iter_ = UBreakIteratorPtr(
146*6777b538SAndroid Build Coastguard Worker           ubrk_openRules(rules_.c_str(), static_cast<int32_t>(rules_.length()),
147*6777b538SAndroid Build Coastguard Worker                          nullptr, 0, &parse_error, &status));
148*6777b538SAndroid Build Coastguard Worker       if (U_FAILURE(status)) {
149*6777b538SAndroid Build Coastguard Worker         NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
150*6777b538SAndroid Build Coastguard Worker                      << parse_error.line << ", offset " << parse_error.offset;
151*6777b538SAndroid Build Coastguard Worker       }
152*6777b538SAndroid Build Coastguard Worker       break;
153*6777b538SAndroid Build Coastguard Worker   }
154*6777b538SAndroid Build Coastguard Worker 
155*6777b538SAndroid Build Coastguard Worker   if (U_FAILURE(status) || iter_ == nullptr) {
156*6777b538SAndroid Build Coastguard Worker     return false;
157*6777b538SAndroid Build Coastguard Worker   }
158*6777b538SAndroid Build Coastguard Worker 
159*6777b538SAndroid Build Coastguard Worker   if (string_.data() != nullptr) {
160*6777b538SAndroid Build Coastguard Worker     ubrk_setText(iter_.get(), string_.data(),
161*6777b538SAndroid Build Coastguard Worker                  static_cast<int32_t>(string_.size()), &status);
162*6777b538SAndroid Build Coastguard Worker     if (U_FAILURE(status)) {
163*6777b538SAndroid Build Coastguard Worker       return false;
164*6777b538SAndroid Build Coastguard Worker     }
165*6777b538SAndroid Build Coastguard Worker   }
166*6777b538SAndroid Build Coastguard Worker 
167*6777b538SAndroid Build Coastguard Worker   // Move the iterator to the beginning of the string.
168*6777b538SAndroid Build Coastguard Worker   ubrk_first(iter_.get());
169*6777b538SAndroid Build Coastguard Worker   return true;
170*6777b538SAndroid Build Coastguard Worker }
171*6777b538SAndroid Build Coastguard Worker 
Advance()172*6777b538SAndroid Build Coastguard Worker bool BreakIterator::Advance() {
173*6777b538SAndroid Build Coastguard Worker   int32_t pos;
174*6777b538SAndroid Build Coastguard Worker   int32_t status;
175*6777b538SAndroid Build Coastguard Worker   prev_ = pos_;
176*6777b538SAndroid Build Coastguard Worker   switch (break_type_) {
177*6777b538SAndroid Build Coastguard Worker     case BREAK_CHARACTER:
178*6777b538SAndroid Build Coastguard Worker     case BREAK_WORD:
179*6777b538SAndroid Build Coastguard Worker     case BREAK_LINE:
180*6777b538SAndroid Build Coastguard Worker     case BREAK_SENTENCE:
181*6777b538SAndroid Build Coastguard Worker     case RULE_BASED:
182*6777b538SAndroid Build Coastguard Worker       pos = ubrk_next(iter_.get());
183*6777b538SAndroid Build Coastguard Worker       if (pos == UBRK_DONE) {
184*6777b538SAndroid Build Coastguard Worker         pos_ = npos;
185*6777b538SAndroid Build Coastguard Worker         return false;
186*6777b538SAndroid Build Coastguard Worker       }
187*6777b538SAndroid Build Coastguard Worker       pos_ = static_cast<size_t>(pos);
188*6777b538SAndroid Build Coastguard Worker       return true;
189*6777b538SAndroid Build Coastguard Worker     case BREAK_NEWLINE:
190*6777b538SAndroid Build Coastguard Worker       do {
191*6777b538SAndroid Build Coastguard Worker         pos = ubrk_next(iter_.get());
192*6777b538SAndroid Build Coastguard Worker         if (pos == UBRK_DONE)
193*6777b538SAndroid Build Coastguard Worker           break;
194*6777b538SAndroid Build Coastguard Worker         pos_ = static_cast<size_t>(pos);
195*6777b538SAndroid Build Coastguard Worker         status = ubrk_getRuleStatus(iter_.get());
196*6777b538SAndroid Build Coastguard Worker       } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
197*6777b538SAndroid Build Coastguard Worker       if (pos == UBRK_DONE && prev_ == pos_) {
198*6777b538SAndroid Build Coastguard Worker         pos_ = npos;
199*6777b538SAndroid Build Coastguard Worker         return false;
200*6777b538SAndroid Build Coastguard Worker       }
201*6777b538SAndroid Build Coastguard Worker       return true;
202*6777b538SAndroid Build Coastguard Worker   }
203*6777b538SAndroid Build Coastguard Worker }
204*6777b538SAndroid Build Coastguard Worker 
SetText(std::u16string_view text)205*6777b538SAndroid Build Coastguard Worker bool BreakIterator::SetText(std::u16string_view text) {
206*6777b538SAndroid Build Coastguard Worker   UErrorCode status = U_ZERO_ERROR;
207*6777b538SAndroid Build Coastguard Worker   ubrk_setText(iter_.get(), text.data(), text.length(), &status);
208*6777b538SAndroid Build Coastguard Worker   pos_ = 0;  // implicit when ubrk_setText is done
209*6777b538SAndroid Build Coastguard Worker   prev_ = npos;
210*6777b538SAndroid Build Coastguard Worker   if (U_FAILURE(status)) {
211*6777b538SAndroid Build Coastguard Worker     NOTREACHED() << "ubrk_setText failed";
212*6777b538SAndroid Build Coastguard Worker     return false;
213*6777b538SAndroid Build Coastguard Worker   }
214*6777b538SAndroid Build Coastguard Worker   string_ = text;
215*6777b538SAndroid Build Coastguard Worker   return true;
216*6777b538SAndroid Build Coastguard Worker }
217*6777b538SAndroid Build Coastguard Worker 
IsWord() const218*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsWord() const {
219*6777b538SAndroid Build Coastguard Worker   return GetWordBreakStatus() == IS_WORD_BREAK;
220*6777b538SAndroid Build Coastguard Worker }
221*6777b538SAndroid Build Coastguard Worker 
GetWordBreakStatus() const222*6777b538SAndroid Build Coastguard Worker BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
223*6777b538SAndroid Build Coastguard Worker   int32_t status = ubrk_getRuleStatus(iter_.get());
224*6777b538SAndroid Build Coastguard Worker   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
225*6777b538SAndroid Build Coastguard Worker     return IS_LINE_OR_CHAR_BREAK;
226*6777b538SAndroid Build Coastguard Worker   // In ICU 60, trying to advance past the end of the text does not change
227*6777b538SAndroid Build Coastguard Worker   // |status| so that |pos_| has to be checked as well as |status|.
228*6777b538SAndroid Build Coastguard Worker   // See http://bugs.icu-project.org/trac/ticket/13447 .
229*6777b538SAndroid Build Coastguard Worker   return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
230*6777b538SAndroid Build Coastguard Worker                                                     : IS_WORD_BREAK;
231*6777b538SAndroid Build Coastguard Worker }
232*6777b538SAndroid Build Coastguard Worker 
IsEndOfWord(size_t position) const233*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsEndOfWord(size_t position) const {
234*6777b538SAndroid Build Coastguard Worker   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
235*6777b538SAndroid Build Coastguard Worker     return false;
236*6777b538SAndroid Build Coastguard Worker 
237*6777b538SAndroid Build Coastguard Worker   UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
238*6777b538SAndroid Build Coastguard Worker   int32_t status = ubrk_getRuleStatus(iter_.get());
239*6777b538SAndroid Build Coastguard Worker   return (!!boundary && status != UBRK_WORD_NONE);
240*6777b538SAndroid Build Coastguard Worker }
241*6777b538SAndroid Build Coastguard Worker 
IsStartOfWord(size_t position) const242*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsStartOfWord(size_t position) const {
243*6777b538SAndroid Build Coastguard Worker   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
244*6777b538SAndroid Build Coastguard Worker     return false;
245*6777b538SAndroid Build Coastguard Worker 
246*6777b538SAndroid Build Coastguard Worker   UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
247*6777b538SAndroid Build Coastguard Worker   ubrk_next(iter_.get());
248*6777b538SAndroid Build Coastguard Worker   int32_t next_status = ubrk_getRuleStatus(iter_.get());
249*6777b538SAndroid Build Coastguard Worker   return (!!boundary && next_status != UBRK_WORD_NONE);
250*6777b538SAndroid Build Coastguard Worker }
251*6777b538SAndroid Build Coastguard Worker 
IsSentenceBoundary(size_t position) const252*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsSentenceBoundary(size_t position) const {
253*6777b538SAndroid Build Coastguard Worker   if (break_type_ != BREAK_SENTENCE && break_type_ != RULE_BASED)
254*6777b538SAndroid Build Coastguard Worker     return false;
255*6777b538SAndroid Build Coastguard Worker 
256*6777b538SAndroid Build Coastguard Worker   return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
257*6777b538SAndroid Build Coastguard Worker }
258*6777b538SAndroid Build Coastguard Worker 
IsGraphemeBoundary(size_t position) const259*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsGraphemeBoundary(size_t position) const {
260*6777b538SAndroid Build Coastguard Worker   if (break_type_ != BREAK_CHARACTER)
261*6777b538SAndroid Build Coastguard Worker     return false;
262*6777b538SAndroid Build Coastguard Worker 
263*6777b538SAndroid Build Coastguard Worker   return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
264*6777b538SAndroid Build Coastguard Worker }
265*6777b538SAndroid Build Coastguard Worker 
GetString() const266*6777b538SAndroid Build Coastguard Worker std::u16string BreakIterator::GetString() const {
267*6777b538SAndroid Build Coastguard Worker   return std::u16string(GetStringPiece());
268*6777b538SAndroid Build Coastguard Worker }
269*6777b538SAndroid Build Coastguard Worker 
GetStringPiece() const270*6777b538SAndroid Build Coastguard Worker std::u16string_view BreakIterator::GetStringPiece() const {
271*6777b538SAndroid Build Coastguard Worker   DCHECK(prev_ != npos && pos_ != npos);
272*6777b538SAndroid Build Coastguard Worker   return string_.substr(prev_, pos_ - prev_);
273*6777b538SAndroid Build Coastguard Worker }
274*6777b538SAndroid Build Coastguard Worker 
275*6777b538SAndroid Build Coastguard Worker }  // namespace i18n
276*6777b538SAndroid Build Coastguard Worker }  // namespace base
277