1*6777b538SAndroid Build Coastguard Worker // Copyright 2011 The Chromium Authors
2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker
5*6777b538SAndroid Build Coastguard Worker #include "base/i18n/break_iterator.h"
6*6777b538SAndroid Build Coastguard Worker
7*6777b538SAndroid Build Coastguard Worker #include <stdint.h>
8*6777b538SAndroid Build Coastguard Worker #include <ostream>
9*6777b538SAndroid Build Coastguard Worker #include <string_view>
10*6777b538SAndroid Build Coastguard Worker
11*6777b538SAndroid Build Coastguard Worker #include "base/check.h"
12*6777b538SAndroid Build Coastguard Worker #include "base/lazy_instance.h"
13*6777b538SAndroid Build Coastguard Worker #include "base/memory/raw_ptr.h"
14*6777b538SAndroid Build Coastguard Worker #include "base/notreached.h"
15*6777b538SAndroid Build Coastguard Worker #include "base/synchronization/lock.h"
16*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ubrk.h"
17*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/uchar.h"
18*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ustring.h"
19*6777b538SAndroid Build Coastguard Worker
20*6777b538SAndroid Build Coastguard Worker namespace base {
21*6777b538SAndroid Build Coastguard Worker namespace i18n {
22*6777b538SAndroid Build Coastguard Worker
23*6777b538SAndroid Build Coastguard Worker namespace {
24*6777b538SAndroid Build Coastguard Worker
// We found the usage pattern of break iterators is to create, use and destroy.
// The following cache supports multiple break iterators in the same thread
// and also avoids creating a break iterator many times. For each kind of break
// iterator (character, word, line and sentence, but NOT rule), we keep one of
// them in |main_| and lease it out. If some other code requests a lease
// before |main_| is returned, we create a new instance of the iterator.
// This will keep at most 4 break iterators (one for each kind) unreleased until
// the program destruction time.
33*6777b538SAndroid Build Coastguard Worker template <UBreakIteratorType break_type>
34*6777b538SAndroid Build Coastguard Worker class DefaultLocaleBreakIteratorCache {
35*6777b538SAndroid Build Coastguard Worker public:
DefaultLocaleBreakIteratorCache()36*6777b538SAndroid Build Coastguard Worker DefaultLocaleBreakIteratorCache() {
37*6777b538SAndroid Build Coastguard Worker main_ = UBreakIteratorPtr(
38*6777b538SAndroid Build Coastguard Worker ubrk_open(break_type, nullptr, nullptr, 0, &main_status_));
39*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(main_status_)) {
40*6777b538SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_open failed for type " << break_type
41*6777b538SAndroid Build Coastguard Worker << " with error " << main_status_;
42*6777b538SAndroid Build Coastguard Worker }
43*6777b538SAndroid Build Coastguard Worker }
Lease(UErrorCode & status)44*6777b538SAndroid Build Coastguard Worker UBreakIteratorPtr Lease(UErrorCode& status) {
45*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
46*6777b538SAndroid Build Coastguard Worker return nullptr;
47*6777b538SAndroid Build Coastguard Worker }
48*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(main_status_)) {
49*6777b538SAndroid Build Coastguard Worker status = main_status_;
50*6777b538SAndroid Build Coastguard Worker return nullptr;
51*6777b538SAndroid Build Coastguard Worker }
52*6777b538SAndroid Build Coastguard Worker {
53*6777b538SAndroid Build Coastguard Worker AutoLock scoped_lock(lock_);
54*6777b538SAndroid Build Coastguard Worker if (main_) {
55*6777b538SAndroid Build Coastguard Worker return std::move(main_);
56*6777b538SAndroid Build Coastguard Worker }
57*6777b538SAndroid Build Coastguard Worker }
58*6777b538SAndroid Build Coastguard Worker
59*6777b538SAndroid Build Coastguard Worker // The main_ is already leased out to some other places, return a new
60*6777b538SAndroid Build Coastguard Worker // object instead.
61*6777b538SAndroid Build Coastguard Worker UBreakIteratorPtr result(
62*6777b538SAndroid Build Coastguard Worker ubrk_open(break_type, nullptr, nullptr, 0, &status));
63*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
64*6777b538SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_open failed for type " << break_type
65*6777b538SAndroid Build Coastguard Worker << " with error " << status;
66*6777b538SAndroid Build Coastguard Worker }
67*6777b538SAndroid Build Coastguard Worker return result;
68*6777b538SAndroid Build Coastguard Worker }
69*6777b538SAndroid Build Coastguard Worker
Return(UBreakIteratorPtr item)70*6777b538SAndroid Build Coastguard Worker void Return(UBreakIteratorPtr item) {
71*6777b538SAndroid Build Coastguard Worker AutoLock scoped_lock(lock_);
72*6777b538SAndroid Build Coastguard Worker if (!main_) {
73*6777b538SAndroid Build Coastguard Worker main_ = std::move(item);
74*6777b538SAndroid Build Coastguard Worker }
75*6777b538SAndroid Build Coastguard Worker }
76*6777b538SAndroid Build Coastguard Worker
77*6777b538SAndroid Build Coastguard Worker private:
78*6777b538SAndroid Build Coastguard Worker UErrorCode main_status_ = U_ZERO_ERROR;
79*6777b538SAndroid Build Coastguard Worker UBreakIteratorPtr main_ GUARDED_BY(lock_);
80*6777b538SAndroid Build Coastguard Worker Lock lock_;
81*6777b538SAndroid Build Coastguard Worker };
82*6777b538SAndroid Build Coastguard Worker
83*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_CHARACTER>>::Leaky
84*6777b538SAndroid Build Coastguard Worker char_break_cache = LAZY_INSTANCE_INITIALIZER;
85*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_WORD>>::Leaky
86*6777b538SAndroid Build Coastguard Worker word_break_cache = LAZY_INSTANCE_INITIALIZER;
87*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_SENTENCE>>::Leaky
88*6777b538SAndroid Build Coastguard Worker sentence_break_cache = LAZY_INSTANCE_INITIALIZER;
89*6777b538SAndroid Build Coastguard Worker static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_LINE>>::Leaky
90*6777b538SAndroid Build Coastguard Worker line_break_cache = LAZY_INSTANCE_INITIALIZER;
91*6777b538SAndroid Build Coastguard Worker
92*6777b538SAndroid Build Coastguard Worker } // namespace
93*6777b538SAndroid Build Coastguard Worker
operator ()(UBreakIterator * ptr)94*6777b538SAndroid Build Coastguard Worker void UBreakIteratorDeleter::operator()(UBreakIterator* ptr) {
95*6777b538SAndroid Build Coastguard Worker if (ptr) {
96*6777b538SAndroid Build Coastguard Worker ubrk_close(ptr);
97*6777b538SAndroid Build Coastguard Worker }
98*6777b538SAndroid Build Coastguard Worker }
99*6777b538SAndroid Build Coastguard Worker
BreakIterator(std::u16string_view str,BreakType break_type)100*6777b538SAndroid Build Coastguard Worker BreakIterator::BreakIterator(std::u16string_view str, BreakType break_type)
101*6777b538SAndroid Build Coastguard Worker : string_(str), break_type_(break_type) {}
102*6777b538SAndroid Build Coastguard Worker
BreakIterator(std::u16string_view str,const std::u16string & rules)103*6777b538SAndroid Build Coastguard Worker BreakIterator::BreakIterator(std::u16string_view str,
104*6777b538SAndroid Build Coastguard Worker const std::u16string& rules)
105*6777b538SAndroid Build Coastguard Worker : string_(str), rules_(rules), break_type_(RULE_BASED) {}
106*6777b538SAndroid Build Coastguard Worker
~BreakIterator()107*6777b538SAndroid Build Coastguard Worker BreakIterator::~BreakIterator() {
108*6777b538SAndroid Build Coastguard Worker switch (break_type_) {
109*6777b538SAndroid Build Coastguard Worker case RULE_BASED:
110*6777b538SAndroid Build Coastguard Worker return;
111*6777b538SAndroid Build Coastguard Worker case BREAK_CHARACTER:
112*6777b538SAndroid Build Coastguard Worker char_break_cache.Pointer()->Return(std::move(iter_));
113*6777b538SAndroid Build Coastguard Worker return;
114*6777b538SAndroid Build Coastguard Worker case BREAK_WORD:
115*6777b538SAndroid Build Coastguard Worker word_break_cache.Pointer()->Return(std::move(iter_));
116*6777b538SAndroid Build Coastguard Worker return;
117*6777b538SAndroid Build Coastguard Worker case BREAK_SENTENCE:
118*6777b538SAndroid Build Coastguard Worker sentence_break_cache.Pointer()->Return(std::move(iter_));
119*6777b538SAndroid Build Coastguard Worker return;
120*6777b538SAndroid Build Coastguard Worker case BREAK_LINE:
121*6777b538SAndroid Build Coastguard Worker case BREAK_NEWLINE:
122*6777b538SAndroid Build Coastguard Worker line_break_cache.Pointer()->Return(std::move(iter_));
123*6777b538SAndroid Build Coastguard Worker return;
124*6777b538SAndroid Build Coastguard Worker }
125*6777b538SAndroid Build Coastguard Worker }
126*6777b538SAndroid Build Coastguard Worker
Init()127*6777b538SAndroid Build Coastguard Worker bool BreakIterator::Init() {
128*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
129*6777b538SAndroid Build Coastguard Worker UParseError parse_error;
130*6777b538SAndroid Build Coastguard Worker switch (break_type_) {
131*6777b538SAndroid Build Coastguard Worker case BREAK_CHARACTER:
132*6777b538SAndroid Build Coastguard Worker iter_ = char_break_cache.Pointer()->Lease(status);
133*6777b538SAndroid Build Coastguard Worker break;
134*6777b538SAndroid Build Coastguard Worker case BREAK_WORD:
135*6777b538SAndroid Build Coastguard Worker iter_ = word_break_cache.Pointer()->Lease(status);
136*6777b538SAndroid Build Coastguard Worker break;
137*6777b538SAndroid Build Coastguard Worker case BREAK_SENTENCE:
138*6777b538SAndroid Build Coastguard Worker iter_ = sentence_break_cache.Pointer()->Lease(status);
139*6777b538SAndroid Build Coastguard Worker break;
140*6777b538SAndroid Build Coastguard Worker case BREAK_LINE:
141*6777b538SAndroid Build Coastguard Worker case BREAK_NEWLINE:
142*6777b538SAndroid Build Coastguard Worker iter_ = line_break_cache.Pointer()->Lease(status);
143*6777b538SAndroid Build Coastguard Worker break;
144*6777b538SAndroid Build Coastguard Worker case RULE_BASED:
145*6777b538SAndroid Build Coastguard Worker iter_ = UBreakIteratorPtr(
146*6777b538SAndroid Build Coastguard Worker ubrk_openRules(rules_.c_str(), static_cast<int32_t>(rules_.length()),
147*6777b538SAndroid Build Coastguard Worker nullptr, 0, &parse_error, &status));
148*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
149*6777b538SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
150*6777b538SAndroid Build Coastguard Worker << parse_error.line << ", offset " << parse_error.offset;
151*6777b538SAndroid Build Coastguard Worker }
152*6777b538SAndroid Build Coastguard Worker break;
153*6777b538SAndroid Build Coastguard Worker }
154*6777b538SAndroid Build Coastguard Worker
155*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status) || iter_ == nullptr) {
156*6777b538SAndroid Build Coastguard Worker return false;
157*6777b538SAndroid Build Coastguard Worker }
158*6777b538SAndroid Build Coastguard Worker
159*6777b538SAndroid Build Coastguard Worker if (string_.data() != nullptr) {
160*6777b538SAndroid Build Coastguard Worker ubrk_setText(iter_.get(), string_.data(),
161*6777b538SAndroid Build Coastguard Worker static_cast<int32_t>(string_.size()), &status);
162*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
163*6777b538SAndroid Build Coastguard Worker return false;
164*6777b538SAndroid Build Coastguard Worker }
165*6777b538SAndroid Build Coastguard Worker }
166*6777b538SAndroid Build Coastguard Worker
167*6777b538SAndroid Build Coastguard Worker // Move the iterator to the beginning of the string.
168*6777b538SAndroid Build Coastguard Worker ubrk_first(iter_.get());
169*6777b538SAndroid Build Coastguard Worker return true;
170*6777b538SAndroid Build Coastguard Worker }
171*6777b538SAndroid Build Coastguard Worker
Advance()172*6777b538SAndroid Build Coastguard Worker bool BreakIterator::Advance() {
173*6777b538SAndroid Build Coastguard Worker int32_t pos;
174*6777b538SAndroid Build Coastguard Worker int32_t status;
175*6777b538SAndroid Build Coastguard Worker prev_ = pos_;
176*6777b538SAndroid Build Coastguard Worker switch (break_type_) {
177*6777b538SAndroid Build Coastguard Worker case BREAK_CHARACTER:
178*6777b538SAndroid Build Coastguard Worker case BREAK_WORD:
179*6777b538SAndroid Build Coastguard Worker case BREAK_LINE:
180*6777b538SAndroid Build Coastguard Worker case BREAK_SENTENCE:
181*6777b538SAndroid Build Coastguard Worker case RULE_BASED:
182*6777b538SAndroid Build Coastguard Worker pos = ubrk_next(iter_.get());
183*6777b538SAndroid Build Coastguard Worker if (pos == UBRK_DONE) {
184*6777b538SAndroid Build Coastguard Worker pos_ = npos;
185*6777b538SAndroid Build Coastguard Worker return false;
186*6777b538SAndroid Build Coastguard Worker }
187*6777b538SAndroid Build Coastguard Worker pos_ = static_cast<size_t>(pos);
188*6777b538SAndroid Build Coastguard Worker return true;
189*6777b538SAndroid Build Coastguard Worker case BREAK_NEWLINE:
190*6777b538SAndroid Build Coastguard Worker do {
191*6777b538SAndroid Build Coastguard Worker pos = ubrk_next(iter_.get());
192*6777b538SAndroid Build Coastguard Worker if (pos == UBRK_DONE)
193*6777b538SAndroid Build Coastguard Worker break;
194*6777b538SAndroid Build Coastguard Worker pos_ = static_cast<size_t>(pos);
195*6777b538SAndroid Build Coastguard Worker status = ubrk_getRuleStatus(iter_.get());
196*6777b538SAndroid Build Coastguard Worker } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
197*6777b538SAndroid Build Coastguard Worker if (pos == UBRK_DONE && prev_ == pos_) {
198*6777b538SAndroid Build Coastguard Worker pos_ = npos;
199*6777b538SAndroid Build Coastguard Worker return false;
200*6777b538SAndroid Build Coastguard Worker }
201*6777b538SAndroid Build Coastguard Worker return true;
202*6777b538SAndroid Build Coastguard Worker }
203*6777b538SAndroid Build Coastguard Worker }
204*6777b538SAndroid Build Coastguard Worker
SetText(std::u16string_view text)205*6777b538SAndroid Build Coastguard Worker bool BreakIterator::SetText(std::u16string_view text) {
206*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
207*6777b538SAndroid Build Coastguard Worker ubrk_setText(iter_.get(), text.data(), text.length(), &status);
208*6777b538SAndroid Build Coastguard Worker pos_ = 0; // implicit when ubrk_setText is done
209*6777b538SAndroid Build Coastguard Worker prev_ = npos;
210*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
211*6777b538SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_setText failed";
212*6777b538SAndroid Build Coastguard Worker return false;
213*6777b538SAndroid Build Coastguard Worker }
214*6777b538SAndroid Build Coastguard Worker string_ = text;
215*6777b538SAndroid Build Coastguard Worker return true;
216*6777b538SAndroid Build Coastguard Worker }
217*6777b538SAndroid Build Coastguard Worker
IsWord() const218*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsWord() const {
219*6777b538SAndroid Build Coastguard Worker return GetWordBreakStatus() == IS_WORD_BREAK;
220*6777b538SAndroid Build Coastguard Worker }
221*6777b538SAndroid Build Coastguard Worker
GetWordBreakStatus() const222*6777b538SAndroid Build Coastguard Worker BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
223*6777b538SAndroid Build Coastguard Worker int32_t status = ubrk_getRuleStatus(iter_.get());
224*6777b538SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
225*6777b538SAndroid Build Coastguard Worker return IS_LINE_OR_CHAR_BREAK;
226*6777b538SAndroid Build Coastguard Worker // In ICU 60, trying to advance past the end of the text does not change
227*6777b538SAndroid Build Coastguard Worker // |status| so that |pos_| has to be checked as well as |status|.
228*6777b538SAndroid Build Coastguard Worker // See http://bugs.icu-project.org/trac/ticket/13447 .
229*6777b538SAndroid Build Coastguard Worker return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
230*6777b538SAndroid Build Coastguard Worker : IS_WORD_BREAK;
231*6777b538SAndroid Build Coastguard Worker }
232*6777b538SAndroid Build Coastguard Worker
IsEndOfWord(size_t position) const233*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsEndOfWord(size_t position) const {
234*6777b538SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
235*6777b538SAndroid Build Coastguard Worker return false;
236*6777b538SAndroid Build Coastguard Worker
237*6777b538SAndroid Build Coastguard Worker UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
238*6777b538SAndroid Build Coastguard Worker int32_t status = ubrk_getRuleStatus(iter_.get());
239*6777b538SAndroid Build Coastguard Worker return (!!boundary && status != UBRK_WORD_NONE);
240*6777b538SAndroid Build Coastguard Worker }
241*6777b538SAndroid Build Coastguard Worker
IsStartOfWord(size_t position) const242*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsStartOfWord(size_t position) const {
243*6777b538SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
244*6777b538SAndroid Build Coastguard Worker return false;
245*6777b538SAndroid Build Coastguard Worker
246*6777b538SAndroid Build Coastguard Worker UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
247*6777b538SAndroid Build Coastguard Worker ubrk_next(iter_.get());
248*6777b538SAndroid Build Coastguard Worker int32_t next_status = ubrk_getRuleStatus(iter_.get());
249*6777b538SAndroid Build Coastguard Worker return (!!boundary && next_status != UBRK_WORD_NONE);
250*6777b538SAndroid Build Coastguard Worker }
251*6777b538SAndroid Build Coastguard Worker
IsSentenceBoundary(size_t position) const252*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsSentenceBoundary(size_t position) const {
253*6777b538SAndroid Build Coastguard Worker if (break_type_ != BREAK_SENTENCE && break_type_ != RULE_BASED)
254*6777b538SAndroid Build Coastguard Worker return false;
255*6777b538SAndroid Build Coastguard Worker
256*6777b538SAndroid Build Coastguard Worker return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
257*6777b538SAndroid Build Coastguard Worker }
258*6777b538SAndroid Build Coastguard Worker
IsGraphemeBoundary(size_t position) const259*6777b538SAndroid Build Coastguard Worker bool BreakIterator::IsGraphemeBoundary(size_t position) const {
260*6777b538SAndroid Build Coastguard Worker if (break_type_ != BREAK_CHARACTER)
261*6777b538SAndroid Build Coastguard Worker return false;
262*6777b538SAndroid Build Coastguard Worker
263*6777b538SAndroid Build Coastguard Worker return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
264*6777b538SAndroid Build Coastguard Worker }
265*6777b538SAndroid Build Coastguard Worker
GetString() const266*6777b538SAndroid Build Coastguard Worker std::u16string BreakIterator::GetString() const {
267*6777b538SAndroid Build Coastguard Worker return std::u16string(GetStringPiece());
268*6777b538SAndroid Build Coastguard Worker }
269*6777b538SAndroid Build Coastguard Worker
GetStringPiece() const270*6777b538SAndroid Build Coastguard Worker std::u16string_view BreakIterator::GetStringPiece() const {
271*6777b538SAndroid Build Coastguard Worker DCHECK(prev_ != npos && pos_ != npos);
272*6777b538SAndroid Build Coastguard Worker return string_.substr(prev_, pos_ - prev_);
273*6777b538SAndroid Build Coastguard Worker }
274*6777b538SAndroid Build Coastguard Worker
275*6777b538SAndroid Build Coastguard Worker } // namespace i18n
276*6777b538SAndroid Build Coastguard Worker } // namespace base
277