1*635a8641SAndroid Build Coastguard Worker // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2*635a8641SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*635a8641SAndroid Build Coastguard Worker // found in the LICENSE file.
4*635a8641SAndroid Build Coastguard Worker
5*635a8641SAndroid Build Coastguard Worker #include "base/i18n/break_iterator.h"
6*635a8641SAndroid Build Coastguard Worker
7*635a8641SAndroid Build Coastguard Worker #include <stdint.h>
8*635a8641SAndroid Build Coastguard Worker
9*635a8641SAndroid Build Coastguard Worker #include "base/logging.h"
10*635a8641SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ubrk.h"
11*635a8641SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/uchar.h"
12*635a8641SAndroid Build Coastguard Worker #include "third_party/icu/source/common/unicode/ustring.h"
13*635a8641SAndroid Build Coastguard Worker
14*635a8641SAndroid Build Coastguard Worker namespace base {
15*635a8641SAndroid Build Coastguard Worker namespace i18n {
16*635a8641SAndroid Build Coastguard Worker
// All-ones size_t value. This file stores it in |prev_| / |pos_| to mean
// "no position" (before the first Advance() and after the last boundary).
const size_t npos = ~static_cast<size_t>(0);
18*635a8641SAndroid Build Coastguard Worker
BreakIterator(const StringPiece16 & str,BreakType break_type)19*635a8641SAndroid Build Coastguard Worker BreakIterator::BreakIterator(const StringPiece16& str, BreakType break_type)
20*635a8641SAndroid Build Coastguard Worker : iter_(nullptr),
21*635a8641SAndroid Build Coastguard Worker string_(str),
22*635a8641SAndroid Build Coastguard Worker break_type_(break_type),
23*635a8641SAndroid Build Coastguard Worker prev_(npos),
24*635a8641SAndroid Build Coastguard Worker pos_(0) {}
25*635a8641SAndroid Build Coastguard Worker
BreakIterator(const StringPiece16 & str,const string16 & rules)26*635a8641SAndroid Build Coastguard Worker BreakIterator::BreakIterator(const StringPiece16& str, const string16& rules)
27*635a8641SAndroid Build Coastguard Worker : iter_(nullptr),
28*635a8641SAndroid Build Coastguard Worker string_(str),
29*635a8641SAndroid Build Coastguard Worker rules_(rules),
30*635a8641SAndroid Build Coastguard Worker break_type_(RULE_BASED),
31*635a8641SAndroid Build Coastguard Worker prev_(npos),
32*635a8641SAndroid Build Coastguard Worker pos_(0) {}
33*635a8641SAndroid Build Coastguard Worker
~BreakIterator()34*635a8641SAndroid Build Coastguard Worker BreakIterator::~BreakIterator() {
35*635a8641SAndroid Build Coastguard Worker if (iter_)
36*635a8641SAndroid Build Coastguard Worker ubrk_close(static_cast<UBreakIterator*>(iter_));
37*635a8641SAndroid Build Coastguard Worker }
38*635a8641SAndroid Build Coastguard Worker
Init()39*635a8641SAndroid Build Coastguard Worker bool BreakIterator::Init() {
40*635a8641SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
41*635a8641SAndroid Build Coastguard Worker UParseError parse_error;
42*635a8641SAndroid Build Coastguard Worker UBreakIteratorType break_type;
43*635a8641SAndroid Build Coastguard Worker switch (break_type_) {
44*635a8641SAndroid Build Coastguard Worker case BREAK_CHARACTER:
45*635a8641SAndroid Build Coastguard Worker break_type = UBRK_CHARACTER;
46*635a8641SAndroid Build Coastguard Worker break;
47*635a8641SAndroid Build Coastguard Worker case BREAK_WORD:
48*635a8641SAndroid Build Coastguard Worker break_type = UBRK_WORD;
49*635a8641SAndroid Build Coastguard Worker break;
50*635a8641SAndroid Build Coastguard Worker case BREAK_LINE:
51*635a8641SAndroid Build Coastguard Worker case BREAK_NEWLINE:
52*635a8641SAndroid Build Coastguard Worker case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
53*635a8641SAndroid Build Coastguard Worker break_type = UBRK_LINE;
54*635a8641SAndroid Build Coastguard Worker break;
55*635a8641SAndroid Build Coastguard Worker default:
56*635a8641SAndroid Build Coastguard Worker NOTREACHED() << "invalid break_type_";
57*635a8641SAndroid Build Coastguard Worker return false;
58*635a8641SAndroid Build Coastguard Worker }
59*635a8641SAndroid Build Coastguard Worker if (break_type_ == RULE_BASED) {
60*635a8641SAndroid Build Coastguard Worker iter_ = ubrk_openRules(rules_.c_str(),
61*635a8641SAndroid Build Coastguard Worker static_cast<int32_t>(rules_.length()),
62*635a8641SAndroid Build Coastguard Worker string_.data(),
63*635a8641SAndroid Build Coastguard Worker static_cast<int32_t>(string_.size()),
64*635a8641SAndroid Build Coastguard Worker &parse_error,
65*635a8641SAndroid Build Coastguard Worker &status);
66*635a8641SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
67*635a8641SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
68*635a8641SAndroid Build Coastguard Worker << parse_error.line << ", offset " << parse_error.offset;
69*635a8641SAndroid Build Coastguard Worker }
70*635a8641SAndroid Build Coastguard Worker } else {
71*635a8641SAndroid Build Coastguard Worker iter_ = ubrk_open(break_type, nullptr, string_.data(),
72*635a8641SAndroid Build Coastguard Worker static_cast<int32_t>(string_.size()), &status);
73*635a8641SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
74*635a8641SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_open failed for type " << break_type
75*635a8641SAndroid Build Coastguard Worker << " with error " << status;
76*635a8641SAndroid Build Coastguard Worker }
77*635a8641SAndroid Build Coastguard Worker }
78*635a8641SAndroid Build Coastguard Worker
79*635a8641SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
80*635a8641SAndroid Build Coastguard Worker return false;
81*635a8641SAndroid Build Coastguard Worker }
82*635a8641SAndroid Build Coastguard Worker
83*635a8641SAndroid Build Coastguard Worker // Move the iterator to the beginning of the string.
84*635a8641SAndroid Build Coastguard Worker ubrk_first(static_cast<UBreakIterator*>(iter_));
85*635a8641SAndroid Build Coastguard Worker return true;
86*635a8641SAndroid Build Coastguard Worker }
87*635a8641SAndroid Build Coastguard Worker
Advance()88*635a8641SAndroid Build Coastguard Worker bool BreakIterator::Advance() {
89*635a8641SAndroid Build Coastguard Worker int32_t pos;
90*635a8641SAndroid Build Coastguard Worker int32_t status;
91*635a8641SAndroid Build Coastguard Worker prev_ = pos_;
92*635a8641SAndroid Build Coastguard Worker switch (break_type_) {
93*635a8641SAndroid Build Coastguard Worker case BREAK_CHARACTER:
94*635a8641SAndroid Build Coastguard Worker case BREAK_WORD:
95*635a8641SAndroid Build Coastguard Worker case BREAK_LINE:
96*635a8641SAndroid Build Coastguard Worker case RULE_BASED:
97*635a8641SAndroid Build Coastguard Worker pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
98*635a8641SAndroid Build Coastguard Worker if (pos == UBRK_DONE) {
99*635a8641SAndroid Build Coastguard Worker pos_ = npos;
100*635a8641SAndroid Build Coastguard Worker return false;
101*635a8641SAndroid Build Coastguard Worker }
102*635a8641SAndroid Build Coastguard Worker pos_ = static_cast<size_t>(pos);
103*635a8641SAndroid Build Coastguard Worker return true;
104*635a8641SAndroid Build Coastguard Worker case BREAK_NEWLINE:
105*635a8641SAndroid Build Coastguard Worker do {
106*635a8641SAndroid Build Coastguard Worker pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
107*635a8641SAndroid Build Coastguard Worker if (pos == UBRK_DONE)
108*635a8641SAndroid Build Coastguard Worker break;
109*635a8641SAndroid Build Coastguard Worker pos_ = static_cast<size_t>(pos);
110*635a8641SAndroid Build Coastguard Worker status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
111*635a8641SAndroid Build Coastguard Worker } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
112*635a8641SAndroid Build Coastguard Worker if (pos == UBRK_DONE && prev_ == pos_) {
113*635a8641SAndroid Build Coastguard Worker pos_ = npos;
114*635a8641SAndroid Build Coastguard Worker return false;
115*635a8641SAndroid Build Coastguard Worker }
116*635a8641SAndroid Build Coastguard Worker return true;
117*635a8641SAndroid Build Coastguard Worker default:
118*635a8641SAndroid Build Coastguard Worker NOTREACHED() << "invalid break_type_";
119*635a8641SAndroid Build Coastguard Worker return false;
120*635a8641SAndroid Build Coastguard Worker }
121*635a8641SAndroid Build Coastguard Worker }
122*635a8641SAndroid Build Coastguard Worker
SetText(const base::char16 * text,const size_t length)123*635a8641SAndroid Build Coastguard Worker bool BreakIterator::SetText(const base::char16* text, const size_t length) {
124*635a8641SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
125*635a8641SAndroid Build Coastguard Worker ubrk_setText(static_cast<UBreakIterator*>(iter_),
126*635a8641SAndroid Build Coastguard Worker text, length, &status);
127*635a8641SAndroid Build Coastguard Worker pos_ = 0; // implicit when ubrk_setText is done
128*635a8641SAndroid Build Coastguard Worker prev_ = npos;
129*635a8641SAndroid Build Coastguard Worker if (U_FAILURE(status)) {
130*635a8641SAndroid Build Coastguard Worker NOTREACHED() << "ubrk_setText failed";
131*635a8641SAndroid Build Coastguard Worker return false;
132*635a8641SAndroid Build Coastguard Worker }
133*635a8641SAndroid Build Coastguard Worker string_ = StringPiece16(text, length);
134*635a8641SAndroid Build Coastguard Worker return true;
135*635a8641SAndroid Build Coastguard Worker }
136*635a8641SAndroid Build Coastguard Worker
IsWord() const137*635a8641SAndroid Build Coastguard Worker bool BreakIterator::IsWord() const {
138*635a8641SAndroid Build Coastguard Worker return GetWordBreakStatus() == IS_WORD_BREAK;
139*635a8641SAndroid Build Coastguard Worker }
140*635a8641SAndroid Build Coastguard Worker
GetWordBreakStatus() const141*635a8641SAndroid Build Coastguard Worker BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
142*635a8641SAndroid Build Coastguard Worker int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
143*635a8641SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
144*635a8641SAndroid Build Coastguard Worker return IS_LINE_OR_CHAR_BREAK;
145*635a8641SAndroid Build Coastguard Worker // In ICU 60, trying to advance past the end of the text does not change
146*635a8641SAndroid Build Coastguard Worker // |status| so that |pos_| has to be checked as well as |status|.
147*635a8641SAndroid Build Coastguard Worker // See http://bugs.icu-project.org/trac/ticket/13447 .
148*635a8641SAndroid Build Coastguard Worker return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
149*635a8641SAndroid Build Coastguard Worker : IS_WORD_BREAK;
150*635a8641SAndroid Build Coastguard Worker }
151*635a8641SAndroid Build Coastguard Worker
IsEndOfWord(size_t position) const152*635a8641SAndroid Build Coastguard Worker bool BreakIterator::IsEndOfWord(size_t position) const {
153*635a8641SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
154*635a8641SAndroid Build Coastguard Worker return false;
155*635a8641SAndroid Build Coastguard Worker
156*635a8641SAndroid Build Coastguard Worker UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
157*635a8641SAndroid Build Coastguard Worker UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
158*635a8641SAndroid Build Coastguard Worker int32_t status = ubrk_getRuleStatus(iter);
159*635a8641SAndroid Build Coastguard Worker return (!!boundary && status != UBRK_WORD_NONE);
160*635a8641SAndroid Build Coastguard Worker }
161*635a8641SAndroid Build Coastguard Worker
IsStartOfWord(size_t position) const162*635a8641SAndroid Build Coastguard Worker bool BreakIterator::IsStartOfWord(size_t position) const {
163*635a8641SAndroid Build Coastguard Worker if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
164*635a8641SAndroid Build Coastguard Worker return false;
165*635a8641SAndroid Build Coastguard Worker
166*635a8641SAndroid Build Coastguard Worker UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
167*635a8641SAndroid Build Coastguard Worker UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
168*635a8641SAndroid Build Coastguard Worker ubrk_next(iter);
169*635a8641SAndroid Build Coastguard Worker int32_t next_status = ubrk_getRuleStatus(iter);
170*635a8641SAndroid Build Coastguard Worker return (!!boundary && next_status != UBRK_WORD_NONE);
171*635a8641SAndroid Build Coastguard Worker }
172*635a8641SAndroid Build Coastguard Worker
IsGraphemeBoundary(size_t position) const173*635a8641SAndroid Build Coastguard Worker bool BreakIterator::IsGraphemeBoundary(size_t position) const {
174*635a8641SAndroid Build Coastguard Worker if (break_type_ != BREAK_CHARACTER)
175*635a8641SAndroid Build Coastguard Worker return false;
176*635a8641SAndroid Build Coastguard Worker
177*635a8641SAndroid Build Coastguard Worker UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
178*635a8641SAndroid Build Coastguard Worker return !!ubrk_isBoundary(iter, static_cast<int32_t>(position));
179*635a8641SAndroid Build Coastguard Worker }
180*635a8641SAndroid Build Coastguard Worker
GetString() const181*635a8641SAndroid Build Coastguard Worker string16 BreakIterator::GetString() const {
182*635a8641SAndroid Build Coastguard Worker return GetStringPiece().as_string();
183*635a8641SAndroid Build Coastguard Worker }
184*635a8641SAndroid Build Coastguard Worker
GetStringPiece() const185*635a8641SAndroid Build Coastguard Worker StringPiece16 BreakIterator::GetStringPiece() const {
186*635a8641SAndroid Build Coastguard Worker DCHECK(prev_ != npos && pos_ != npos);
187*635a8641SAndroid Build Coastguard Worker return string_.substr(prev_, pos_ - prev_);
188*635a8641SAndroid Build Coastguard Worker }
189*635a8641SAndroid Build Coastguard Worker
190*635a8641SAndroid Build Coastguard Worker } // namespace i18n
191*635a8641SAndroid Build Coastguard Worker } // namespace base
192