1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "WordBreaker.h"
18
19 #include <unicode/ubrk.h>
20 #include <unicode/uchar.h>
21 #include <unicode/utf16.h>
22
23 #include <list>
24 #include <map>
25
26 #include "FeatureFlags.h"
27 #include "Locale.h"
28 #include "MinikinInternal.h"
29 #include "minikin/Emoji.h"
30 #include "minikin/Hyphenator.h"
31
32 namespace minikin {
33
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static std::unique_ptr<BreakIterator> createNewIterator(const Locale& locale,
36 LineBreakStyle lbStyle,
37 LineBreakWordStyle lbWordStyle) {
38 MINIKIN_ASSERT(lbStyle != LineBreakStyle::Auto,
39 "LineBreakStyle::Auto must be resolved beforehand.");
40 MINIKIN_ASSERT(lbWordStyle != LineBreakWordStyle::Auto,
41 "LineBreakWordStyle::Auto must be resolved beforehand.");
42
43 // TODO: handle failure status
44 if (lbStyle == LineBreakStyle::NoBreak) {
45 return std::make_unique<NoBreakBreakIterator>();
46 } else {
47 UErrorCode status = U_ZERO_ERROR;
48 char localeID[ULOC_FULLNAME_CAPACITY] = {};
49 uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(),
50 localeID, ULOC_FULLNAME_CAPACITY, nullptr, &status);
51 IcuUbrkUniquePtr icuBrkPtr(
52 ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status));
53 return std::make_unique<ICUBreakIterator>(std::move(icuBrkPtr));
54 }
55 }
56 } // namespace
57
setText(UText * text,size_t)58 void ICUBreakIterator::setText(UText* text, size_t) {
59 UErrorCode status = U_ZERO_ERROR;
60 ubrk_setUText(mBreaker.get(), text, &status);
61 }
62
isBoundary(int32_t i)63 bool ICUBreakIterator::isBoundary(int32_t i) {
64 return ubrk_isBoundary(mBreaker.get(), i);
65 }
66
following(size_t i)67 int32_t ICUBreakIterator::following(size_t i) {
68 return ubrk_following(mBreaker.get(), i);
69 }
70
next()71 int32_t ICUBreakIterator::next() {
72 return ubrk_next(mBreaker.get());
73 }
74
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)75 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
76 LineBreakStyle lbStyle,
77 LineBreakWordStyle lbWordStyle) {
78 if (lbStyle == LineBreakStyle::Auto) {
79 lbStyle = locale.supportsScript('J', 'p', 'a', 'n') ? LineBreakStyle::Strict
80 : LineBreakStyle::None;
81 }
82
83 const uint64_t id = locale.getIdentifier();
84 std::lock_guard<std::mutex> lock(mMutex);
85 for (auto i = mPool.begin(); i != mPool.end(); i++) {
86 if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
87 Slot slot = std::move(*i);
88 mPool.erase(i);
89 return slot;
90 }
91 }
92
93 // Not found in pool. Create new one.
94 return {id, lbStyle, lbWordStyle, createNewIterator(locale, lbStyle, lbWordStyle)};
95 }
96
release(ICULineBreakerPool::Slot && slot)97 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
98 if (slot.breaker.get() == nullptr) {
99 return; // Already released slot. Do nothing.
100 }
101 std::lock_guard<std::mutex> lock(mMutex);
102 if (mPool.size() >= MAX_POOL_SIZE) {
103 // Pool is full. Move to local variable, so that the given slot will be released when the
104 // variable leaves the scope.
105 Slot localSlot = std::move(slot);
106 return;
107 }
108 mPool.push_front(std::move(slot));
109 }
110
WordBreaker()111 WordBreaker::WordBreaker()
112 : mPool(&ICULineBreakerPoolImpl::getInstance()), mUText(nullptr, &utext_close) {}
113
WordBreaker(ICULineBreakerPool * pool)114 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}
115
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)116 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
117 LineBreakWordStyle lbWordStyle, size_t from) {
118 if (!mUText) {
119 return mCurrent;
120 }
121 mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
122 MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
123 // TODO: handle failure status
124 mIcuBreaker.breaker->setText(mUText.get(), mTextSize);
125 if (mInEmailOrUrl) {
126 // Note:
127 // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
128 // The email/URL detection doesn't support following() functionality, so that we can't
129 // restart from the specific position. This means following() can not be supported in
130 // general, but keeping old email/URL context works for LineBreaker since it just wants to
131 // re-calculate the next break point with the new locale.
132 } else {
133 mCurrent = mLast = mScanOffset = from;
134 next();
135 }
136 return mCurrent;
137 }
138
setText(const uint16_t * data,size_t size)139 void WordBreaker::setText(const uint16_t* data, size_t size) {
140 mText = data;
141 mTextSize = size;
142 mLast = 0;
143 mCurrent = 0;
144 mScanOffset = 0;
145 mInEmailOrUrl = false;
146 UErrorCode status = U_ZERO_ERROR;
147 mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
148 }
149
current() const150 ssize_t WordBreaker::current() const {
151 return mCurrent;
152 }
153
154 /**
155 * Determine whether a line break at position i within the buffer buf is valid. This
156 * represents customization beyond the ICU behavior, because plain ICU provides some
157 * line break opportunities that we don't want.
158 **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)159 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
160 const size_t position = static_cast<size_t>(i);
161 if (i == UBRK_DONE || position == bufEnd) {
162 // If the iterator reaches the end, treat as break.
163 return true;
164 }
165 uint32_t codePoint;
166 size_t prev_offset = position;
167 U16_PREV(buf, 0, prev_offset, codePoint);
168 // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
169 if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
170 return false;
171 }
172 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
173 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
174 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
175 // where no line break could be imagined, since the Myanmar virama is a pure stacker.
176 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
177 return false;
178 }
179
180 uint32_t next_codepoint;
181 size_t next_offset = position;
182 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
183
184 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
185 // emoji data than ICU does.
186 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
187 return false;
188 }
189
190 // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
191 if (isEmojiModifier(next_codepoint)) {
192 if (codePoint == 0xFE0F && prev_offset > 0) {
193 // skip over emoji variation selector
194 U16_PREV(buf, 0, prev_offset, codePoint);
195 }
196 if (isEmojiBase(codePoint)) {
197 return false;
198 }
199 }
200 return true;
201 }
202
203 // Customized iteratorNext that takes care of both resets and our modifications
204 // to ICU's behavior.
iteratorNext()205 int32_t WordBreaker::iteratorNext() {
206 int32_t result = mIcuBreaker.breaker->following(mCurrent);
207 while (!isValidBreak(mText, mTextSize, result)) {
208 result = mIcuBreaker.breaker->next();
209 }
210 return result;
211 }
212
// Characters after which a break is allowed inside URLs and email addresses, as recommended by
// the Chicago Manual of Style: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
217
// Characters before which a break is allowed inside URLs and email addresses, as recommended by
// the Chicago Manual of Style: '~', '.', ',', '-', '_', '?', '#', '%', '=' and '&'.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
223
// States of the scanner in WordBreaker::detectEmailOrUrl(). The values of the three colon
// states are consecutive on purpose: seeing a '/' advances to the next one.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // saw '@': treated as an email address
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://": treated as a URL
};
231
detectEmailOrUrl()232 void WordBreaker::detectEmailOrUrl() {
233 if (mIcuBreaker.lbStyle == LineBreakStyle::NoBreak) {
234 mInEmailOrUrl = false;
235 return;
236 }
237 // scan forward from current ICU position for email address or URL
238 if (mLast >= mScanOffset) {
239 ScanState state = START;
240 size_t i;
241 for (i = mLast; i < mTextSize; i++) {
242 uint16_t c = mText[i];
243 // scan only ASCII characters, stop at space
244 if (!(' ' < c && c <= 0x007E)) {
245 break;
246 }
247 if (state == START && c == '@') {
248 state = SAW_AT;
249 } else if (state == START && c == ':') {
250 state = SAW_COLON;
251 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
252 if (c == '/') {
253 state = static_cast<ScanState>((int)state + 1); // next state adds a slash
254 } else {
255 state = START;
256 }
257 }
258 }
259 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
260 if (!mIcuBreaker.breaker->isBoundary(i)) {
261 // If there are combining marks or such at the end of the URL or the email address,
262 // consider them a part of the URL or the email, and skip to the next actual
263 // boundary.
264 i = mIcuBreaker.breaker->following(i);
265 }
266 mInEmailOrUrl = true;
267 } else {
268 mInEmailOrUrl = false;
269 }
270 mScanOffset = i;
271 }
272 }
273
findNextBreakInEmailOrUrl()274 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
275 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
276 uint16_t lastChar = mText[mLast];
277 ssize_t i;
278 for (i = mLast + 1; i < mScanOffset; i++) {
279 if (breakAfter(lastChar)) {
280 break;
281 }
282 // break after double slash
283 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
284 break;
285 }
286 const uint16_t thisChar = mText[i];
287 // never break after hyphen
288 if (lastChar != '-') {
289 if (breakBefore(thisChar)) {
290 break;
291 }
292 // break before single slash
293 if (thisChar == '/' && lastChar != '/' &&
294 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
295 break;
296 }
297 }
298 lastChar = thisChar;
299 }
300 return i;
301 }
302
next()303 ssize_t WordBreaker::next() {
304 mLast = mCurrent;
305
306 detectEmailOrUrl();
307 if (mInEmailOrUrl) {
308 mCurrent = findNextBreakInEmailOrUrl();
309 } else { // Business as usual
310 mCurrent = (ssize_t)iteratorNext();
311 }
312 return mCurrent;
313 }
314
wordStart() const315 ssize_t WordBreaker::wordStart() const {
316 if (mInEmailOrUrl) {
317 return mLast;
318 }
319 ssize_t result = mLast;
320 while (result < mCurrent) {
321 UChar32 c;
322 ssize_t ix = result;
323 U16_NEXT(mText, ix, mCurrent, c);
324 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
325 // strip leading punctuation, defined as OP and QU line breaking classes,
326 // see UAX #14
327 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
328 break;
329 }
330 result = ix;
331 }
332 return result;
333 }
334
wordEnd() const335 ssize_t WordBreaker::wordEnd() const {
336 if (mInEmailOrUrl) {
337 return mLast;
338 }
339 ssize_t result = mCurrent;
340 while (result > mLast) {
341 UChar32 c;
342 ssize_t ix = result;
343 U16_PREV(mText, mLast, ix, c);
344 const int32_t gc_mask = U_GET_GC_MASK(c);
345 // strip trailing spaces, punctuation and control characters
346 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
347 break;
348 }
349 result = ix;
350 }
351 return result;
352 }
353
breakBadness() const354 int WordBreaker::breakBadness() const {
355 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
356 }
357
finish()358 void WordBreaker::finish() {
359 mText = nullptr;
360 mUText.reset();
361 mPool->release(std::move(mIcuBreaker));
362 }
363
364 } // namespace minikin
365