xref: /aosp_15_r20/frameworks/minikin/libs/minikin/WordBreaker.cpp (revision 834a2baab5fdfc28e9a428ee87c7ea8f6a06a53d)
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <unicode/ubrk.h>
20 #include <unicode/uchar.h>
21 #include <unicode/utf16.h>
22 
23 #include <list>
24 #include <map>
25 
26 #include "FeatureFlags.h"
27 #include "Locale.h"
28 #include "MinikinInternal.h"
29 #include "minikin/Emoji.h"
30 #include "minikin/Hyphenator.h"
31 
32 namespace minikin {
33 
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static std::unique_ptr<BreakIterator> createNewIterator(const Locale& locale,
36                                                         LineBreakStyle lbStyle,
37                                                         LineBreakWordStyle lbWordStyle) {
38     MINIKIN_ASSERT(lbStyle != LineBreakStyle::Auto,
39                    "LineBreakStyle::Auto must be resolved beforehand.");
40     MINIKIN_ASSERT(lbWordStyle != LineBreakWordStyle::Auto,
41                    "LineBreakWordStyle::Auto must be resolved beforehand.");
42 
43     // TODO: handle failure status
44     if (lbStyle == LineBreakStyle::NoBreak) {
45         return std::make_unique<NoBreakBreakIterator>();
46     } else {
47         UErrorCode status = U_ZERO_ERROR;
48         char localeID[ULOC_FULLNAME_CAPACITY] = {};
49         uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(),
50                             localeID, ULOC_FULLNAME_CAPACITY, nullptr, &status);
51         IcuUbrkUniquePtr icuBrkPtr(
52                 ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status));
53         return std::make_unique<ICUBreakIterator>(std::move(icuBrkPtr));
54     }
55 }
56 }  // namespace
57 
setText(UText * text,size_t)58 void ICUBreakIterator::setText(UText* text, size_t) {
59     UErrorCode status = U_ZERO_ERROR;
60     ubrk_setUText(mBreaker.get(), text, &status);
61 }
62 
isBoundary(int32_t i)63 bool ICUBreakIterator::isBoundary(int32_t i) {
64     return ubrk_isBoundary(mBreaker.get(), i);
65 }
66 
following(size_t i)67 int32_t ICUBreakIterator::following(size_t i) {
68     return ubrk_following(mBreaker.get(), i);
69 }
70 
next()71 int32_t ICUBreakIterator::next() {
72     return ubrk_next(mBreaker.get());
73 }
74 
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)75 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
76                                                          LineBreakStyle lbStyle,
77                                                          LineBreakWordStyle lbWordStyle) {
78     if (lbStyle == LineBreakStyle::Auto) {
79         lbStyle = locale.supportsScript('J', 'p', 'a', 'n') ? LineBreakStyle::Strict
80                                                             : LineBreakStyle::None;
81     }
82 
83     const uint64_t id = locale.getIdentifier();
84     std::lock_guard<std::mutex> lock(mMutex);
85     for (auto i = mPool.begin(); i != mPool.end(); i++) {
86         if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
87             Slot slot = std::move(*i);
88             mPool.erase(i);
89             return slot;
90         }
91     }
92 
93     // Not found in pool. Create new one.
94     return {id, lbStyle, lbWordStyle, createNewIterator(locale, lbStyle, lbWordStyle)};
95 }
96 
release(ICULineBreakerPool::Slot && slot)97 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
98     if (slot.breaker.get() == nullptr) {
99         return;  // Already released slot. Do nothing.
100     }
101     std::lock_guard<std::mutex> lock(mMutex);
102     if (mPool.size() >= MAX_POOL_SIZE) {
103         // Pool is full. Move to local variable, so that the given slot will be released when the
104         // variable leaves the scope.
105         Slot localSlot = std::move(slot);
106         return;
107     }
108     mPool.push_front(std::move(slot));
109 }
110 
WordBreaker()111 WordBreaker::WordBreaker()
112         : mPool(&ICULineBreakerPoolImpl::getInstance()), mUText(nullptr, &utext_close) {}
113 
WordBreaker(ICULineBreakerPool * pool)114 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}
115 
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)116 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
117                                          LineBreakWordStyle lbWordStyle, size_t from) {
118     if (!mUText) {
119         return mCurrent;
120     }
121     mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
122     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
123     // TODO: handle failure status
124     mIcuBreaker.breaker->setText(mUText.get(), mTextSize);
125     if (mInEmailOrUrl) {
126         // Note:
127         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
128         // The email/URL detection doesn't support following() functionality, so that we can't
129         // restart from the specific position. This means following() can not be supported in
130         // general, but keeping old email/URL context works for LineBreaker since it just wants to
131         // re-calculate the next break point with the new locale.
132     } else {
133         mCurrent = mLast = mScanOffset = from;
134         next();
135     }
136     return mCurrent;
137 }
138 
setText(const uint16_t * data,size_t size)139 void WordBreaker::setText(const uint16_t* data, size_t size) {
140     mText = data;
141     mTextSize = size;
142     mLast = 0;
143     mCurrent = 0;
144     mScanOffset = 0;
145     mInEmailOrUrl = false;
146     UErrorCode status = U_ZERO_ERROR;
147     mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
148 }
149 
current() const150 ssize_t WordBreaker::current() const {
151     return mCurrent;
152 }
153 
154 /**
155  * Determine whether a line break at position i within the buffer buf is valid. This
156  * represents customization beyond the ICU behavior, because plain ICU provides some
157  * line break opportunities that we don't want.
158  **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)159 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
160     const size_t position = static_cast<size_t>(i);
161     if (i == UBRK_DONE || position == bufEnd) {
162         // If the iterator reaches the end, treat as break.
163         return true;
164     }
165     uint32_t codePoint;
166     size_t prev_offset = position;
167     U16_PREV(buf, 0, prev_offset, codePoint);
168     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
169     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
170         return false;
171     }
172     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
173     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
174     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
175     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
176     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
177         return false;
178     }
179 
180     uint32_t next_codepoint;
181     size_t next_offset = position;
182     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
183 
184     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
185     // emoji data than ICU does.
186     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
187         return false;
188     }
189 
190     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
191     if (isEmojiModifier(next_codepoint)) {
192         if (codePoint == 0xFE0F && prev_offset > 0) {
193             // skip over emoji variation selector
194             U16_PREV(buf, 0, prev_offset, codePoint);
195         }
196         if (isEmojiBase(codePoint)) {
197             return false;
198         }
199     }
200     return true;
201 }
202 
203 // Customized iteratorNext that takes care of both resets and our modifications
204 // to ICU's behavior.
iteratorNext()205 int32_t WordBreaker::iteratorNext() {
206     int32_t result = mIcuBreaker.breaker->following(mCurrent);
207     while (!isValidBreak(mText, mTextSize, result)) {
208         result = mIcuBreaker.breaker->next();
209     }
210     return result;
211 }
212 
// Characters after which the Chicago Manual of Style recommends breaking in URLs and email
// addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
217 
// Characters before which the Chicago Manual of Style recommends breaking in URLs and email
// addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
223 
// States of the email/URL scanner. The values are consecutive: the scanner steps from
// SAW_COLON to SAW_COLON_SLASH to SAW_COLON_SLASH_SLASH by adding one.
enum ScanState {
    START = 0,
    SAW_AT = 1,
    SAW_COLON = 2,
    SAW_COLON_SLASH = 3,
    SAW_COLON_SLASH_SLASH = 4,
};
231 
detectEmailOrUrl()232 void WordBreaker::detectEmailOrUrl() {
233     if (mIcuBreaker.lbStyle == LineBreakStyle::NoBreak) {
234         mInEmailOrUrl = false;
235         return;
236     }
237     // scan forward from current ICU position for email address or URL
238     if (mLast >= mScanOffset) {
239         ScanState state = START;
240         size_t i;
241         for (i = mLast; i < mTextSize; i++) {
242             uint16_t c = mText[i];
243             // scan only ASCII characters, stop at space
244             if (!(' ' < c && c <= 0x007E)) {
245                 break;
246             }
247             if (state == START && c == '@') {
248                 state = SAW_AT;
249             } else if (state == START && c == ':') {
250                 state = SAW_COLON;
251             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
252                 if (c == '/') {
253                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
254                 } else {
255                     state = START;
256                 }
257             }
258         }
259         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
260             if (!mIcuBreaker.breaker->isBoundary(i)) {
261                 // If there are combining marks or such at the end of the URL or the email address,
262                 // consider them a part of the URL or the email, and skip to the next actual
263                 // boundary.
264                 i = mIcuBreaker.breaker->following(i);
265             }
266             mInEmailOrUrl = true;
267         } else {
268             mInEmailOrUrl = false;
269         }
270         mScanOffset = i;
271     }
272 }
273 
findNextBreakInEmailOrUrl()274 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
275     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
276     uint16_t lastChar = mText[mLast];
277     ssize_t i;
278     for (i = mLast + 1; i < mScanOffset; i++) {
279         if (breakAfter(lastChar)) {
280             break;
281         }
282         // break after double slash
283         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
284             break;
285         }
286         const uint16_t thisChar = mText[i];
287         // never break after hyphen
288         if (lastChar != '-') {
289             if (breakBefore(thisChar)) {
290                 break;
291             }
292             // break before single slash
293             if (thisChar == '/' && lastChar != '/' &&
294                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
295                 break;
296             }
297         }
298         lastChar = thisChar;
299     }
300     return i;
301 }
302 
next()303 ssize_t WordBreaker::next() {
304     mLast = mCurrent;
305 
306     detectEmailOrUrl();
307     if (mInEmailOrUrl) {
308         mCurrent = findNextBreakInEmailOrUrl();
309     } else {  // Business as usual
310         mCurrent = (ssize_t)iteratorNext();
311     }
312     return mCurrent;
313 }
314 
wordStart() const315 ssize_t WordBreaker::wordStart() const {
316     if (mInEmailOrUrl) {
317         return mLast;
318     }
319     ssize_t result = mLast;
320     while (result < mCurrent) {
321         UChar32 c;
322         ssize_t ix = result;
323         U16_NEXT(mText, ix, mCurrent, c);
324         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
325         // strip leading punctuation, defined as OP and QU line breaking classes,
326         // see UAX #14
327         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
328             break;
329         }
330         result = ix;
331     }
332     return result;
333 }
334 
wordEnd() const335 ssize_t WordBreaker::wordEnd() const {
336     if (mInEmailOrUrl) {
337         return mLast;
338     }
339     ssize_t result = mCurrent;
340     while (result > mLast) {
341         UChar32 c;
342         ssize_t ix = result;
343         U16_PREV(mText, mLast, ix, c);
344         const int32_t gc_mask = U_GET_GC_MASK(c);
345         // strip trailing spaces, punctuation and control characters
346         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
347             break;
348         }
349         result = ix;
350     }
351     return result;
352 }
353 
breakBadness() const354 int WordBreaker::breakBadness() const {
355     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
356 }
357 
finish()358 void WordBreaker::finish() {
359     mText = nullptr;
360     mUText.reset();
361     mPool->release(std::move(mIcuBreaker));
362 }
363 
364 }  // namespace minikin
365