xref: /aosp_15_r20/frameworks/minikin/tests/unittest/WordBreakerTests.cpp (revision 834a2baab5fdfc28e9a428ee87c7ea8f6a06a53d)
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <com_android_text_flags.h>
18 #include <flag_macros.h>
19 #include <gtest/gtest.h>
20 
21 #include <cstdio>
22 
23 #include "UnicodeUtils.h"
24 #include "WordBreaker.h"
25 
// Number of elements in a statically sized array. Only valid for true arrays;
// applying it to a pointer yields a meaningless value.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
29 
// Expands to the two comma-separated UTF-16 code units (lead surrogate, trail surrogate) of a
// code point, for use inside uint16_t array initializers. The argument is evaluated twice.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
31 
32 namespace minikin {
33 
// Two ASCII words separated by one space: checks break offsets, word ranges and badness.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const auto textLength = static_cast<ssize_t>(NELEM(text));
    const auto lbStyle = LineBreakStyle::None;
    const auto lbWordStyle = LineBreakWordStyle::None;

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break lands after "hello "; the reported word is "hello".
    const Locale enUS("en-US");
    EXPECT_EQ(6, wordBreaker.followingWithLocale(enUS, lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(5, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
    EXPECT_EQ(6, wordBreaker.current());

    // Second break is the end of the text; the reported word is "world".
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(6, wordBreaker.wordStart());
    EXPECT_EQ(11, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
    EXPECT_EQ(11, wordBreaker.current());
}
51 
// A SOFT HYPHEN (U+00AD) inside a word must not introduce a break or split the word range.
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break: after "hel{SOFT HYPHEN}lo "; the word is "hel{SOFT HYPHEN}lo".
    const Locale enUS("en-US");
    EXPECT_EQ(7, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(6, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // Second break: end of text; the word is "world".
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(12, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
69 
// A hyphen-minus no longer provides a break opportunity: "sugar-free" is a single segment.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // The only break is the end of the text, and the word spans everything.
    const Locale enUS("en-US");
    EXPECT_EQ(textLength, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                          LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(textLength, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
84 
// Currency signs attached to letters are reported as part of the adjacent word.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // Break after the CENT SIGN and the following space; the word is "US¢".
    const Locale enUS("en-US");
    EXPECT_EQ(4, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(3, wordBreaker.wordEnd());

    // End of string; the word is "JP¥".
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(4, wordBreaker.wordStart());
    EXPECT_EQ(textLength, wordBreaker.wordEnd());
}
102 
// A Myanmar kinzi sequence must stay in one segment.
TEST(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // The first and only break is the end of the string; the word covers the whole text.
    const Locale enUS("en-US");
    EXPECT_EQ(textLength, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                          LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(textLength, wordBreaker.wordEnd());
}
117 
// Emoji joined by ZERO WIDTH JOINER must not be split; breaks fall only between sequences.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // man + zwj + heart + zwj + man (7 code units)
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units)
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble (5 code units)
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE (5 code units)
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After man + zwj + heart + zwj + man.
    const Locale enUS("en-US");
    EXPECT_EQ(7, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());

    // After woman + zwj + heart + zwj + kiss mark + zwj + woman.
    EXPECT_EQ(17, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(17, wordBreaker.wordEnd());

    // After eye + zwj + left speech bubble.
    EXPECT_EQ(22, wordBreaker.next());
    EXPECT_EQ(17, wordBreaker.wordStart());
    EXPECT_EQ(22, wordBreaker.wordEnd());

    // End of text, after CAT FACE + zwj + BUST IN SILHOUETTE.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(22, wordBreaker.wordStart());
    EXPECT_EQ(27, wordBreaker.wordEnd());
}
148 
// An emoji followed by a Fitzpatrick modifier forms one unbreakable unit.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After boy + type 1-2 fitzpatrick modifier.
    const Locale enUS("en-US");
    EXPECT_EQ(4, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(4, wordBreaker.wordEnd());

    // End of text, after victory hand + emoji style + type 6 fitzpatrick modifier.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(4, wordBreaker.wordStart());
    EXPECT_EQ(8, wordBreaker.wordEnd());
}
168 
// Adjacent emoji (with or without variation selectors) must be separated by a break.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // SLED
    const Locale en("en");
    EXPECT_EQ(2, wordBreaker.followingWithLocale(en, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(2, wordBreaker.wordEnd());

    // SLED
    EXPECT_EQ(4, wordBreaker.next());
    EXPECT_EQ(2, wordBreaker.wordStart());
    EXPECT_EQ(4, wordBreaker.wordEnd());

    // SLED + VS15
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_EQ(4, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());

    // SLED
    EXPECT_EQ(9, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(9, wordBreaker.wordEnd());

    // WHITE SMILING FACE
    EXPECT_EQ(10, wordBreaker.next());
    EXPECT_EQ(9, wordBreaker.wordStart());
    EXPECT_EQ(10, wordBreaker.wordEnd());

    // SLED
    EXPECT_EQ(12, wordBreaker.next());
    EXPECT_EQ(10, wordBreaker.wordStart());
    EXPECT_EQ(12, wordBreaker.wordEnd());

    // WHITE SMILING FACE + VS16
    EXPECT_EQ(14, wordBreaker.next());
    EXPECT_EQ(12, wordBreaker.wordStart());
    EXPECT_EQ(14, wordBreaker.wordEnd());

    // SLED
    EXPECT_EQ(16, wordBreaker.next());
    EXPECT_EQ(14, wordBreaker.wordStart());
    EXPECT_EQ(16, wordBreaker.wordEnd());
}
218 
// Two consecutive U+1F3F4 flags (no tag sequence) must be separated by a break.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kSingleFlag = "U+1F3F4";
    const std::string twoFlags = kSingleFlag + " " + kSingleFlag;

    // Each flag is one supplementary code point, i.e. two UTF-16 code units.
    constexpr int kFlagLength = 2;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t text[kBufSize];
    size_t size;
    ParseUnicode(text, kBufSize, twoFlags.c_str(), &size, nullptr);

    WordBreaker wordBreaker;
    wordBreaker.setText(text, size);
    EXPECT_EQ(0, wordBreaker.current());

    // End of the first flag.
    const Locale enUS("en-US");
    EXPECT_EQ(kFlagLength, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                           LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(kFlagLength, wordBreaker.wordEnd());

    // End of the second flag, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(size), wordBreaker.next());
    EXPECT_EQ(kFlagLength, wordBreaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, wordBreaker.wordEnd());
}
243 
// Two consecutive emoji tag sequences must each stay intact, with a break only between them.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is the emoji tag sequence for the
    // flag of Scotland.
    const std::string kScotlandFlag = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = kScotlandFlag + " " + kScotlandFlag;

    // Seven supplementary code points per flag, i.e. fourteen UTF-16 code units.
    constexpr int kFlagLength = 14;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t text[kBufSize];
    size_t size;
    ParseUnicode(text, kBufSize, twoFlags.c_str(), &size, nullptr);

    WordBreaker wordBreaker;
    wordBreaker.setText(text, size);
    EXPECT_EQ(0, wordBreaker.current());

    // End of the first flag sequence.
    const Locale enUS("en-US");
    EXPECT_EQ(kFlagLength, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                           LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(kFlagLength, wordBreaker.wordEnd());

    // End of the second flag sequence, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(size), wordBreaker.next());
    EXPECT_EQ(kFlagLength, wordBreaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, wordBreaker.wordEnd());
}
270 
// Leading and trailing punctuation is excluded from the reported word range.
TEST(WordBreakerTest, punct) {
    // "¡¡hello, world!!"
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                       ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const auto textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // Break after "¡¡hello, "; the word is just "hello".
    const Locale enUS("en-US");
    EXPECT_EQ(9, wordBreaker.followingWithLocale(enUS, LineBreakStyle::None,
                                                 LineBreakWordStyle::None, 0));
    EXPECT_EQ(2, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is just "world".
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(9, wordBreaker.wordStart());
    EXPECT_EQ(14, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
289 
// Break behaviour inside an email address. Segments of the address are expected to report an
// empty word range (wordStart() >= wordEnd()), and the break inside the address carries a
// badness of 1.
TEST(WordBreakerTest, email) {
    // "foo@example.com x"
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                      'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "foo@example". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(16, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(16, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
310 
// Break behaviour inside a "mailto:" URL. Segments of the URL are expected to report an empty
// word range (wordStart() >= wordEnd()), and breaks inside it carry a badness of 1.
TEST(WordBreakerTest, mailto) {
    // "mailto:foo@example.com x"
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                      'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "mailto:". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(23, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(23, breaker.wordStart());
    EXPECT_EQ(24, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
334 
// The current logic always places a line break between a detected email address or URL
// and an immediately following non-ASCII character.
// An email address immediately followed by a non-ASCII character (U+4E00): a break is expected
// right after the address, and the trailing character is reported as its own word.
TEST(WordBreakerTest, emailNonAscii) {
    // "foo@example.com一"
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
                      'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "foo@example". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com".
    EXPECT_EQ(15, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "一".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(15, breaker.wordStart());
    EXPECT_EQ(16, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
357 
// An email address whose last letter carries a combining mark (U+0303): the mark stays attached
// to the address segment instead of producing an extra break.
TEST(WordBreakerTest, emailCombining) {
    // "foo@example.com" + COMBINING TILDE + " x"
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a',    'm', 'p',
                      'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "foo@example". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com̃ ".
    EXPECT_EQ(17, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(18, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
378 
// A stand-alone '@' surrounded by spaces: every break is expected to have a badness of 0, and
// the "@ " segment is expected to report an empty word range.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "a "; the word is "a".
    EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(1, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // After "@ ". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(4, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "b".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(4, breaker.wordStart());
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
399 
// Break behaviour inside an http URL. Segments of the URL are expected to report an empty word
// range (wordStart() >= wordEnd()), and breaks inside it carry a badness of 1.
TEST(WordBreakerTest, url) {
    // "http://example.com x"
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                      'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "example".
    EXPECT_EQ(14, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(19, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_EQ(19, breaker.wordStart());
    EXPECT_EQ(20, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
426 
427 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)428 TEST(WordBreakerTest, urlBreakChars) {
429     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
430                       '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
431                       'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
432     auto lbStyle = LineBreakStyle::None;
433     auto lbWordStyle = LineBreakWordStyle::None;
434     WordBreaker breaker;
435     breaker.setText(buf, NELEM(buf));
436     EXPECT_EQ(0, breaker.current());
437     EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
438                                              0));  // after "http:"
439     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440     EXPECT_EQ(1, breaker.breakBadness());
441     EXPECT_EQ(7, breaker.next());  // after "//"
442     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443     EXPECT_EQ(1, breaker.breakBadness());
444     EXPECT_EQ(8, breaker.next());  // after "a"
445     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
446     EXPECT_EQ(1, breaker.breakBadness());
447     EXPECT_EQ(10, breaker.next());  // after ".b"
448     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
449     EXPECT_EQ(1, breaker.breakBadness());
450     EXPECT_EQ(11, breaker.next());  // after "/"
451     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
452     EXPECT_EQ(1, breaker.breakBadness());
453     EXPECT_EQ(13, breaker.next());  // after "~c"
454     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
455     EXPECT_EQ(1, breaker.breakBadness());
456     EXPECT_EQ(15, breaker.next());  // after ",d"
457     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
458     EXPECT_EQ(1, breaker.breakBadness());
459     EXPECT_EQ(17, breaker.next());  // after "-e"
460     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
461     EXPECT_EQ(1, breaker.breakBadness());
462     EXPECT_EQ(19, breaker.next());  // after "?f"
463     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
464     EXPECT_EQ(1, breaker.breakBadness());
465     EXPECT_EQ(20, breaker.next());  // after "="
466     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
467     EXPECT_EQ(1, breaker.breakBadness());
468     EXPECT_EQ(21, breaker.next());  // after "g"
469     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
470     EXPECT_EQ(1, breaker.breakBadness());
471     EXPECT_EQ(22, breaker.next());  // after "&"
472     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
473     EXPECT_EQ(1, breaker.breakBadness());
474     EXPECT_EQ(23, breaker.next());  // after "h"
475     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
476     EXPECT_EQ(1, breaker.breakBadness());
477     EXPECT_EQ(25, breaker.next());  // after "#i"
478     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
479     EXPECT_EQ(1, breaker.breakBadness());
480     EXPECT_EQ(27, breaker.next());  // after "%j"
481     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
482     EXPECT_EQ(1, breaker.breakBadness());
483     EXPECT_EQ(29, breaker.next());  // after "_k"
484     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
485     EXPECT_EQ(1, breaker.breakBadness());
486     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
487     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
488     EXPECT_EQ(0, breaker.breakBadness());
489 }
490 
// In "http://a-/b" no break is expected around the hyphen: after "a" the next break is the end
// of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // After "a".
    EXPECT_EQ(8, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // End of text.
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
508 
// A URL whose last character is '/': the trailing slash ends the final segment at the end of
// the text.
TEST(WordBreakerTest, urlEndsWithSlash) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:". EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // After "a".
    EXPECT_EQ(8, breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // End of text.
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
526 
// "/a@b": the only break is the end of the text, and the segment reports an empty word range.
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t buf[] = {'/', 'a', '@', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // End of text. EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)),
              breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
538 
// Calling followingWithLocale() again from an offset inside a URL must yield the same break
// points as continuing with next().
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> buf = utf8ToUtf16("Hello http://abc/d.html World");
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf.data(), buf.size());
    EXPECT_EQ(0, breaker.current());

    // After "Hello "; the word is "Hello".
    EXPECT_EQ(6, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(5, breaker.wordEnd());

    EXPECT_EQ(6, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "http:"

    // Restart from the start of the URL (offset 6). It should return the same break point as
    // the previous call. EXPECT_GE is used so both offsets are printed on failure.
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 6));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    EXPECT_EQ(13, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // Restart from the middle of "//" (offset 12). It should return the same break point as
    // the previous call.
    EXPECT_EQ(13, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 12));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(16, breaker.next());  // after "abc"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(18, breaker.next());  // after "/d"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(24, breaker.next());  // after ".html "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    EXPECT_EQ(29, breaker.next());  // after "World"
    EXPECT_EQ(24, breaker.wordStart());
    EXPECT_EQ(29, breaker.wordEnd());
}
577 
578 // b/68669534
// A U+0020 SPACE directly followed by another kind of space character: the second space is
// expected to form its own segment with an empty word range.
TEST(WordBreakerTest, spaceAfterSpace) {
    const std::vector<uint16_t> kSecondSpaces = {
            '\t',    // TAB
            0x1680,  // OGHAM SPACE MARK
            0x3000,  // IDEOGRAPHIC SPACE
    };

    constexpr uint16_t kAsciiSpace = 0x0020;
    const auto lbStyle = LineBreakStyle::None;
    const auto lbWordStyle = LineBreakWordStyle::None;

    for (uint16_t sp : kSecondSpaces) {
        // Label any failure with the space character under test.
        char traceLabel[64] = {};
        std::snprintf(traceLabel, sizeof(traceLabel), "Test Space: U+%04X", sp);
        SCOPED_TRACE(traceLabel);

        std::vector<uint16_t> text = {'a', kAsciiSpace, sp, 'b'};
        WordBreaker wordBreaker;
        wordBreaker.setText(text.data(), text.size());

        EXPECT_EQ(0, wordBreaker.current());

        // After "a " (the letter plus U+0020); the word is "a".
        EXPECT_EQ(2, wordBreaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
        EXPECT_EQ(0, wordBreaker.wordStart());
        EXPECT_EQ(1, wordBreaker.wordEnd());

        // After the `sp` character; this segment has an empty word range.
        EXPECT_EQ(2, wordBreaker.current());
        EXPECT_EQ(3, wordBreaker.next());
        EXPECT_EQ(2, wordBreaker.wordStart());
        EXPECT_EQ(2, wordBreaker.wordEnd());

        // End of text, after 'b'; the word is "b".
        EXPECT_EQ(3, wordBreaker.current());
        EXPECT_EQ(4, wordBreaker.next());
        EXPECT_EQ(3, wordBreaker.wordStart());
        EXPECT_EQ(4, wordBreaker.wordEnd());
    }
}
616 
617 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
618 public:
TestableICULineBreakerPoolImpl()619     TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
620 
621     using ICULineBreakerPoolImpl::getPoolSize;
622     using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
623 };
624 
// Acquiring slots without releasing any must hand out a distinct breaker instance every time,
// even for identical (locale, style, word style) requests, while the locale id depends only on
// the locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // All five of the following breakers must be different instances.
    ICULineBreakerPool::Slot enUSBreaker =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enUSBreaker2 =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enUSBreaker3 =
            pool.acquire(enUS, LineBreakStyle::Strict, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frFRBreaker =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frFRBreaker2 =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::Phrase);

    // Every slot must actually hold a breaker.
    EXPECT_NE(nullptr, enUSBreaker.breaker.get());
    EXPECT_NE(nullptr, enUSBreaker2.breaker.get());
    EXPECT_NE(nullptr, enUSBreaker3.breaker.get());
    EXPECT_NE(nullptr, frFRBreaker.breaker.get());
    EXPECT_NE(nullptr, frFRBreaker2.breaker.get());

    // Pairwise distinctness of all five instances (all ten pairs).
    EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker2.breaker.get());
    EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker3.breaker.get());
    EXPECT_NE(enUSBreaker.breaker.get(), frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreaker.breaker.get(), frFRBreaker2.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), enUSBreaker3.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker2.breaker.get());
    EXPECT_NE(enUSBreaker3.breaker.get(), frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreaker3.breaker.get(), frFRBreaker2.breaker.get());
    EXPECT_NE(frFRBreaker.breaker.get(), frFRBreaker2.breaker.get());

    // Slots for the same locale share a locale id regardless of the break styles.
    EXPECT_EQ(enUSBreaker.localeId, enUSBreaker2.localeId);
    EXPECT_EQ(enUSBreaker.localeId, enUSBreaker3.localeId);
    EXPECT_EQ(frFRBreaker.localeId, frFRBreaker2.localeId);

    // Slots for different locales have different locale ids.
    EXPECT_NE(enUSBreaker.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker.localeId, frFRBreaker2.localeId);
    EXPECT_NE(enUSBreaker2.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker2.localeId, frFRBreaker2.localeId);
    EXPECT_NE(enUSBreaker3.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker3.localeId, frFRBreaker2.localeId);
}
664 
// Checks which breaker instance acquire() returns after a slot has been released to the pool:
// the released instance is expected back only for the same locale and line break settings.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    // Acquire a single en-US breaker and remember its identity before handing it back.
    ICULineBreakerPool::Slot firstEnglish =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);

    auto* const releasedBreaker = firstEnglish.breaker.get();
    const uint64_t releasedLocaleId = firstEnglish.localeId;

    // After release, the moved-from slot is expected to no longer hold a breaker.
    pool.release(std::move(firstEnglish));
    EXPECT_EQ(nullptr, firstEnglish.breaker.get());

    // A different locale must not be served with the released en-US instance.
    ICULineBreakerPool::Slot frenchLoose =
            pool.acquire(french, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_NE(releasedBreaker, frenchLoose.breaker.get());
    EXPECT_NE(releasedLocaleId, frenchLoose.localeId);

    // The same locale and styles must get the previously released instance back.
    ICULineBreakerPool::Slot secondEnglish =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_EQ(releasedBreaker, secondEnglish.breaker.get());
    EXPECT_EQ(releasedLocaleId, secondEnglish.localeId);

    // Different line break style / word style must yield distinct instances, while the locale
    // id stays the same for the same locale.
    ICULineBreakerPool::Slot frenchNormal =
            pool.acquire(french, LineBreakStyle::Normal, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frenchNormalPhrase =
            pool.acquire(french, LineBreakStyle::Normal, LineBreakWordStyle::Phrase);

    EXPECT_NE(frenchLoose.breaker.get(), frenchNormal.breaker.get());
    EXPECT_NE(frenchLoose.breaker.get(), frenchNormalPhrase.breaker.get());
    EXPECT_NE(frenchNormal.breaker.get(), frenchNormalPhrase.breaker.get());

    EXPECT_EQ(frenchLoose.localeId, frenchNormal.localeId);
    EXPECT_EQ(frenchLoose.localeId, frenchNormalPhrase.localeId);
    EXPECT_EQ(frenchNormal.localeId, frenchNormalPhrase.localeId);
}
705 
// Releases twice as many slots as MAX_POOL_SIZE and checks that the pooled count grows one by
// one up to MAX_POOL_SIZE and then stays there.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t kCapacity = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    const size_t kSlotCount = kCapacity * 2;
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");

    ICULineBreakerPool::Slot slots[kSlotCount];

    // Acquire twice the capacity. Nothing has been released yet, so the pool stays empty.
    for (ICULineBreakerPool::Slot& slot : slots) {
        slot = pool.acquire(english, LineBreakStyle::None, LineBreakWordStyle::None);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // Releasing the first half grows the pool by one per release.
    for (size_t released = 1; released <= kCapacity; released++) {
        pool.release(std::move(slots[released - 1]));
        EXPECT_EQ(released, pool.getPoolSize());
    }

    // Releasing the second half must not grow the pool beyond its capacity.
    for (size_t index = kCapacity; index < kSlotCount; index++) {
        pool.release(std::move(slots[index]));
        EXPECT_EQ(kCapacity, pool.getPoolSize());
    }
}
730 
// With LineBreakStyle::NoBreak, a URL containing a hyphen is expected to produce its first break
// only at the end of the 11-unit buffer.
TEST(WordBreakerTest, noBreak_urlNoHyphenBreak) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const Locale locale("en-US");

    WordBreaker breaker;
    breaker.setText(url, NELEM(url));
    EXPECT_EQ(0, breaker.current());

    // The first break after offset 0 is the end of the text.
    EXPECT_EQ(11, breaker.followingWithLocale(locale, LineBreakStyle::NoBreak,
                                              LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(11, breaker.current());

    // Advancing again is expected to stay at the end.
    EXPECT_EQ(11, breaker.next());
}
743 
// With LineBreakStyle::NoBreak, a URL ending in '/' is expected to produce its first break only
// at the end of the 9-unit buffer.
TEST(WordBreakerTest, noBreak_urlEndsWithSlash) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const Locale locale("en-US");

    WordBreaker breaker;
    breaker.setText(url, NELEM(url));
    EXPECT_EQ(0, breaker.current());

    // The first break after offset 0 is the end of the text.
    EXPECT_EQ(9, breaker.followingWithLocale(locale, LineBreakStyle::NoBreak,
                                             LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, breaker.wordStart());

    // Advancing again is expected to stay at the end.
    EXPECT_EQ(9, breaker.next());
}
755 
// With LineBreakStyle::NoBreak, text that mixes plain words and a URL is expected to be reported
// as a single span covering all 29 code units.
TEST(WordBreakerTest, noBreak_setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale locale("en-US");

    WordBreaker breaker;
    breaker.setText(text.data(), text.size());
    EXPECT_EQ(0, breaker.current());

    // The first break after offset 0 is the end of the text, and the word spans all of it.
    EXPECT_EQ(29, breaker.followingWithLocale(locale, LineBreakStyle::NoBreak,
                                              LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(29, breaker.wordEnd());

    // Advancing again is expected to stay at the end.
    EXPECT_EQ(29, breaker.current());
    EXPECT_EQ(29, breaker.next());
}
770 
771 }  // namespace minikin
772