1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <com_android_text_flags.h>
18 #include <flag_macros.h>
19 #include <gtest/gtest.h>
20
21 #include <cstdio>
22
23 #include "UnicodeUtils.h"
24 #include "WordBreaker.h"
25
// Element count of a statically sized array. The argument must be a real array;
// the result is sizeof(array) divided by sizeof(first element).
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif

// Expands to two comma-separated values, U16_LEAD(codepoint) and U16_TRAIL(codepoint),
// so that a supplementary code point can be written inline in a uint16_t array
// initializer. The argument is evaluated twice.
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
31
32 namespace minikin {
33
// Plain ASCII text: one break after the space, one at the end of the text.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "hello "; the word "hello" occupies [0, 5).
    EXPECT_EQ(6, wb.followingWithLocale(enUS, LineBreakStyle::None, LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second segment is "world" and reaches the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
51
// A SOFT HYPHEN (U+00AD) inside "hello" does not produce a break of its own here:
// the first break reported is after the following space.
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "hel{SOFT HYPHEN}lo "; the word occupies [0, 6).
    EXPECT_EQ(7, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second segment is "world" and reaches the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
69
// Hyphens should not allow breaks anymore: "sugar-free" is reported as a single
// segment whose word range covers the whole text.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The very first break is already the end of the text.
    EXPECT_EQ(textLen, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textLen, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
84
// Currency signs adjacent to letters are counted as part of the word range.
TEST(WordBreakerTest, postfixAndPrefix) {
    // "US¢ JP¥": U+00A2 is CENT SIGN, U+00A5 is YEN SIGN.
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "US¢ "; the word "US¢" occupies [0, 3).
    EXPECT_EQ(4, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // Second segment is "JP¥" and reaches the end of the string.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textLen, wb.wordEnd());
}
102
// A Myanmar kinzi sequence followed by a consonant and a vowel sign must stay in
// one segment: the first break reported is the end of the text.
TEST(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA (U+102C).
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // End of string; the word range covers everything.
    EXPECT_EQ(textLen, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textLen, wb.wordEnd());
}
117
// Emoji joined by ZERO WIDTH JOINER (U+200D) form one segment each; breaks are only
// reported between complete ZWJ sequences.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // [0, 7): man + zwj + heart + zwj + man
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // [7, 17): woman + zwj + heart + zwj + kiss mark + zwj + woman
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // [17, 22): eye + zwj + left speech bubble
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // [22, 27): CAT FACE + zwj + BUST IN SILHOUETTE
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After man + zwj + heart + zwj + man.
    EXPECT_EQ(7, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // After woman + zwj + heart + zwj + kiss mark + zwj + woman.
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // After eye + zwj + left speech bubble.
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // After CAT FACE + zwj + BUST IN SILHOUETTE, i.e. the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
148
// An emoji followed by a Fitzpatrick skin-tone modifier stays in one segment.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // [0, 4): boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // [4, 8): victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After boy + type 1-2 fitzpatrick modifier.
    EXPECT_EQ(4, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // After the victory hand sequence, i.e. the end of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
168
// Should break between emojis; a variation selector stays attached to the emoji
// that precedes it.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // [0, 2) SLED, [2, 4) SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // [4, 7) SLED + VS15, [7, 9) SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // [9, 10) WHITE SMILING FACE, [10, 12) SLED
            0x263A, UTF16(0x1F6F7),
            // [12, 14) WHITE SMILING FACE + VS16, [14, 16) SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };
    const Locale en("en");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // SLED
    EXPECT_EQ(2, wb.followingWithLocale(en, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(2, wb.wordEnd());

    // SLED
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // SLED + VS15
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // SLED
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(9, wb.wordEnd());

    // WHITE SMILING FACE
    EXPECT_EQ(10, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(10, wb.wordEnd());

    // SLED
    EXPECT_EQ(12, wb.next());
    EXPECT_EQ(10, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());

    // WHITE SMILING FACE + VS16
    EXPECT_EQ(14, wb.next());
    EXPECT_EQ(12, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());

    // SLED
    EXPECT_EQ(16, wb.next());
    EXPECT_EQ(14, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
}
218
// Two consecutive U+1F3F4 code points: each one is reported as its own segment.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kFlag = "U+1F3F4";
    const std::string twoFlags = kFlag + " " + kFlag;

    // One flag is two UTF-16 code units; the buffer holds exactly two flags.
    const int kFlagLength = 2;
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t utf16[BUF_SIZE];
    size_t parsedLen;
    ParseUnicode(utf16, BUF_SIZE, twoFlags.c_str(), &parsedLen, nullptr);

    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(utf16, parsedLen);
    EXPECT_EQ(0, wb.current());

    // End of the first flag.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of the second flag, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLen), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
243
// Two emoji tag sequences back to back: a break is reported only between them.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = kFlagSequence + " " + kFlagSequence;

    // Seven supplementary code points per flag, two UTF-16 code units each.
    const int kFlagLength = 14;
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t utf16[BUF_SIZE];
    size_t parsedLen;
    ParseUnicode(utf16, BUF_SIZE, twoFlags.c_str(), &parsedLen, nullptr);

    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(utf16, parsedLen);
    EXPECT_EQ(0, wb.current());

    // End of the first flag sequence.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of the second flag sequence, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLen), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
270
// Leading and trailing punctuation is part of the segment but excluded from the
// reported word range.
TEST(WordBreakerTest, punct) {
    // "¡¡hello, world!!" (U+00A1 is INVERTED EXCLAMATION MARK).
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                       ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First segment is "¡¡hello, "; the word "hello" occupies [2, 7).
    EXPECT_EQ(9, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second segment is "world!!"; the word "world" occupies [9, 14).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
289
// An email address offers a break before the dot with badness 1 and reports an
// empty word range (start not before end) for the pieces of the address.
TEST(WordBreakerTest, email) {
    // "foo@example.com x"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of the text; the word "x" occupies [16, 17).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
310
// A mailto: URL offers badness-1 breaks after the scheme and before the dot, and
// reports an empty word range (start not before end) for the URL pieces.
TEST(WordBreakerTest, mailto) {
    // "mailto:foo@example.com x"
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "mailto:".
    EXPECT_EQ(7, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(23, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of the text; the word "x" occupies [23, 24).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
334
335 // The current logic always places a line break after a detected email address or URL
336 // and an immediately following non-ASCII character.
TEST(WordBreakerTest,emailNonAscii)337 TEST(WordBreakerTest, emailNonAscii) {
338 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
339 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
340 auto lbStyle = LineBreakStyle::None;
341 auto lbWordStyle = LineBreakWordStyle::None;
342 WordBreaker breaker;
343 breaker.setText(buf, NELEM(buf));
344 EXPECT_EQ(0, breaker.current());
345 EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
346 0)); // after "foo@example"
347 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
348 EXPECT_EQ(1, breaker.breakBadness());
349 EXPECT_EQ(15, breaker.next()); // after ".com"
350 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
351 EXPECT_EQ(0, breaker.breakBadness());
352 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
353 EXPECT_EQ(15, breaker.wordStart()); // "一"
354 EXPECT_EQ(16, breaker.wordEnd());
355 EXPECT_EQ(0, breaker.breakBadness());
356 }
357
// A combining mark (U+0303) directly after an email address stays attached to the
// address: no break is reported between "com" and the mark.
TEST(WordBreakerTest, emailCombining) {
    // "foo@example.com" + U+0303 + " x"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',    'p',
                       'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com", the combining mark and the space.
    EXPECT_EQ(17, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of the text; the word "x" occupies [17, 18).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
378
// A free-standing '@' surrounded by spaces gets ordinary breaks with badness 0.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "a "; the word "a" occupies [0, 1).
    EXPECT_EQ(2, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // After "@ "; the reported word range is empty.
    EXPECT_EQ(4, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of the text; the word "b" occupies [4, 5).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
399
// An http URL offers badness-1 breaks inside the URL and reports an empty word
// range (start not before end) for each URL piece.
TEST(WordBreakerTest, url) {
    // "http://example.com x"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "example".
    EXPECT_EQ(14, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(19, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of the text; the word "x" occupies [19, 20).
    EXPECT_EQ(textLen, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
426
427 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)428 TEST(WordBreakerTest, urlBreakChars) {
429 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
430 '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
431 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
432 auto lbStyle = LineBreakStyle::None;
433 auto lbWordStyle = LineBreakWordStyle::None;
434 WordBreaker breaker;
435 breaker.setText(buf, NELEM(buf));
436 EXPECT_EQ(0, breaker.current());
437 EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
438 0)); // after "http:"
439 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440 EXPECT_EQ(1, breaker.breakBadness());
441 EXPECT_EQ(7, breaker.next()); // after "//"
442 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443 EXPECT_EQ(1, breaker.breakBadness());
444 EXPECT_EQ(8, breaker.next()); // after "a"
445 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
446 EXPECT_EQ(1, breaker.breakBadness());
447 EXPECT_EQ(10, breaker.next()); // after ".b"
448 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
449 EXPECT_EQ(1, breaker.breakBadness());
450 EXPECT_EQ(11, breaker.next()); // after "/"
451 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
452 EXPECT_EQ(1, breaker.breakBadness());
453 EXPECT_EQ(13, breaker.next()); // after "~c"
454 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
455 EXPECT_EQ(1, breaker.breakBadness());
456 EXPECT_EQ(15, breaker.next()); // after ",d"
457 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
458 EXPECT_EQ(1, breaker.breakBadness());
459 EXPECT_EQ(17, breaker.next()); // after "-e"
460 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
461 EXPECT_EQ(1, breaker.breakBadness());
462 EXPECT_EQ(19, breaker.next()); // after "?f"
463 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
464 EXPECT_EQ(1, breaker.breakBadness());
465 EXPECT_EQ(20, breaker.next()); // after "="
466 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
467 EXPECT_EQ(1, breaker.breakBadness());
468 EXPECT_EQ(21, breaker.next()); // after "g"
469 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
470 EXPECT_EQ(1, breaker.breakBadness());
471 EXPECT_EQ(22, breaker.next()); // after "&"
472 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
473 EXPECT_EQ(1, breaker.breakBadness());
474 EXPECT_EQ(23, breaker.next()); // after "h"
475 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
476 EXPECT_EQ(1, breaker.breakBadness());
477 EXPECT_EQ(25, breaker.next()); // after "#i"
478 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
479 EXPECT_EQ(1, breaker.breakBadness());
480 EXPECT_EQ(27, breaker.next()); // after "%j"
481 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
482 EXPECT_EQ(1, breaker.breakBadness());
483 EXPECT_EQ(29, breaker.next()); // after "_k"
484 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
485 EXPECT_EQ(1, breaker.breakBadness());
486 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
487 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
488 EXPECT_EQ(0, breaker.breakBadness());
489 }
490
// Inside a URL, a hyphen directly followed by '/' does not yield a break:
// after "a" the next break reported is the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    // "http://a-/b"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
508
// A URL whose last character is '/' still iterates cleanly to the end of the text.
TEST(WordBreakerTest, urlEndsWithSlash) {
    // "http://a/"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of the text.
    EXPECT_EQ(textLen, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
526
// An email-like text that begins with '/' is reported as one segment with an
// empty word range (start not before end).
TEST(WordBreakerTest, emailStartsWithSlash) {
    // "/a@b"
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const auto textLen = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The first break is already the end of the text.
    EXPECT_EQ(textLen, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
538
// Calling followingWithLocale() again from an offset inside a URL must land on the
// same break that plain iteration had already produced.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // After "Hello "; the word "Hello" occupies [0, 5).
    EXPECT_EQ(6, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());

    EXPECT_EQ(6, wb.current());
    EXPECT_EQ(11, wb.next());  // after "http:"

    // Restart from middle point of the URL. It should return the same previous break point,
    // i.e. after "http:".
    EXPECT_EQ(11, wb.followingWithLocale(enUS, style, wordStyle, 6));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(13, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Restart from middle point of the URL. It should return the same previous break point,
    // i.e. after "//".
    EXPECT_EQ(13, wb.followingWithLocale(enUS, style, wordStyle, 12));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "abc".
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "/d".
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After ".html " (offset 24 includes the space that follows the URL).
    EXPECT_EQ(24, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "World"; the word occupies [24, 29).
    EXPECT_EQ(29, wb.next());
    EXPECT_EQ(24, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());
}
577
578 // b/68669534
TEST(WordBreakerTest,spaceAfterSpace)579 TEST(WordBreakerTest, spaceAfterSpace) {
580 const std::vector<uint16_t> SPACES = {
581 '\t', // TAB
582 0x1680, // OGHAM SPACE MARK
583 0x3000, // IDEOGRAPHIC SPACE
584 };
585
586 constexpr uint16_t CHAR_SPACE = 0x0020;
587 auto lbStyle = LineBreakStyle::None;
588 auto lbWordStyle = LineBreakWordStyle::None;
589
590 for (uint16_t sp : SPACES) {
591 char msg[64] = {};
592 snprintf(msg, sizeof(msg), "Test Space: U+%04X", sp);
593 SCOPED_TRACE(msg);
594
595 std::vector<uint16_t> buf = {'a', CHAR_SPACE, sp, 'b'};
596 WordBreaker breaker;
597 breaker.setText(buf.data(), buf.size());
598
599 EXPECT_EQ(0, breaker.current());
600 EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
601 0)); // after "a "
602 EXPECT_EQ(0, breaker.wordStart());
603 EXPECT_EQ(1, breaker.wordEnd());
604
605 EXPECT_EQ(2, breaker.current());
606 EXPECT_EQ(3, breaker.next()); // after CHAR_SPACE character.
607 EXPECT_EQ(2, breaker.wordStart());
608 EXPECT_EQ(2, breaker.wordEnd());
609
610 EXPECT_EQ(3, breaker.current());
611 EXPECT_EQ(4, breaker.next()); // after sp character.
612 EXPECT_EQ(3, breaker.wordStart());
613 EXPECT_EQ(4, breaker.wordEnd());
614 }
615 }
616
617 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
618 public:
TestableICULineBreakerPoolImpl()619 TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
620
621 using ICULineBreakerPoolImpl::getPoolSize;
622 using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
623 };
624
// While nothing is released back to the pool, every acquire() must hand out a
// distinct breaker, and the locale id must depend only on the locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // Five slots held at the same time: two with identical parameters, the others
    // differing in line break style, locale or word style.
    ICULineBreakerPool::Slot enLooseA =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enLooseB =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enStrict =
            pool.acquire(enUS, LineBreakStyle::Strict, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frPlain =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frPhrase =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::Phrase);

    // Every slot carries a breaker.
    EXPECT_NE(nullptr, enLooseA.breaker.get());
    EXPECT_NE(nullptr, enLooseB.breaker.get());
    EXPECT_NE(nullptr, enStrict.breaker.get());
    EXPECT_NE(nullptr, frPlain.breaker.get());
    EXPECT_NE(nullptr, frPhrase.breaker.get());

    // The breaker instances are pairwise different (for the pairs checked here).
    EXPECT_NE(enLooseA.breaker.get(), enLooseB.breaker.get());
    EXPECT_NE(enLooseA.breaker.get(), enStrict.breaker.get());
    EXPECT_NE(enLooseA.breaker.get(), frPlain.breaker.get());
    EXPECT_NE(enLooseB.breaker.get(), frPlain.breaker.get());
    EXPECT_NE(enLooseB.breaker.get(), frPhrase.breaker.get());
    EXPECT_NE(enLooseB.breaker.get(), enStrict.breaker.get());

    // Same locale -> same locale id; different locale -> different locale id.
    EXPECT_EQ(enLooseA.localeId, enLooseB.localeId);
    EXPECT_EQ(enLooseA.localeId, enStrict.localeId);
    EXPECT_NE(enLooseA.localeId, frPlain.localeId);
    EXPECT_NE(enLooseA.localeId, frPhrase.localeId);
    EXPECT_NE(enLooseB.localeId, frPlain.localeId);
    EXPECT_NE(enLooseB.localeId, frPhrase.localeId);
    EXPECT_EQ(frPlain.localeId, frPhrase.localeId);
}
664
// A released breaker must be handed out again for an acquire() with the same
// parameters, and must not be handed out for a different locale or style.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // Acquire one breaker and remember its identity before giving it back.
    ICULineBreakerPool::Slot enLoose =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);

    uint64_t releasedLocaleId = enLoose.localeId;
    auto* releasedBreaker = enLoose.breaker.get();

    // release() takes the slot by move; afterwards the local slot no longer holds the breaker.
    pool.release(std::move(enLoose));
    EXPECT_EQ(nullptr, enLoose.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frLoose =
            pool.acquire(frFR, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_NE(releasedBreaker, frLoose.breaker.get());
    EXPECT_NE(releasedLocaleId, frLoose.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot enLooseAgain =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_EQ(releasedBreaker, enLooseAgain.breaker.get());
    EXPECT_EQ(releasedLocaleId, enLooseAgain.localeId);

    // acquire must return a different instance if the line break style or the
    // word style is different, while the locale id stays the same.
    ICULineBreakerPool::Slot frNormal =
            pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frNormalPhrase =
            pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::Phrase);
    EXPECT_NE(frLoose.breaker.get(), frNormal.breaker.get());
    EXPECT_NE(frLoose.breaker.get(), frNormalPhrase.breaker.get());
    EXPECT_NE(frNormal.breaker.get(), frNormalPhrase.breaker.get());
    EXPECT_EQ(frLoose.localeId, frNormal.localeId);
    EXPECT_EQ(frLoose.localeId, frNormalPhrase.localeId);
    EXPECT_EQ(frNormal.localeId, frNormalPhrase.localeId);
}
705
// Releasing more breakers than MAX_POOL_SIZE must not grow the pool beyond
// MAX_POOL_SIZE entries.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t MAX_POOL_SIZE = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");

    ICULineBreakerPool::Slot slots[MAX_POOL_SIZE * 2];

    // Hold twice as many breakers as the pool may keep. Nothing has been released
    // yet, so the pool itself stays empty throughout.
    for (size_t held = 0; held < MAX_POOL_SIZE * 2; ++held) {
        slots[held] = pool.acquire(enUS, LineBreakStyle::None, LineBreakWordStyle::None);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // The first MAX_POOL_SIZE releases are each retained by the pool.
    for (size_t idx = 0; idx < MAX_POOL_SIZE; ++idx) {
        pool.release(std::move(slots[idx]));
        EXPECT_EQ(idx + 1, pool.getPoolSize());
    }

    // Further releases leave the pool size capped at MAX_POOL_SIZE.
    for (size_t idx = MAX_POOL_SIZE; idx < MAX_POOL_SIZE * 2; ++idx) {
        pool.release(std::move(slots[idx]));
        EXPECT_EQ(MAX_POOL_SIZE, pool.getPoolSize());
    }
}
730
// With LineBreakStyle::NoBreak the URL is not split: the first break reported is
// the end of the text, and next() stays there.
TEST(WordBreakerTest, noBreak_urlNoHyphenBreak) {
    // "http://a-/b"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::NoBreak;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(11, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(11, wb.current());
    EXPECT_EQ(11, wb.next());
}
743
// With LineBreakStyle::NoBreak a URL ending in '/' is not split: the first break
// reported is the end of the text, and next() stays there.
TEST(WordBreakerTest, noBreak_urlEndsWithSlash) {
    // "http://a/"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::NoBreak;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(9, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(9, wb.next());
}
755
// With LineBreakStyle::NoBreak a sentence containing a URL is one segment whose
// word range covers the whole text.
TEST(WordBreakerTest, noBreak_setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale enUS("en-US");
    const auto style = LineBreakStyle::NoBreak;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // The first break is the end of the text.
    EXPECT_EQ(29, wb.followingWithLocale(enUS, style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());

    // Iterating further stays at the end.
    EXPECT_EQ(29, wb.current());
    EXPECT_EQ(29, wb.next());
}
770
771 } // namespace minikin
772