1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <algorithm>
18 #include <set>
19 #include <sstream>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <utility>
24 #include <vector>
25
26 #include "unicode/brkiter.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/numfmt.h"
29 #include "unicode/rbbi.h"
30 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
31 #include "unicode/regex.h"
32 #endif
33 #include "unicode/schriter.h"
34 #include "unicode/uchar.h"
35 #include "unicode/utf16.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uscript.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utrace.h"
42
43 #include "charstr.h"
44 #include "cmemory.h"
45 #include "cstr.h"
46 #include "cstring.h"
47 #include "intltest.h"
48 #include "lstmbe.h"
49 #include "rbbitst.h"
50 #include "rbbidata.h"
51 #include "utypeinfo.h" // for 'typeid' to work
52 #include "uvector.h"
53 #include "uvectr32.h"
54
55
56 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
57 #include "unicode/filteredbrk.h"
58 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
59
// TEST_ASSERT(x): when the condition x evaluates to false, report a test failure
// tagged with the current file and line. The macro contains no return, so
// execution continues after the report.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// TEST_ASSERT_SUCCESS(errcode): when errcode is an ICU failure code, report it
// (file, line and the error's name) through errcheckln(). Execution continues.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// MONKEY_ERROR(msg, ruleFileName, index, seed): report a monkey-test failure through
// the global test object, including the parameters needed to reproduce the run.
// Note: this expands to a plain brace block, not to UPRV_BLOCK_MACRO_BEGIN/END.
#define MONKEY_ERROR(msg, ruleFileName, index, seed) { \
    IntlTest::gTest->errln( \
        "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
        __FILE__, __LINE__, msg, index, ruleFileName, seed); \
}
76
77 //---------------------------------------------
78 // runIndexedTest
79 //---------------------------------------------
80
81
// Note: Before adding new tests to this file, check whether the desired test data can
// simply be added to the file testdata/rbbitest.txt. In most cases it can,
// it's much less work than writing a new test, diagnostic output in the event of failures
// is good, and the test data file is shared with ICU4J, so eventually the test
// will run there as well, without additional effort.
87
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)88 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
89 {
90 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
91 fTestParams = params;
92
93 TESTCASE_AUTO_BEGIN;
94 #if !UCONFIG_NO_FILE_IO
95 TESTCASE_AUTO(TestBug4153072);
96 #endif
97 #if !UCONFIG_NO_FILE_IO
98 TESTCASE_AUTO(TestUnicodeFiles);
99 #endif
100 TESTCASE_AUTO(TestGetAvailableLocales);
101 TESTCASE_AUTO(TestGetDisplayName);
102 #if !UCONFIG_NO_FILE_IO
103 TESTCASE_AUTO(TestEndBehaviour);
104 TESTCASE_AUTO(TestWordBreaks);
105 TESTCASE_AUTO(TestWordBoundary);
106 TESTCASE_AUTO(TestLineBreaks);
107 TESTCASE_AUTO(TestSentBreaks);
108 TESTCASE_AUTO(TestExtended);
109 #endif
110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
111 TESTCASE_AUTO(TestMonkey);
112 #endif
113 #if !UCONFIG_NO_FILE_IO
114 TESTCASE_AUTO(TestBug3818);
115 #endif
116 TESTCASE_AUTO(TestDebug);
117 #if !UCONFIG_NO_FILE_IO
118 TESTCASE_AUTO(TestBug5775);
119 #endif
120 TESTCASE_AUTO(TestBug9983);
121 TESTCASE_AUTO(TestDictRules);
122 TESTCASE_AUTO(TestBug5532);
123 TESTCASE_AUTO(TestBug7547);
124 TESTCASE_AUTO(TestBug12797);
125 TESTCASE_AUTO(TestBug12918);
126 TESTCASE_AUTO(TestBug12932);
127 TESTCASE_AUTO(TestEmoji);
128 TESTCASE_AUTO(TestBug12519);
129 TESTCASE_AUTO(TestBug12677);
130 TESTCASE_AUTO(TestTableRedundancies);
131 TESTCASE_AUTO(TestBug13447);
132 TESTCASE_AUTO(TestReverse);
133 TESTCASE_AUTO(TestBug13692);
134 TESTCASE_AUTO(TestDebugRules);
135 TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
136 TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
137 TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
138 TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
139 TESTCASE_AUTO(TestTable_8_16_Bits);
140 TESTCASE_AUTO(TestBug13590);
141 TESTCASE_AUTO(TestUnpairedSurrogate);
142 TESTCASE_AUTO(TestLSTMThai);
143 TESTCASE_AUTO(TestLSTMBurmese);
144 TESTCASE_AUTO(TestRandomAccess);
145 TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
146 TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
147 TESTCASE_AUTO(TestBug22579);
148 TESTCASE_AUTO(TestBug22581);
149 TESTCASE_AUTO(TestBug22584);
150 TESTCASE_AUTO(TestBug22585);
151 TESTCASE_AUTO(TestBug22602);
152 TESTCASE_AUTO(TestBug22636);
153
154 #if U_ENABLE_TRACING
155 TESTCASE_AUTO(TestTraceCreateCharacter);
156 TESTCASE_AUTO(TestTraceCreateWord);
157 TESTCASE_AUTO(TestTraceCreateSentence);
158 TESTCASE_AUTO(TestTraceCreateTitle);
159 TESTCASE_AUTO(TestTraceCreateLine);
160 TESTCASE_AUTO(TestTraceCreateLineNormal);
161 TESTCASE_AUTO(TestTraceCreateLineLoose);
162 TESTCASE_AUTO(TestTraceCreateLineStrict);
163 TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
164 TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
165 TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
166 TESTCASE_AUTO(TestTraceCreateLinePhrase);
167 TESTCASE_AUTO(TestTraceCreateBreakEngine);
168 #endif
169
170 TESTCASE_AUTO_END;
171 }
172
173
174 //--------------------------------------------------------------------------------------
175 //
176 // RBBITest constructor and destructor
177 //
178 //--------------------------------------------------------------------------------------
179
RBBITest()180 RBBITest::RBBITest() {
181 fTestParams = nullptr;
182 }
183
184
~RBBITest()185 RBBITest::~RBBITest() {
186 }
187
188
printStringBreaks(UText * tstr,int expected[],int expectedCount)189 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
190 UErrorCode status = U_ZERO_ERROR;
191 char name[100];
192 printf("code alpha extend alphanum type word sent line name\n");
193 int nextExpectedIndex = 0;
194 utext_setNativeIndex(tstr, 0);
195 for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
196 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
197 printf("------------------------------------------------ %d\n", j);
198 ++nextExpectedIndex;
199 }
200
201 UChar32 c = utext_next32(tstr);
202 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
203 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
204 u_isUAlphabetic(c),
205 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
206 u_isalnum(c),
207 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
208 u_charType(c),
209 U_SHORT_PROPERTY_NAME),
210 u_getPropertyValueName(UCHAR_WORD_BREAK,
211 u_getIntPropertyValue(c,
212 UCHAR_WORD_BREAK),
213 U_SHORT_PROPERTY_NAME),
214 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
215 u_getIntPropertyValue(c,
216 UCHAR_SENTENCE_BREAK),
217 U_SHORT_PROPERTY_NAME),
218 u_getPropertyValueName(UCHAR_LINE_BREAK,
219 u_getIntPropertyValue(c,
220 UCHAR_LINE_BREAK),
221 U_SHORT_PROPERTY_NAME),
222 name);
223 }
224 }
225
226
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)227 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
228 UErrorCode status = U_ZERO_ERROR;
229 UText *tstr = nullptr;
230 tstr = utext_openConstUnicodeString(nullptr, &ustr, &status);
231 if (U_FAILURE(status)) {
232 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
233 return;
234 }
235 printStringBreaks(tstr, expected, expectedCount);
236 utext_close(tstr);
237 }
238
239
TestBug3818()240 void RBBITest::TestBug3818() {
241 UErrorCode status = U_ZERO_ERROR;
242
243 // Four Thai words...
244 static const char16_t thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
245 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
246 UnicodeString thaiStr(thaiWordData);
247
248 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
249 if (U_FAILURE(status) || bi == nullptr) {
250 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
251 return;
252 }
253 bi->setText(thaiStr);
254
255 int32_t startOfSecondWord = bi->following(1);
256 if (startOfSecondWord != 4) {
257 errln("Fail at file %s, line %d expected start of word at 4, got %d",
258 __FILE__, __LINE__, startOfSecondWord);
259 }
260 startOfSecondWord = bi->following(0);
261 if (startOfSecondWord != 4) {
262 errln("Fail at file %s, line %d expected start of word at 4, got %d",
263 __FILE__, __LINE__, startOfSecondWord);
264 }
265 delete bi;
266 }
267
268
269 //---------------------------------------------
270 //
271 // other tests
272 //
273 //---------------------------------------------
274
TestGetAvailableLocales()275 void RBBITest::TestGetAvailableLocales()
276 {
277 int32_t locCount = 0;
278 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
279
280 if (locCount == 0)
281 dataerrln("getAvailableLocales() returned an empty list!");
282 // Just make sure that it's returning good memory.
283 int32_t i;
284 for (i = 0; i < locCount; ++i) {
285 logln(locList[i].getName());
286 }
287 }
288
289 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()290 void RBBITest::TestGetDisplayName()
291 {
292 UnicodeString result;
293
294 BreakIterator::getDisplayName(Locale::getUS(), result);
295 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
296 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
297 + result);
298
299 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
300 if (result != "French (France)")
301 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
302 + result);
303 }
304 /**
305 * Test End Behaviour
306 * @bug 4068137
307 */
TestEndBehaviour()308 void RBBITest::TestEndBehaviour()
309 {
310 UErrorCode status = U_ZERO_ERROR;
311 UnicodeString testString("boo.");
312 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
313 if (U_FAILURE(status))
314 {
315 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
316 return;
317 }
318 wb->setText(testString);
319
320 if (wb->first() != 0)
321 errln("Didn't get break at beginning of string.");
322 if (wb->next() != 3)
323 errln("Didn't get break before period in \"boo.\"");
324 if (wb->current() != 4 && wb->next() != 4)
325 errln("Didn't get break at end of string.");
326 delete wb;
327 }
328 /*
329 * @bug 4153072
330 */
TestBug4153072()331 void RBBITest::TestBug4153072() {
332 UErrorCode status = U_ZERO_ERROR;
333 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
334 if (U_FAILURE(status))
335 {
336 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
337 return;
338 }
339 UnicodeString str("...Hello, World!...");
340 int32_t begin = 3;
341 int32_t end = str.length() - 3;
342 UBool onBoundary;
343
344 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
345 iter->adoptText(textIterator);
346 int index;
347 // Note: with the switch to UText, there is no way to restrict the
348 // iteration range to begin at an index other than zero.
349 // String character iterators created with a non-zero bound are
350 // treated by RBBI as being empty.
351 for (index = -1; index < begin + 1; ++index) {
352 onBoundary = iter->isBoundary(index);
353 if (index == 0? !onBoundary : onBoundary) {
354 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
355 " and begin index = " + begin);
356 }
357 }
358 delete iter;
359 }
360
361
362 //
363 // Test for problem reported by Ashok Matoria on 9 July 2007
364 // One.<kSoftHyphen><kSpace>Two.
365 //
366 // Sentence break at start (0) and then on calling next() it breaks at
367 // 'T' of "Two". Now, at this point if I do next() and
368 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
369 //
TestBug5775()370 void RBBITest::TestBug5775() {
371 UErrorCode status = U_ZERO_ERROR;
372 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
373 TEST_ASSERT_SUCCESS(status);
374 if (U_FAILURE(status)) {
375 return;
376 }
377 // Check for status first for better handling of no data errors.
378 TEST_ASSERT(bi != nullptr);
379 if (bi == nullptr) {
380 return;
381 }
382
383 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
384 // 01234 56789
385 s = s.unescape();
386 bi->setText(s);
387 int pos = bi->next();
388 TEST_ASSERT(pos == 6);
389 pos = bi->next();
390 TEST_ASSERT(pos == 10);
391 pos = bi->previous();
392 TEST_ASSERT(pos == 6);
393 delete bi;
394 }
395
396
397
398 //------------------------------------------------------------------------------
399 //
400 // RBBITest::Extended Run RBBI Tests from an external test data file
401 //
402 //------------------------------------------------------------------------------
403
404 struct TestParams {
405 BreakIterator *bi; // Break iterator is set while parsing test source.
406 // Changed out whenever test data changes break type.
407
408 UnicodeString dataToBreak; // Data that is built up while parsing the test.
409 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
410 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
411 UVector32 *srcCol;
412
413 UText *textToBreak; // UText, could be UTF8 or UTF16.
414 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
415 CharString utf8String; // UTF-8 form of text to break.
416
TestParamsTestParams417 TestParams(UErrorCode &status) : dataToBreak() {
418 bi = nullptr;
419 expectedBreaks = new UVector32(status);
420 srcLine = new UVector32(status);
421 srcCol = new UVector32(status);
422 textToBreak = nullptr;
423 textMap = new UVector32(status);
424 }
425
~TestParamsTestParams426 ~TestParams() {
427 delete bi;
428 delete expectedBreaks;
429 delete srcLine;
430 delete srcCol;
431 utext_close(textToBreak);
432 delete textMap;
433 }
434
435 int32_t getSrcLine(int32_t bp);
436 int32_t getExpectedBreak(int32_t bp);
437 int32_t getSrcCol(int32_t bp);
438
439 void setUTF16(UErrorCode &status);
440 void setUTF8(UErrorCode &status);
441 };
442
443 // Append a UnicodeString to a CharString with UTF-8 encoding.
444 // Substitute any invalid chars.
445 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)446 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
447 if (U_FAILURE(status)) {
448 return;
449 }
450 int32_t utf8Length;
451 u_strToUTF8WithSub(nullptr, 0, &utf8Length, // Output Buffer, nullptr for preflight.
452 src.getBuffer(), src.length(), // UTF-16 data
453 0xfffd, nullptr, // Substitution char, number of subs.
454 &status);
455 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
456 return;
457 }
458 status = U_ZERO_ERROR;
459 int32_t capacity;
460 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
461 u_strToUTF8WithSub(buffer, utf8Length, nullptr,
462 src.getBuffer(), src.length(),
463 0xfffd, nullptr, &status);
464 dest.append(buffer, utf8Length, status);
465 }
466
467
setUTF16(UErrorCode & status)468 void TestParams::setUTF16(UErrorCode &status) {
469 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
470 textMap->removeAllElements();
471 for (int32_t i=0; i<dataToBreak.length(); i++) {
472 if (i == dataToBreak.getChar32Start(i)) {
473 textMap->addElement(i, status);
474 } else {
475 textMap->addElement(-1, status);
476 }
477 }
478 textMap->addElement(dataToBreak.length(), status);
479 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
480 }
481
482
setUTF8(UErrorCode & status)483 void TestParams::setUTF8(UErrorCode &status) {
484 if (U_FAILURE(status)) {
485 return;
486 }
487 utf8String.clear();
488 CharStringAppend(utf8String, dataToBreak, status);
489 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
490 if (U_FAILURE(status)) {
491 return;
492 }
493
494 textMap->removeAllElements();
495 int32_t utf16Index = 0;
496 for (;;) {
497 textMap->addElement(utf16Index, status);
498 UChar32 c32 = utext_current32(textToBreak);
499 if (c32 < 0) {
500 break;
501 }
502 utf16Index += U16_LENGTH(c32);
503 utext_next32(textToBreak);
504 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
505 textMap->addElement(-1, status);
506 }
507 }
508 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
509 }
510
511
getSrcLine(int32_t bp)512 int32_t TestParams::getSrcLine(int32_t bp) {
513 if (bp >= textMap->size()) {
514 bp = textMap->size() - 1;
515 }
516 int32_t i = 0;
517 for(; bp >= 0 ; --bp) {
518 // Move to a character boundary if we are not on one already.
519 i = textMap->elementAti(bp);
520 if (i >= 0) {
521 break;
522 }
523 }
524 return srcLine->elementAti(i);
525 }
526
527
getExpectedBreak(int32_t bp)528 int32_t TestParams::getExpectedBreak(int32_t bp) {
529 if (bp >= textMap->size()) {
530 return 0;
531 }
532 int32_t i = textMap->elementAti(bp);
533 int32_t retVal = 0;
534 if (i >= 0) {
535 retVal = expectedBreaks->elementAti(i);
536 }
537 return retVal;
538 }
539
540
getSrcCol(int32_t bp)541 int32_t TestParams::getSrcCol(int32_t bp) {
542 if (bp >= textMap->size()) {
543 bp = textMap->size() - 1;
544 }
545 int32_t i = 0;
546 for(; bp >= 0; --bp) {
547 // Move bp to a character boundary if we are not on one already.
548 i = textMap->elementAti(bp);
549 if (i >= 0) {
550 break;
551 }
552 }
553 return srcCol->elementAti(i);
554 }
555
556
executeTest(TestParams * t,UErrorCode & status)557 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
558 int32_t bp;
559 int32_t prevBP;
560 int32_t i;
561
562 TEST_ASSERT_SUCCESS(status);
563 if (U_FAILURE(status)) {
564 return;
565 }
566
567 if (t->bi == nullptr) {
568 return;
569 }
570
571 t->bi->setText(t->textToBreak, status);
572 //
573 // Run the iterator forward
574 //
575 prevBP = -1;
576 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
577 if (prevBP == bp) {
578 // Fail for lack of forward progress.
579 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
580 bp, t->getSrcLine(bp), t->getSrcCol(bp));
581 break;
582 }
583
584 // Check that there we didn't miss an expected break between the last one
585 // and this one.
586 for (i=prevBP+1; i<bp; i++) {
587 if (t->getExpectedBreak(i) != 0) {
588 int expected[] = {0, i};
589 printStringBreaks(t->dataToBreak, expected, 2);
590 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
591 i, t->getSrcLine(i), t->getSrcCol(i));
592 }
593 }
594
595 // Check that the break we did find was expected
596 if (t->getExpectedBreak(bp) == 0) {
597 int expected[] = {0, bp};
598 printStringBreaks(t->textToBreak, expected, 2);
599 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
600 bp, t->getSrcLine(bp), t->getSrcCol(bp));
601 } else {
602 // The break was expected.
603 // Check that the {nnn} tag value is correct.
604 int32_t expectedTagVal = t->getExpectedBreak(bp);
605 if (expectedTagVal == -1) {
606 expectedTagVal = 0;
607 }
608 int32_t line = t->getSrcLine(bp);
609 int32_t rs = t->bi->getRuleStatus();
610 if (rs != expectedTagVal) {
611 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
612 " Actual, Expected status = %4d, %4d",
613 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
614 }
615 }
616
617 prevBP = bp;
618 }
619
620 // Verify that there were no missed expected breaks after the last one found
621 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
622 if (t->getExpectedBreak(i) != 0) {
623 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
624 i, t->getSrcLine(i), t->getSrcCol(i));
625 }
626 }
627
628 //
629 // Run the iterator backwards, verify that the same breaks are found.
630 //
631 prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
632 bp = t->bi->last();
633 while (bp != BreakIterator::DONE) {
634 if (prevBP == bp) {
635 // Fail for lack of progress.
636 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
637 bp, t->getSrcLine(bp), t->getSrcCol(bp));
638 break;
639 }
640
641 // Check that we didn't miss an expected break between the last one
642 // and this one. (UVector returns zeros for index out of bounds.)
643 for (i=prevBP-1; i>bp; i--) {
644 if (t->getExpectedBreak(i) != 0) {
645 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
646 i, t->getSrcLine(i), t->getSrcCol(i));
647 }
648 }
649
650 // Check that the break we did find was expected
651 if (t->getExpectedBreak(bp) == 0) {
652 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
653 bp, t->getSrcLine(bp), t->getSrcCol(bp));
654 } else {
655 // The break was expected.
656 // Check that the {nnn} tag value is correct.
657 int32_t expectedTagVal = t->getExpectedBreak(bp);
658 if (expectedTagVal == -1) {
659 expectedTagVal = 0;
660 }
661 int line = t->getSrcLine(bp);
662 int32_t rs = t->bi->getRuleStatus();
663 if (rs != expectedTagVal) {
664 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
665 " Actual, Expected status = %4d, %4d",
666 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
667 }
668 }
669
670 prevBP = bp;
671 bp = t->bi->previous();
672 }
673
674 // Verify that there were no missed breaks prior to the last one found
675 for (i=prevBP-1; i>=0; i--) {
676 if (t->getExpectedBreak(i) != 0) {
677 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
678 i, t->getSrcLine(i), t->getSrcCol(i));
679 }
680 }
681
682 // Check isBoundary()
683 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
684 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
685 UBool boundaryFound = t->bi->isBoundary(i);
686 if (boundaryExpected != boundaryFound) {
687 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
688 " Expected, Actual= %s, %s",
689 i, t->getSrcLine(i), t->getSrcCol(i),
690 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
691 }
692 }
693
694 // Check following()
695 for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
696 int32_t actualBreak = t->bi->following(i);
697 int32_t expectedBreak = BreakIterator::DONE;
698 for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
699 if (t->getExpectedBreak(j) != 0) {
700 expectedBreak = j;
701 break;
702 }
703 }
704 if (expectedBreak != actualBreak) {
705 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
706 " Expected, Actual= %d, %d",
707 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
708 }
709 }
710
711 // Check preceding()
712 for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
713 int32_t actualBreak = t->bi->preceding(i);
714 int32_t expectedBreak = BreakIterator::DONE;
715
716 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
717 // preceding(trailing byte) will return the index of some preceding code point,
718 // not the lead byte of the current code point, even though that has a smaller index.
719 // Therefore, start looking at the expected break data not at i-1, but at
720 // the start of code point index - 1.
721 utext_setNativeIndex(t->textToBreak, i);
722 int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
723 for (; j >= 0; j--) {
724 if (t->getExpectedBreak(j) != 0) {
725 expectedBreak = j;
726 break;
727 }
728 }
729 if (expectedBreak != actualBreak) {
730 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
731 " Expected, Actual= %d, %d",
732 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
733 }
734 }
735 }
736
TestExtended()737 void RBBITest::TestExtended() {
738 // The expectations in this test heavily depends on the Thai dictionary.
739 // Therefore, we skip this test under the LSTM configuration.
740 if (skipDictionaryTest()) {
741 return;
742 }
743 // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
744 // data driven test closely entangles filtered and regular data.
745 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
746 UErrorCode status = U_ZERO_ERROR;
747 Locale locale("");
748
749 TestParams tp(status);
750
751 RegexMatcher localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
752 if (U_FAILURE(status)) {
753 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
754 }
755
756 //
757 // Open and read the test data file.
758 //
759 const char *testDataDirectory = IntlTest::getSourceTestData(status);
760 CharString testFileName(testDataDirectory, -1, status);
761 testFileName.append("rbbitst.txt", -1, status);
762
763 int len;
764 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
765 if (U_FAILURE(status)) {
766 errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
767 return;
768 }
769
770 bool skipTest = false; // Skip this test?
771
772 //
773 // Put the test data into a UnicodeString
774 //
775 UnicodeString testString(false, testFile, len);
776
777 enum EParseState{
778 PARSE_COMMENT,
779 PARSE_TAG,
780 PARSE_DATA,
781 PARSE_NUM,
782 PARSE_RULES
783 }
784 parseState = PARSE_TAG;
785
786 EParseState savedState = PARSE_TAG;
787
788 int32_t lineNum = 1;
789 int32_t colStart = 0;
790 int32_t column = 0;
791 int32_t charIdx = 0;
792
793 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
794
795 UnicodeString rules; // Holds rules from a <rules> ... </rules> block
796 int32_t rulesFirstLine = 0; // Line number of the start of current <rules> block
797
798 for (charIdx = 0; charIdx < len; ) {
799 status = U_ZERO_ERROR;
800 char16_t c = testString.charAt(charIdx);
801 charIdx++;
802 if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
803 // treat CRLF as a unit
804 c = u'\n';
805 charIdx++;
806 }
807 if (c == u'\n' || c == u'\r') {
808 lineNum++;
809 colStart = charIdx;
810 }
811 column = charIdx - colStart + 1;
812
813 switch (parseState) {
814 case PARSE_COMMENT:
815 if (c == u'\n' || c == u'\r') {
816 parseState = savedState;
817 }
818 break;
819
820 case PARSE_TAG:
821 {
822 if (c == u'#') {
823 parseState = PARSE_COMMENT;
824 savedState = PARSE_TAG;
825 break;
826 }
827 if (u_isUWhiteSpace(c)) {
828 break;
829 }
830 if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
831 delete tp.bi;
832 tp.bi = BreakIterator::createWordInstance(locale, status);
833 skipTest = false;
834 charIdx += 5;
835 break;
836 }
837 if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
838 delete tp.bi;
839 tp.bi = BreakIterator::createCharacterInstance(locale, status);
840 skipTest = false;
841 charIdx += 5;
842 break;
843 }
844 if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
845 delete tp.bi;
846 tp.bi = BreakIterator::createLineInstance(locale, status);
847 skipTest = false;
848 #if UCONFIG_USE_ML_PHRASE_BREAKING
849 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
850 // skip <line> test cases of JP's phrase breaking when ML is enabled.
851 skipTest = true;
852 }
853 #endif
854 charIdx += 5;
855 break;
856 }
857 if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
858 delete tp.bi;
859 tp.bi = BreakIterator::createLineInstance(locale, status);
860 skipTest = false;
861 #if !UCONFIG_USE_ML_PHRASE_BREAKING
862 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
863 // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
864 skipTest = true;
865 }
866 #endif
867 charIdx += 7;
868 break;
869 }
870 if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
871 delete tp.bi;
872 tp.bi = BreakIterator::createSentenceInstance(locale, status);
873 skipTest = false;
874 charIdx += 5;
875 break;
876 }
877 if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
878 delete tp.bi;
879 tp.bi = BreakIterator::createTitleInstance(locale, status);
880 charIdx += 6;
881 break;
882 }
883
884 if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
885 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
886 charIdx = testString.indexOf(u'>', charIdx) + 1;
887 parseState = PARSE_RULES;
888 rules.remove();
889 rulesFirstLine = lineNum;
890 break;
891 }
892
893 // <locale loc_name>
894 localeMatcher.reset(testString);
895 if (localeMatcher.lookingAt(charIdx-1, status)) {
896 UnicodeString localeName = localeMatcher.group(1, status);
897 char localeName8[100];
898 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), nullptr);
899 locale = Locale::createFromName(localeName8);
900 charIdx += localeMatcher.group(0, status).length() - 1;
901 TEST_ASSERT_SUCCESS(status);
902 break;
903 }
904 if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
905 parseState = PARSE_DATA;
906 charIdx += 5;
907 tp.dataToBreak = "";
908 tp.expectedBreaks->removeAllElements();
909 tp.srcCol ->removeAllElements();
910 tp.srcLine->removeAllElements();
911 break;
912 }
913
914 errln("line %d: Tag expected in test file.", lineNum);
915 parseState = PARSE_COMMENT;
916 savedState = PARSE_DATA;
917 goto end_test; // Stop the test.
918 }
919 break;
920
921 case PARSE_RULES:
922 if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
923 charIdx += 7;
924 parseState = PARSE_TAG;
925 delete tp.bi;
926 UParseError pe;
927 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
928 skipTest = U_FAILURE(status);
929 if (U_FAILURE(status)) {
930 errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
931 rulesFirstLine + pe.line - 1, u_errorName(status));
932 }
933 } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
934 charIdx += 10;
935 parseState = PARSE_TAG;
936 UErrorCode ec = U_ZERO_ERROR;
937 UParseError pe;
938 RuleBasedBreakIterator bi(rules, pe, ec);
939 if (U_SUCCESS(ec)) {
940 errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
941 rulesFirstLine + pe.line - 1);
942 }
943 } else {
944 rules.append(c);
945 }
946 break;
947
948 case PARSE_DATA:
949 if (c == u'•') {
950 int32_t breakIdx = tp.dataToBreak.length();
951 if (tp.expectedBreaks->size() > breakIdx) {
952 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
953 lineNum, column);
954 }
955 tp.expectedBreaks->setSize(breakIdx+1);
956 tp.expectedBreaks->setElementAt(-1, breakIdx);
957 tp.srcLine->setSize(breakIdx+1);
958 tp.srcLine->setElementAt(lineNum, breakIdx);
959 tp.srcCol ->setSize(breakIdx+1);
960 tp.srcCol ->setElementAt(column, breakIdx);
961 break;
962 }
963
964 if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
965 // Add final entry to mappings from break location to source file position.
966 // Need one extra because last break position returned is after the
967 // last char in the data, not at the last char.
968 tp.srcLine->addElement(lineNum, status);
969 tp.srcCol ->addElement(column, status);
970
971 parseState = PARSE_TAG;
972 charIdx += 6;
973
974 if (!skipTest) {
975 // RUN THE TEST!
976 status = U_ZERO_ERROR;
977 tp.setUTF16(status);
978 executeTest(&tp, status);
979 TEST_ASSERT_SUCCESS(status);
980
981 // Run again, this time with UTF-8 text wrapped in a UText.
982 status = U_ZERO_ERROR;
983 tp.setUTF8(status);
984 TEST_ASSERT_SUCCESS(status);
985 executeTest(&tp, status);
986 }
987 break;
988 }
989
990 if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
991 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
992 // Get the code point from the name and insert it into the test data.
993 // (Damn, no API takes names in Unicode !!!
994 // we've got to take it back to char *)
995 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
996 int32_t nameLength = nameEndIdx - (charIdx+2);
997 char charNameBuf[200];
998 UChar32 theChar = -1;
999 if (nameEndIdx != -1) {
1000 UErrorCode status = U_ZERO_ERROR;
1001 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1002 charNameBuf[sizeof(charNameBuf)-1] = 0;
1003 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1004 if (U_FAILURE(status)) {
1005 theChar = -1;
1006 }
1007 }
1008 if (theChar == -1) {
1009 errln("Error in named character in test file at line %d, col %d",
1010 lineNum, column);
1011 } else {
1012 // Named code point was recognized. Insert it
1013 // into the test data.
1014 tp.dataToBreak.append(theChar);
1015 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1016 tp.srcLine->addElement(lineNum, status);
1017 tp.srcCol ->addElement(column, status);
1018 }
1019 }
1020 if (nameEndIdx > charIdx) {
1021 charIdx = nameEndIdx+1;
1022
1023 }
1024 break;
1025 }
1026
1027
1028
1029 if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1030 charIdx++;
1031 int32_t breakIdx = tp.dataToBreak.length();
1032 tp.expectedBreaks->setSize(breakIdx+1);
1033 tp.expectedBreaks->setElementAt(-1, breakIdx);
1034 tp.srcLine->setSize(breakIdx+1);
1035 tp.srcLine->setElementAt(lineNum, breakIdx);
1036 tp.srcCol ->setSize(breakIdx+1);
1037 tp.srcCol ->setElementAt(column, breakIdx);
1038 break;
1039 }
1040
1041 if (c == u'<') {
1042 tagValue = 0;
1043 parseState = PARSE_NUM;
1044 break;
1045 }
1046
1047 if (c == u'#' && column==3) { // TODO: why is column off so far?
1048 parseState = PARSE_COMMENT;
1049 savedState = PARSE_DATA;
1050 break;
1051 }
1052
1053 if (c == u'\\') {
1054 // Check for \ at end of line, a line continuation.
1055 // Advance over (discard) the newline
1056 UChar32 cp = testString.char32At(charIdx);
1057 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1058 // We have a CR LF
1059 // Need an extra increment of the input ptr to move over both of them
1060 charIdx++;
1061 }
1062 if (cp == u'\n' || cp == u'\r') {
1063 lineNum++;
1064 colStart = charIdx;
1065 charIdx++;
1066 break;
1067 }
1068
1069 // Let unescape handle the back slash.
1070 cp = testString.unescapeAt(charIdx);
1071 if (cp != -1) {
1072 // Escape sequence was recognized. Insert the char
1073 // into the test data.
1074 tp.dataToBreak.append(cp);
1075 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1076 tp.srcLine->addElement(lineNum, status);
1077 tp.srcCol ->addElement(column, status);
1078 }
1079 break;
1080 }
1081
1082
1083 // Not a recognized backslash escape sequence.
1084 // Take the next char as a literal.
1085 // TODO: Should this be an error?
1086 c = testString.charAt(charIdx);
1087 charIdx = testString.moveIndex32(charIdx, 1);
1088 }
1089
1090 // Normal, non-escaped data char.
1091 tp.dataToBreak.append(c);
1092
1093 // Save the mapping from offset in the data to line/column numbers in
1094 // the original input file. Will be used for better error messages only.
1095 // If there's an expected break before this char, the slot in the mapping
1096 // vector will already be set for this char; don't overwrite it.
1097 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1098 tp.srcLine->addElement(lineNum, status);
1099 tp.srcCol ->addElement(column, status);
1100 }
1101 break;
1102
1103
1104 case PARSE_NUM:
1105 // We are parsing an expected numeric tag value, like <1234>,
1106 // within a chunk of data.
1107 if (u_isUWhiteSpace(c)) {
1108 break;
1109 }
1110
1111 if (c == u'>') {
1112 // Finished the number. Add the info to the expected break data,
1113 // and switch parse state back to doing plain data.
1114 parseState = PARSE_DATA;
1115 if (tagValue == 0) {
1116 tagValue = -1;
1117 }
1118 int32_t breakIdx = tp.dataToBreak.length();
1119 if (tp.expectedBreaks->size() > breakIdx) {
1120 errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1121 lineNum, column);
1122 }
1123 tp.expectedBreaks->setSize(breakIdx+1);
1124 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1125 tp.srcLine->setSize(breakIdx+1);
1126 tp.srcLine->setElementAt(lineNum, breakIdx);
1127 tp.srcCol ->setSize(breakIdx+1);
1128 tp.srcCol ->setElementAt(column, breakIdx);
1129 break;
1130 }
1131
1132 if (u_isdigit(c)) {
1133 tagValue = tagValue*10 + u_charDigitValue(c);
1134 break;
1135 }
1136
1137 errln("Syntax Error in test file at line %d, col %d",
1138 lineNum, column);
1139 parseState = PARSE_COMMENT;
1140 goto end_test; // Stop the test
1141 break;
1142 }
1143
1144
1145 if (U_FAILURE(status)) {
1146 dataerrln("ICU Error %s while parsing test file at line %d.",
1147 u_errorName(status), lineNum);
1148 status = U_ZERO_ERROR;
1149 goto end_test; // Stop the test
1150 }
1151
1152 }
1153
1154 // Reached end of test file. Raise an error if parseState indicates that we are
1155 // within a block that should have been terminated.
1156
1157 if (parseState == PARSE_RULES) {
1158 errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1159 lineNum, rulesFirstLine);
1160 }
1161 if (parseState == PARSE_DATA) {
1162 errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1163 }
1164
1165
1166 end_test:
1167 delete [] testFile;
1168 #endif
1169 }
1170
1171 //-------------------------------------------------------------------------------
1172 //
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.   Regression test for bug #7130.  The source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  and the dictionary code, lacking a type, would loop.
1177 //
1178 //-------------------------------------------------------------------------------
TestDictRules()1179 void RBBITest::TestDictRules() {
1180 const char *rules = "$dictionary = [a-z]; \n"
1181 "!!forward; \n"
1182 "$dictionary $dictionary; \n"
1183 "!!reverse; \n"
1184 "$dictionary $dictionary; \n";
1185 const char *text = "aa";
1186 UErrorCode status = U_ZERO_ERROR;
1187 UParseError parseError;
1188
1189 RuleBasedBreakIterator bi(rules, parseError, status);
1190 if (U_SUCCESS(status)) {
1191 UnicodeString utext = text;
1192 bi.setText(utext);
1193 int32_t position;
1194 int32_t loops;
1195 for (loops = 0; loops<10; loops++) {
1196 position = bi.next();
1197 if (position == RuleBasedBreakIterator::DONE) {
1198 break;
1199 }
1200 }
1201 TEST_ASSERT(loops == 1);
1202 } else {
1203 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1204 }
1205 }
1206
1207
1208
1209 //--------------------------------------------------------------------------------------------
1210 //
1211 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1212 //
1213 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1214 void RBBITest::TestUnicodeFiles() {
1215 RuleBasedBreakIterator *bi;
1216 UErrorCode status = U_ZERO_ERROR;
1217
1218 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createCharacterInstance(Locale::getEnglish(), status));
1219 TEST_ASSERT_SUCCESS(status);
1220 if (U_SUCCESS(status)) {
1221 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1222 }
1223 delete bi;
1224
1225 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status));
1226 TEST_ASSERT_SUCCESS(status);
1227 if (U_SUCCESS(status)) {
1228 runUnicodeTestData("WordBreakTest.txt", bi);
1229 }
1230 delete bi;
1231
1232 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1233 TEST_ASSERT_SUCCESS(status);
1234 if (U_SUCCESS(status)) {
1235 runUnicodeTestData("SentenceBreakTest.txt", bi);
1236 }
1237 delete bi;
1238
1239 bi = dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1240 TEST_ASSERT_SUCCESS(status);
1241 if (U_SUCCESS(status)) {
1242 runUnicodeTestData("LineBreakTest.txt", bi);
1243 }
1244 delete bi;
1245 }
1246
1247
1248 // Check for test cases from the Unicode test data files that are known to fail
1249 // and should be skipped as known issues because ICU does not fully implement
1250 // the Unicode specifications, or because ICU includes tailorings that differ from
1251 // the Unicode standard.
1252 //
1253 // Test cases are identified by the test data sequence, which tends to be more stable
1254 // across Unicode versions than the test file line numbers.
1255 //
1256 // The test case with ticket "10666" is a dummy, included as an example.
1257
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1258 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1259 static struct TestCase {
1260 const char *fTicketNum;
1261 const char *fFileName;
1262 const char16_t *fString;
1263 } badTestCases[] = {
1264 {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
1265 // The following tests were originally for
1266 // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1267 // However, that ticket has been closed as fixed but these tests still fail, so
1268 // ICU-21097 has been created to investigate and address these remaining issues.
1269 {"21097", "LineBreakTest.txt", u"-#"},
1270 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1271 {"21097", "LineBreakTest.txt", u"\u002d\u00a7"},
1272 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1273 {"21097", "LineBreakTest.txt", u"\u002d\U00050005"},
1274 {"21097", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1275 {"21097", "LineBreakTest.txt", u"\u002d\u0e01"},
1276 {"21097", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1277
1278 // The following tests were originally for
1279 // Issue ICU-12017 Improve line break around numbers.
1280 // However, that ticket has been closed as fixed but these tests still fail, so
1281 // ICU-21097 has been created to investigate and address these remaining issues.
1282 {"21097", "LineBreakTest.txt", u"\u002C\u0030"}, // ",0"
1283 {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1284 {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1285 {"21097", "LineBreakTest.txt", u"a.2 "},
1286 {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1287 {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1288 {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1289 {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1290 {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1291 {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1292 {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1293 {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1294 {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1295 {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1296
1297 // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1298 // need to skip some tests in WordBreakTest.txt
1299 {"22127", "WordBreakTest.txt", u"a:"},
1300 {"22127", "WordBreakTest.txt", u"A:"},
1301 };
1302
1303 for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1304 const TestCase &badCase = badTestCases[n];
1305 if (!strcmp(fileName, badCase.fFileName) &&
1306 testCase.startsWith(UnicodeString(badCase.fString))) {
1307 return logKnownIssue(badCase.fTicketNum);
1308 }
1309 }
1310 return false;
1311 }
1312
1313
1314 //--------------------------------------------------------------------------------------------
1315 //
1316 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1317 //
1318 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1319 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1320 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1321 UErrorCode status = U_ZERO_ERROR;
1322
1323 //
1324 // Open and read the test data file, put it into a UnicodeString.
1325 //
1326 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1327 char testFileName[1000];
1328 if (testDataDirectory == nullptr || strlen(testDataDirectory) >= sizeof(testFileName)) {
1329 dataerrln("Can't open test data. Path too long.");
1330 return;
1331 }
1332 strcpy(testFileName, testDataDirectory);
1333 strcat(testFileName, fileName);
1334
1335 logln("Opening data file %s\n", fileName);
1336
1337 int len;
1338 char16_t *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1339 if (status != U_FILE_ACCESS_ERROR) {
1340 TEST_ASSERT_SUCCESS(status);
1341 TEST_ASSERT(testFile != nullptr);
1342 }
1343 if (U_FAILURE(status) || testFile == nullptr) {
1344 return; /* something went wrong, error already output */
1345 }
1346 UnicodeString testFileAsString(true, testFile, len);
1347
1348 //
1349 // Parse the test data file using a regular expression.
1350 // Each kind of token is recognized in its own capture group; what type of item was scanned
1351 // is identified by which group had a match.
1352 //
1353 // Capture Group # 1 2 3 4 5
1354 // Parses this item: divide x hex digits comment \n unrecognized \n
1355 //
1356 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1357 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1358 UnicodeString testString;
1359 UVector32 breakPositions(status);
1360 int lineNumber = 1;
1361 TEST_ASSERT_SUCCESS(status);
1362 if (U_FAILURE(status)) {
1363 return;
1364 }
1365
1366 //
1367 // Scan through each test case, building up the string to be broken in testString,
1368 // and the positions that should be boundaries in the breakPositions vector.
1369 //
1370 int spin = 0;
1371 while (tokenMatcher.find()) {
1372 if(tokenMatcher.hitEnd()) {
1373 /* Shouldn't Happen(TM). This means we didn't find the symbols we were looking for.
1374 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1375 and caused an infinite loop here on EBCDIC systems!
1376 */
1377 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1378 // return;
1379 }
1380 if (tokenMatcher.start(1, status) >= 0) {
1381 // Scanned a divide sign, indicating a break position in the test data.
1382 if (testString.length()>0) {
1383 breakPositions.addElement(testString.length(), status);
1384 }
1385 }
1386 else if (tokenMatcher.start(2, status) >= 0) {
1387 // Scanned an 'x', meaning no break at this position in the test data
1388 // Nothing to be done here.
1389 }
1390 else if (tokenMatcher.start(3, status) >= 0) {
1391 // Scanned Hex digits. Convert them to binary, append to the character data string.
1392 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1393 int length = hexNumber.length();
1394 if (length<=8) {
1395 char buf[10];
1396 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1397 UChar32 c = (UChar32)strtol(buf, nullptr, 16);
1398 if (c<=0x10ffff) {
1399 testString.append(c);
1400 } else {
1401 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1402 fileName, lineNumber);
1403 }
1404 } else {
1405 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1406 fileName, lineNumber);
1407 }
1408 }
1409 else if (tokenMatcher.start(4, status) >= 0) {
1410 // Scanned to end of a line, possibly skipping over a comment in the process.
1411 // If the line from the file contained test data, run the test now.
1412 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1413 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1414 }
1415
1416 // Clear out this test case.
1417 // The string and breakPositions vector will be refilled as the next
1418 // test case is parsed.
1419 testString.remove();
1420 breakPositions.removeAllElements();
1421 lineNumber++;
1422 } else {
1423 // Scanner catchall. Something unrecognized appeared on the line.
1424 char token[16];
1425 UnicodeString uToken = tokenMatcher.group(0, status);
1426 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1427 token[sizeof(token)-1] = 0;
1428 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1429
1430 // Clean up, in preparation for continuing with the next line.
1431 testString.remove();
1432 breakPositions.removeAllElements();
1433 lineNumber++;
1434 }
1435 TEST_ASSERT_SUCCESS(status);
1436 if (U_FAILURE(status)) {
1437 break;
1438 }
1439 }
1440
1441 delete [] testFile;
1442 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1443 }
1444
1445 //--------------------------------------------------------------------------------------------
1446 //
1447 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1448 // test data files. Do only a simple, forward-only check -
1449 // this test is mostly to check that ICU and the Unicode
1450 // data agree with each other.
1451 //
1452 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1453 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1454 const UnicodeString &testString, // Text data to be broken
1455 UVector32 *breakPositions, // Positions where breaks should be found.
1456 RuleBasedBreakIterator *bi) {
1457 int32_t pos; // Break Position in the test string
1458 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1459 int32_t expectedPos; // Expected break position (index into test string)
1460
1461 bi->setText(testString);
1462 pos = bi->first();
1463 pos = bi->next();
1464
1465 bool error = false;
1466 std::set<int32_t> actualBreaks;
1467 std::set<int32_t> expectedBreaks;
1468 while (pos != BreakIterator::DONE) {
1469 actualBreaks.insert(pos);
1470 if (expectedI >= breakPositions->size()) {
1471 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1472 testFileName, lineNumber, pos);
1473 error = true;
1474 break;
1475 }
1476 expectedPos = breakPositions->elementAti(expectedI);
1477 expectedBreaks.insert(expectedPos);
1478 if (pos < expectedPos) {
1479 errln("Test file \"%s\", line %d, unexpected break found at position %d", testFileName,
1480 lineNumber, pos);
1481 error = true;
1482 break;
1483 }
1484 if (pos > expectedPos) {
1485 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1486 testFileName, lineNumber, expectedPos);
1487 error = true;
1488 break;
1489 }
1490 pos = bi->next();
1491 expectedI++;
1492 }
1493
1494 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1495 errln("Test file \"%s\", line %d, failed to find expected break at position %d", testFileName,
1496 lineNumber, breakPositions->elementAti(expectedI));
1497 error = true;
1498 }
1499
1500 if (error) {
1501 for (; pos != BreakIterator::DONE; pos = bi->next()) {
1502 actualBreaks.insert(pos);
1503 }
1504 for (; expectedI < breakPositions->size(); ++expectedI) {
1505 expectedBreaks.insert(breakPositions->elementAti(expectedI));
1506 }
1507 UnicodeString expected;
1508 UnicodeString actual;
1509 for (int32_t i = 0; i < testString.length();) {
1510 const UChar32 c = testString.char32At(i);
1511 i += U16_LENGTH(c);
1512 expected += expectedBreaks.count(i) == 1 ? u"÷" : u"×";
1513 actual += actualBreaks.count(i) == 1 ? u"÷" : u"×";
1514 expected += c;
1515 actual += c;
1516 }
1517 expected += expectedBreaks.count(testString.length()) == 1 ? u"÷" : u"×";
1518 actual += actualBreaks.count(testString.length()) == 1 ? u"÷" : u"×";
1519 errln("Expected : " + expected);
1520 errln("Actual : " + actual);
1521 }
1522 }
1523
1524
1525
1526 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1527 //---------------------------------------------------------------------------------------
1528 //
1529 // class RBBIMonkeyKind
1530 //
1531 // Monkey Test for Break Iteration
1532 // Abstract interface class. Concrete derived classes independently
1533 // implement the break rules for different iterator types.
1534 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1537 //
1538 //---------------------------------------------------------------------------------------
1539 class RBBIMonkeyKind {
1540 public:
1541 // Return a UVector of UnicodeSets, representing the character classes used
1542 // for this type of iterator.
1543 virtual UVector *charClasses() = 0;
1544
1545 // Set the test text on which subsequent calls to next() will operate
1546 virtual void setText(const UnicodeString &s) = 0;
1547
1548 // Find the next break position, starting from the prev break position, or from zero.
1549 // Return -1 after reaching end of string.
1550 virtual int32_t next(int32_t i) = 0;
1551
1552 // Name of each character class, parallel with charClasses. Used for debugging output
1553 // of characters.
1554 virtual std::vector<std::string>& characterClassNames();
1555
1556 void setAppliedRule(int32_t position, const char* value);
1557
1558 std::string getAppliedRule(int32_t position);
1559
1560 virtual ~RBBIMonkeyKind();
1561 UErrorCode deferredStatus;
1562
1563 std::string classNameFromCodepoint(const UChar32 c);
1564 unsigned int maxClassNameSize();
1565
1566 protected:
1567 RBBIMonkeyKind();
1568 std::vector<std::string> classNames;
1569 std::vector<std::string> appliedRules;
1570
1571 // Clear `appliedRules` and fill it with empty strings in the size of test text.
1572 void prepareAppliedRules(int32_t size );
1573
1574 private:
1575
1576 };
1577
RBBIMonkeyKind()1578 RBBIMonkeyKind::RBBIMonkeyKind() {
1579 deferredStatus = U_ZERO_ERROR;
1580 }
1581
~RBBIMonkeyKind()1582 RBBIMonkeyKind::~RBBIMonkeyKind() {
1583 }
1584
characterClassNames()1585 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1586 return classNames;
1587 }
1588
prepareAppliedRules(int32_t size)1589 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1590 // Remove all the information in the `appliedRules`.
1591 appliedRules.clear();
1592 appliedRules.resize(size + 1);
1593 }
1594
setAppliedRule(int32_t position,const char * value)1595 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1596 appliedRules[position] = value;
1597 }
1598
getAppliedRule(int32_t position)1599 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1600 return appliedRules[position];
1601 }
1602
classNameFromCodepoint(const UChar32 c)1603 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1604 // Simply iterate through charClasses to find character's class
1605 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1606 UnicodeSet *classSet = static_cast<UnicodeSet *>(charClasses()->elementAt(aClassNum));
1607 if (classSet->contains(c)) {
1608 return classNames[aClassNum];
1609 }
1610 }
1611 U_ASSERT(false); // This should not happen.
1612 return "bad class name";
1613 }
1614
maxClassNameSize()1615 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1616 unsigned int maxSize = 0;
1617 for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1618 auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1619 if (aClassNumSize > maxSize) {
1620 maxSize = aClassNumSize;
1621 }
1622 }
1623 return maxSize;
1624 }
1625
1626 //----------------------------------------------------------------------------------------
1627 //
1628 // Random Numbers. Similar to standard lib rand() and srand()
1629 // Not using library to
1630 // 1. Get same results on all platforms.
1631 // 2. Get access to current seed, to more easily reproduce failures.
1632 //
1633 //---------------------------------------------------------------------------------------
// Current generator state; each call to m_rand() replaces it.
static uint32_t m_seed = 1;

// Advances the seed by one linear-congruential step (modulo 2^32 through
// unsigned wrap-around) and returns bits 16..30 of the new seed, i.e. a
// value in the range 0..32767.
static uint32_t m_rand()
{
    constexpr uint32_t kMultiplier = 1103515245;
    constexpr uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7FFF;
}
1641
1642
1643 //------------------------------------------------------------------------------------------
1644 //
1645 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1646 // of RBBIMonkeyKind.
1647 //
1648 //------------------------------------------------------------------------------------------
1649 class RBBICharMonkey: public RBBIMonkeyKind {
1650 public:
1651 RBBICharMonkey();
1652 virtual ~RBBICharMonkey();
1653 virtual UVector *charClasses() override;
1654 virtual void setText(const UnicodeString &s) override;
1655 virtual int32_t next(int32_t i) override;
1656 private:
1657 UVector *fSets;
1658
1659 UnicodeSet *fCRLFSet;
1660 UnicodeSet *fControlSet;
1661 UnicodeSet *fExtendSet;
1662 UnicodeSet *fZWJSet;
1663 UnicodeSet *fRegionalIndicatorSet;
1664 UnicodeSet *fPrependSet;
1665 UnicodeSet *fSpacingSet;
1666 UnicodeSet *fLSet;
1667 UnicodeSet *fVSet;
1668 UnicodeSet *fTSet;
1669 UnicodeSet *fLVSet;
1670 UnicodeSet *fLVTSet;
1671 UnicodeSet *fHangulSet;
1672 UnicodeSet *fExtendedPictSet;
1673 UnicodeSet *fViramaSet;
1674 UnicodeSet *fLinkingConsonantSet;
1675 UnicodeSet *fExtCccZwjSet;
1676 UnicodeSet *fAnySet;
1677
1678 const UnicodeString *fText;
1679 };
1680
1681
RBBICharMonkey()1682 RBBICharMonkey::RBBICharMonkey() {
1683 UErrorCode status = U_ZERO_ERROR;
1684
1685 fText = nullptr;
1686
1687 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1688 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1689 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1690 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1691 fRegionalIndicatorSet =
1692 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1693 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1694 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1695 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1696 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1697 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1698 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1699 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1700 fHangulSet = new UnicodeSet();
1701 fHangulSet->addAll(*fLSet);
1702 fHangulSet->addAll(*fVSet);
1703 fHangulSet->addAll(*fTSet);
1704 fHangulSet->addAll(*fLVSet);
1705 fHangulSet->addAll(*fLVTSet);
1706
1707 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1708 fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1709 "\\p{Indic_Syllabic_Category=Virama}]", status);
1710 fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1711 "\\p{Indic_Syllabic_Category=Consonant}]", status);
1712 fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1713 fAnySet = new UnicodeSet(0, 0x10ffff);
1714
1715 // Create sets of characters, and add the names of the above character sets.
1716 // In each new ICU release, add new names corresponding to the sets above.
1717 fSets = new UVector(status);
1718
1719 // Important: Keep class names the same as the class contents.
1720 fSets->addElement(fCRLFSet, status); classNames.emplace_back("CRLF");
1721 fSets->addElement(fControlSet, status); classNames.emplace_back("Control");
1722 fSets->addElement(fExtendSet, status); classNames.emplace_back("Extended");
1723 fSets->addElement(fRegionalIndicatorSet, status); classNames.emplace_back("RegionalIndicator");
1724 if (!fPrependSet->isEmpty()) {
1725 fSets->addElement(fPrependSet, status); classNames.emplace_back("Prepend");
1726 }
1727 fSets->addElement(fSpacingSet, status); classNames.emplace_back("Spacing");
1728 fSets->addElement(fHangulSet, status); classNames.emplace_back("Hangul");
1729 fSets->addElement(fZWJSet, status); classNames.emplace_back("ZWJ");
1730 fSets->addElement(fExtendedPictSet, status); classNames.emplace_back("ExtendedPict");
1731 fSets->addElement(fViramaSet, status); classNames.emplace_back("Virama");
1732 fSets->addElement(fLinkingConsonantSet, status); classNames.emplace_back("LinkingConsonant");
1733 fSets->addElement(fExtCccZwjSet, status); classNames.emplace_back("ExtCcccZwj");
1734 fSets->addElement(fAnySet, status); classNames.emplace_back("Any");
1735
1736 if (U_FAILURE(status)) {
1737 deferredStatus = status;
1738 }
1739 }
1740
1741
setText(const UnicodeString & s)1742 void RBBICharMonkey::setText(const UnicodeString &s) {
1743 fText = &s;
1744 prepareAppliedRules(s.length());
1745 }
1746
1747
1748
// next()   Reference implementation of grapheme-cluster boundary detection.
//
//          Starting from a previous boundary, walks forward one code point at a time
//          and tests the candidate position p2 against each rule in order.  A rule that
//          forbids a break does "continue" (try the next position); a rule that requires
//          a break does "break" (p2 is the answer).  The name of the deciding rule is
//          recorded for every tested position via setAppliedRule().
//
//   @param prevPos  index of the previous boundary in the text given to setText().
//   @return         index of the next boundary, or -1 if construction failed
//                   (deferredStatus) or prevPos is already at/after the end of the text.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }

    // Seed the sliding window: everything starts at prevPos, and only c3 holds a real
    // character.  The first pass through the loop shifts it into c2.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        if (p2 == fText->length()) {
            setAppliedRule(p2, "End of String");
            break;
        }

        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            setAppliedRule(p2, "GB3 CR x LF");
            continue;
        }

        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            setAppliedRule(p2, "GB4 ( Control | CR | LF ) <break>");
            break;
        }

        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            setAppliedRule(p2, "GB5 <break> ( Control | CR | LF )");
            break;
        }

        // GB6 - GB8: keep Hangul syllable sequences together.
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            setAppliedRule(p2, "GB6 L x ( L | V | LV | LVT )");
            continue;
        }

        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            setAppliedRule(p2, "GB7 ( LV | V ) x ( V | T )");
            continue;
        }

        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            setAppliedRule(p2, "GB8 ( LVT | T) x T");
            continue;
        }

        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            // Remember the character that starts this run of Extend characters;
            // the GB11 test below needs it.  cBase is only updated when c1 is not
            // itself an Extend character.
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            setAppliedRule(p2, "GB9 x (Extend | ZWJ)");
            continue;
        }

        if (fSpacingSet->contains(c2)) {
            setAppliedRule(p2, "GB9a x SpacingMark");
            continue;
        }

        if (fPrependSet->contains(c1)) {
            setAppliedRule(p2, "GB9b Prepend x");
            continue;
        }

        // Note: Viramas are also included in the ExtCccZwj class.
        if (fLinkingConsonantSet->contains(c2)) {
            // Scan backwards from p1 over ExtCccZwj characters, noting whether any of
            // them is a Virama.  The scan stops at index 0 or at the first character
            // that is not in ExtCccZwj; that character must be a LinkingConsonant.
            int pi = p1;
            bool sawVirama = false;
            while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
                if (fViramaSet->contains(fText->char32At(pi))) {
                    sawVirama = true;
                }
                pi = fText->moveIndex32(pi, -1);
            }
            if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
                setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
                continue;
            }
        }

        if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
            setAppliedRule(p2, "GB11 Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
            continue;
        }

        // Note: The first if condition is a little tricky. We only need to force
        //      a break if there are three or more contiguous RIs. If there are
        //      only two, a break following will occur via other rules, and will include
        //      any trailing extend characters, which is needed behavior.
        // NOTE(review): the label recorded on this break path reads "... x ...", the
        //      same text as the no-break case just below.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            setAppliedRule(p2, "GB12-13 Regional_Indicator x Regional_Indicator");
            continue;
        }

        // No rule kept the characters together: break everywhere else.
        setAppliedRule(p2, "GB999 Any <break> Any");
        break;
    }

    breakPos = p2;
    return breakPos;
}
1898
1899
1900
charClasses()1901 UVector *RBBICharMonkey::charClasses() {
1902 return fSets;
1903 }
1904
~RBBICharMonkey()1905 RBBICharMonkey::~RBBICharMonkey() {
1906 delete fSets;
1907 delete fCRLFSet;
1908 delete fControlSet;
1909 delete fExtendSet;
1910 delete fRegionalIndicatorSet;
1911 delete fPrependSet;
1912 delete fSpacingSet;
1913 delete fLSet;
1914 delete fVSet;
1915 delete fTSet;
1916 delete fLVSet;
1917 delete fLVTSet;
1918 delete fHangulSet;
1919 delete fAnySet;
1920 delete fZWJSet;
1921 delete fExtendedPictSet;
1922 delete fViramaSet;
1923 delete fLinkingConsonantSet;
1924 delete fExtCccZwjSet;
1925 }
1926
1927 //------------------------------------------------------------------------------------------
1928 //
1929 // class RBBIWordMonkey Word Break specific implementation
1930 // of RBBIMonkeyKind.
1931 //
1932 //------------------------------------------------------------------------------------------
// Reference ("monkey") implementation of word boundary detection, used to
// cross-check the rule based break iterator on random text.
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    // Returns the vector of character classes filled in by the constructor.
    virtual  UVector *charClasses() override;
    // Stores a pointer to s (not a copy) as the text to be scanned.
    virtual  void     setText(const UnicodeString &s) override;
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i) override;
private:
    // The character classes reported by charClasses().
    UVector      *fSets;

    // One set per character class; all are created in the constructor and
    // deleted individually in the destructor.
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fHebrew_LetterSet;
    UnicodeSet  *fALetterSet;               // ALetter minus the dictionary characters.
    UnicodeSet  *fSingle_QuoteSet;
    UnicodeSet  *fDouble_QuoteSet;
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    // Built late in the constructor, after a point where the constructor may return
    // early; initialized to nullptr so that the destructor's delete is always safe.
    UnicodeSet  *fOtherSet = nullptr;
    UnicodeSet  *fExtendSet;                // Word_Break = Extend, minus Hani script.
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fWSegSpaceSet;
    // Characters left to the dictionary based breaker; kept out of the test data.
    // nullptr-initialized for the same early-return reason as fOtherSet.
    UnicodeSet  *fDictionarySet = nullptr;
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fExtendedPictSet;

    // The text being scanned.  Set by setText(); not owned by this object.
    const UnicodeString  *fText;
};
1967
1968
RBBIWordMonkey()1969 RBBIWordMonkey::RBBIWordMonkey()
1970 {
1971 UErrorCode status = U_ZERO_ERROR;
1972
1973 fSets = new UVector(status);
1974
1975 fCRSet = new UnicodeSet(u"[\\p{Word_Break = CR}]", status);
1976 fLFSet = new UnicodeSet(u"[\\p{Word_Break = LF}]", status);
1977 fNewlineSet = new UnicodeSet(u"[\\p{Word_Break = Newline}]", status);
1978 fKatakanaSet = new UnicodeSet(u"[\\p{Word_Break = Katakana}]", status);
1979 fRegionalIndicatorSet = new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1980 fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1981 fALetterSet = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1982 fSingle_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]", status);
1983 fDouble_QuoteSet = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]", status);
1984 fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
1985 fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]", status);
1986 fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
1987 fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1988 fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
1989 fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1990 // There are some sc=Hani characters with WB=Extend.
1991 // The break rules need to pick one or the other because
1992 // Extend overlapping with something else is messy.
1993 // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1994 // in $Han (for $dictionary) and out of $Extend.
1995 fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1996 fWSegSpaceSet = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]", status);
1997
1998 fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
1999 fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2000 if(U_FAILURE(status)) {
2001 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2002 deferredStatus = status;
2003 return;
2004 }
2005
2006 fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2007 fDictionarySet->addAll(*fKatakanaSet);
2008 fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2009
2010 fALetterSet->removeAll(*fDictionarySet);
2011
2012 fOtherSet = new UnicodeSet();
2013 if(U_FAILURE(status)) {
2014 IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2015 deferredStatus = status;
2016 return;
2017 }
2018
2019 fOtherSet->complement();
2020 fOtherSet->removeAll(*fCRSet);
2021 fOtherSet->removeAll(*fLFSet);
2022 fOtherSet->removeAll(*fNewlineSet);
2023 fOtherSet->removeAll(*fKatakanaSet);
2024 fOtherSet->removeAll(*fHebrew_LetterSet);
2025 fOtherSet->removeAll(*fALetterSet);
2026 fOtherSet->removeAll(*fSingle_QuoteSet);
2027 fOtherSet->removeAll(*fDouble_QuoteSet);
2028 fOtherSet->removeAll(*fMidLetterSet);
2029 fOtherSet->removeAll(*fMidNumSet);
2030 fOtherSet->removeAll(*fNumericSet);
2031 fOtherSet->removeAll(*fExtendNumLetSet);
2032 fOtherSet->removeAll(*fWSegSpaceSet);
2033 fOtherSet->removeAll(*fFormatSet);
2034 fOtherSet->removeAll(*fExtendSet);
2035 fOtherSet->removeAll(*fRegionalIndicatorSet);
2036 fOtherSet->removeAll(*fZWJSet);
2037 fOtherSet->removeAll(*fExtendedPictSet);
2038
2039 // Inhibit dictionary characters from being tested at all.
2040 fOtherSet->removeAll(*fDictionarySet);
2041
2042 // Add classes and their names
2043 fSets->addElement(fCRSet, status); classNames.emplace_back("CR");
2044 fSets->addElement(fLFSet, status); classNames.emplace_back("LF");
2045 fSets->addElement(fNewlineSet, status); classNames.emplace_back("Newline");
2046 fSets->addElement(fRegionalIndicatorSet, status); classNames.emplace_back("RegionalIndicator");
2047 fSets->addElement(fHebrew_LetterSet, status); classNames.emplace_back("Hebrew");
2048 fSets->addElement(fALetterSet, status); classNames.emplace_back("ALetter");
2049 fSets->addElement(fSingle_QuoteSet, status); classNames.emplace_back("Single Quote");
2050 fSets->addElement(fDouble_QuoteSet, status); classNames.emplace_back("Double Quote");
2051 // Omit Katakana from fSets, which omits Katakana characters
2052 // from the test data. They are all in the dictionary set,
2053 // which this (old, to be retired) monkey test cannot handle.
2054 //fSets->addElement(fKatakanaSet, status);
2055
2056 fSets->addElement(fMidLetterSet, status); classNames.emplace_back("MidLetter");
2057 fSets->addElement(fMidNumLetSet, status); classNames.emplace_back("MidNumLet");
2058 fSets->addElement(fMidNumSet, status); classNames.emplace_back("MidNum");
2059 fSets->addElement(fNumericSet, status); classNames.emplace_back("Numeric");
2060 fSets->addElement(fFormatSet, status); classNames.emplace_back("Format");
2061 fSets->addElement(fExtendSet, status); classNames.emplace_back("Extend");
2062 fSets->addElement(fOtherSet, status); classNames.emplace_back("Other");
2063 fSets->addElement(fExtendNumLetSet, status); classNames.emplace_back("ExtendNumLet");
2064 fSets->addElement(fWSegSpaceSet, status); classNames.emplace_back("WSegSpace");
2065
2066 fSets->addElement(fZWJSet, status); classNames.emplace_back("ZWJ");
2067 fSets->addElement(fExtendedPictSet, status); classNames.emplace_back("ExtendedPict");
2068
2069 if (U_FAILURE(status)) {
2070 deferredStatus = status;
2071 }
2072 }
2073
setText(const UnicodeString & s)2074 void RBBIWordMonkey::setText(const UnicodeString &s) {
2075 fText = &s;
2076 prepareAppliedRules(s.length());
2077 }
2078
2079
next(int32_t prevPos)2080 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2081 int p0, p1, p2, p3; // Indices of the significant code points around the
2082 // break position being tested. The candidate break
2083 // location is before p2.
2084
2085 int breakPos = -1;
2086
2087 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2088
2089 if (U_FAILURE(deferredStatus)) {
2090 return -1;
2091 }
2092
2093 // Prev break at end of string. return DONE.
2094 if (prevPos >= fText->length()) {
2095 return -1;
2096 }
2097 p0 = p1 = p2 = p3 = prevPos;
2098 c3 = fText->char32At(prevPos);
2099 c0 = c1 = c2 = 0;
2100 (void)p0; // Suppress set but not used warning.
2101
2102 // Loop runs once per "significant" character position in the input text.
2103 for (;;) {
2104 // Move all of the positions forward in the input string.
2105 p0 = p1; c0 = c1;
2106 p1 = p2; c1 = c2;
2107 p2 = p3; c2 = c3;
2108
2109 // Advance p3 by X(Extend | Format)* Rule 4
2110 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2111 do {
2112 p3 = fText->moveIndex32(p3, 1);
2113 c3 = fText->char32At(p3);
2114 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2115 break;
2116 }
2117 }
2118 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2119
2120
2121 if (p1 == p2) {
2122 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2123 continue;
2124 }
2125
2126 if (p2 == fText->length()) {
2127 // Reached end of string. Always a break position.
2128 break;
2129 }
2130
2131 // No Extend or Format characters may appear between the CR and LF,
2132 // which requires the additional check for p2 immediately following p1.
2133 //
2134 if (c1==0x0D && c2==0x0A) {
2135 setAppliedRule(p2, "WB3 CR x LF");
2136 continue;
2137 }
2138
2139 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2140 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2141 break;
2142 }
2143 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2144 setAppliedRule(p2, "WB3a Break before and after newlines (including CR and LF)");
2145 break;
2146 }
2147
2148 // Not ignoring extend chars, so peek into input text to
2149 // get the potential ZWJ, the character immediately preceding c2.
2150 // Sloppy UChar32 indexing: p2-1 may reference trail half
2151 // but char32At will get the full code point.
2152 if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2153 setAppliedRule(p2, "WB3c ZWJ x Extended_Pictographic");
2154 continue;
2155 }
2156
2157 if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2158 setAppliedRule(p2, "WB3d Keep horizontal whitespace together.");
2159 continue;
2160 }
2161
2162 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2163 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2164 setAppliedRule(p2, "WB4 (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2165 continue;
2166 }
2167
2168 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2169 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2170 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2171 setAppliedRule(p2,
2172 "WB6 (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2173 continue;
2174 }
2175
2176 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2177 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2178 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2179 setAppliedRule(p2,
2180 "WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)");
2181 continue;
2182 }
2183
2184 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2185 setAppliedRule(p2, "WB7a Hebrew_Letter x Single_Quote");
2186 continue;
2187 }
2188
2189 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2190 setAppliedRule(p2, "WB7b Hebrew_Letter x Double_Quote Hebrew_Letter");
2191 continue;
2192 }
2193
2194 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2195 setAppliedRule(p2, "WB7c Hebrew_Letter Double_Quote x Hebrew_Letter");
2196 continue;
2197 }
2198
2199 if (fNumericSet->contains(c1) &&
2200 fNumericSet->contains(c2)) {
2201 setAppliedRule(p2, "WB8 Numeric x Numeric");
2202 continue;
2203 }
2204
2205 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2206 fNumericSet->contains(c2)) {
2207 setAppliedRule(p2, "WB9 (ALetter | Hebrew_Letter) x Numeric");
2208 continue;
2209 }
2210
2211 if (fNumericSet->contains(c1) &&
2212 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2213 setAppliedRule(p2, "WB10 Numeric x (ALetter | Hebrew_Letter)");
2214 continue;
2215 }
2216
2217 if (fNumericSet->contains(c0) &&
2218 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2219 fNumericSet->contains(c2)) {
2220 setAppliedRule(p2, "WB11 Numeric (MidNum | MidNumLet | Single_Quote) x Numeric");
2221 continue;
2222 }
2223
2224 if (fNumericSet->contains(c1) &&
2225 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2226 fNumericSet->contains(c3)) {
2227 setAppliedRule(p2, "WB12 Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2228 continue;
2229 }
2230
2231 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2232 // all Katakana are handled by the dictionary breaker.
2233 if (fKatakanaSet->contains(c1) &&
2234 fKatakanaSet->contains(c2)) {
2235 setAppliedRule(p2, "WB13 Katakana x Katakana");
2236 continue;
2237 }
2238
2239 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2240 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2241 fExtendNumLetSet->contains(c2)) {
2242 setAppliedRule(p2,
2243 "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2244 continue;
2245 }
2246
2247 if (fExtendNumLetSet->contains(c1) &&
2248 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2249 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2250 setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2251 continue;
2252 }
2253
2254 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2255 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2256 break;
2257 }
2258 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2259 setAppliedRule(p2, "WB15 - WB17 Group pairs of Regional Indicators.");
2260 continue;
2261 }
2262
2263 setAppliedRule(p2, "WB999");
2264 break;
2265 }
2266
2267 breakPos = p2;
2268 return breakPos;
2269 }
2270
2271
charClasses()2272 UVector *RBBIWordMonkey::charClasses() {
2273 return fSets;
2274 }
2275
~RBBIWordMonkey()2276 RBBIWordMonkey::~RBBIWordMonkey() {
2277 delete fSets;
2278 delete fCRSet;
2279 delete fLFSet;
2280 delete fNewlineSet;
2281 delete fKatakanaSet;
2282 delete fHebrew_LetterSet;
2283 delete fALetterSet;
2284 delete fSingle_QuoteSet;
2285 delete fDouble_QuoteSet;
2286 delete fMidNumLetSet;
2287 delete fMidLetterSet;
2288 delete fMidNumSet;
2289 delete fNumericSet;
2290 delete fFormatSet;
2291 delete fExtendSet;
2292 delete fExtendNumLetSet;
2293 delete fWSegSpaceSet;
2294 delete fRegionalIndicatorSet;
2295 delete fDictionarySet;
2296 delete fOtherSet;
2297 delete fZWJSet;
2298 delete fExtendedPictSet;
2299 }
2300
2301
2302
2303
2304 //------------------------------------------------------------------------------------------
2305 //
2306 // class RBBISentMonkey Sentence Break specific implementation
2307 // of RBBIMonkeyKind.
2308 //
2309 //------------------------------------------------------------------------------------------
// Reference ("monkey") implementation of sentence boundary detection, used to
// cross-check the rule based break iterator on random text.
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // Returns the vector of character classes filled in by the constructor.
    virtual  UVector *charClasses() override;
    // Stores a pointer to s (not a copy) as the text to be scanned.
    virtual  void     setText(const UnicodeString &s) override;
    // Returns the boundary following position i, or -1 when there is none.
    virtual int32_t   next(int32_t i) override;
private:
    // Index of the significant code point preceding posFrom, skipping backwards
    // over Format and Extend characters.  Returns -1 when posFrom <= 0.
    int               moveBack(int posFrom);
    // Index of the significant code point following posFrom, skipping forwards
    // over Format and Extend characters.  Never returns more than the text length.
    int               moveForward(int posFrom);
    // Code point at pos, or -1 when pos is outside the text.
    UChar32           cAt(int pos);

    // The character classes reported by charClasses().
    UVector      *fSets;

    // One set per Sentence_Break class; all are created in the constructor and
    // deleted individually in the destructor.
    UnicodeSet  *fSepSet;        // Sentence_Break = Sep, plus U+000A and U+000D.
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;      // Everything not in any of the other classes.
    UnicodeSet  *fExtendSet;

    // The text being scanned.  Set by setText(); not owned by this object.
    const UnicodeString  *fText;
};
2340
RBBISentMonkey()2341 RBBISentMonkey::RBBISentMonkey()
2342 {
2343 UErrorCode status = U_ZERO_ERROR;
2344
2345 fSets = new UVector(status);
2346
2347 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2348 // set and made into character classes of their own. For the monkey impl,
2349 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2350 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2351 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2352 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2353 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2354 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2355 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2356 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2357 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2358 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2359 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2360 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2361 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2362 fOtherSet = new UnicodeSet();
2363
2364 if(U_FAILURE(status)) {
2365 deferredStatus = status;
2366 return;
2367 }
2368
2369 fOtherSet->complement();
2370 fOtherSet->removeAll(*fSepSet);
2371 fOtherSet->removeAll(*fFormatSet);
2372 fOtherSet->removeAll(*fSpSet);
2373 fOtherSet->removeAll(*fLowerSet);
2374 fOtherSet->removeAll(*fUpperSet);
2375 fOtherSet->removeAll(*fOLetterSet);
2376 fOtherSet->removeAll(*fNumericSet);
2377 fOtherSet->removeAll(*fATermSet);
2378 fOtherSet->removeAll(*fSContinueSet);
2379 fOtherSet->removeAll(*fSTermSet);
2380 fOtherSet->removeAll(*fCloseSet);
2381 fOtherSet->removeAll(*fExtendSet);
2382
2383 fSets->addElement(fSepSet, status); classNames.emplace_back("Sep");
2384 fSets->addElement(fFormatSet, status); classNames.emplace_back("Format");
2385 fSets->addElement(fSpSet, status); classNames.emplace_back("Sp");
2386 fSets->addElement(fLowerSet, status); classNames.emplace_back("Lower");
2387 fSets->addElement(fUpperSet, status); classNames.emplace_back("Upper");
2388 fSets->addElement(fOLetterSet, status); classNames.emplace_back("OLetter");
2389 fSets->addElement(fNumericSet, status); classNames.emplace_back("Numeric");
2390 fSets->addElement(fATermSet, status); classNames.emplace_back("ATerm");
2391 fSets->addElement(fSContinueSet, status); classNames.emplace_back("SContinue");
2392 fSets->addElement(fSTermSet, status); classNames.emplace_back("STerm");
2393 fSets->addElement(fCloseSet, status); classNames.emplace_back("Close");
2394 fSets->addElement(fOtherSet, status); classNames.emplace_back("Other");
2395 fSets->addElement(fExtendSet, status); classNames.emplace_back("Extend");
2396
2397 if (U_FAILURE(status)) {
2398 deferredStatus = status;
2399 }
2400 }
2401
2402
2403
setText(const UnicodeString & s)2404 void RBBISentMonkey::setText(const UnicodeString &s) {
2405 fText = &s;
2406 prepareAppliedRules(s.length());
2407 }
2408
charClasses()2409 UVector *RBBISentMonkey::charClasses() {
2410 return fSets;
2411 }
2412
2413 // moveBack() Find the "significant" code point preceding the index i.
2414 // Skips over ($Extend | $Format)* .
2415 //
moveBack(int i)2416 int RBBISentMonkey::moveBack(int i) {
2417 if (i <= 0) {
2418 return -1;
2419 }
2420 UChar32 c;
2421 int32_t j = i;
2422 do {
2423 j = fText->moveIndex32(j, -1);
2424 c = fText->char32At(j);
2425 }
2426 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2427 return j;
2428
2429 }
2430
2431
moveForward(int i)2432 int RBBISentMonkey::moveForward(int i) {
2433 if (i>=fText->length()) {
2434 return fText->length();
2435 }
2436 UChar32 c;
2437 int32_t j = i;
2438 do {
2439 j = fText->moveIndex32(j, 1);
2440 c = cAt(j);
2441 }
2442 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2443 return j;
2444 }
2445
cAt(int pos)2446 UChar32 RBBISentMonkey::cAt(int pos) {
2447 if (pos<0 || pos>=fText->length()) {
2448 return -1;
2449 } else {
2450 return fText->char32At(pos);
2451 }
2452 }
2453
next(int32_t prevPos)2454 int32_t RBBISentMonkey::next(int32_t prevPos) {
2455 int p0, p1, p2, p3; // Indices of the significant code points around the
2456 // break position being tested. The candidate break
2457 // location is before p2.
2458
2459 int breakPos = -1;
2460
2461 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2462 UChar32 c;
2463
2464 if (U_FAILURE(deferredStatus)) {
2465 return -1;
2466 }
2467
2468 // Prev break at end of string. return DONE.
2469 if (prevPos >= fText->length()) {
2470 return -1;
2471 }
2472 p0 = p1 = p2 = p3 = prevPos;
2473 c3 = fText->char32At(prevPos);
2474 c0 = c1 = c2 = 0;
2475 (void)p0; // Suppress set but not used warning.
2476
2477 // Loop runs once per "significant" character position in the input text.
2478 for (;;) {
2479 // Move all of the positions forward in the input string.
2480 p0 = p1; c0 = c1;
2481 p1 = p2; c1 = c2;
2482 p2 = p3; c2 = c3;
2483
2484 // Advance p3 by X(Extend | Format)* Rule 4
2485 p3 = moveForward(p3);
2486 c3 = cAt(p3);
2487
2488 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2489 setAppliedRule(p2, "SB3 CR x LF");
2490 continue;
2491 }
2492
2493 if (fSepSet->contains(c1)) {
2494 p2 = p1+1; // Separators don't combine with Extend or Format.
2495
2496 setAppliedRule(p2, "SB4 Sep <break>");
2497 break;
2498 }
2499
2500 if (p2 >= fText->length()) {
2501 // Reached end of string. Always a break position.
2502 setAppliedRule(p2, "SB4 Sep <break>");
2503 break;
2504 }
2505
2506 if (p2 == prevPos) {
2507 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2508 setAppliedRule(p2, "SB4 Sep <break>");
2509 continue;
2510 }
2511
2512 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2513 setAppliedRule(p2, "SB6 ATerm x Numeric");
2514 continue;
2515 }
2516
2517 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2518 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2519 setAppliedRule(p2, "SB7 (Upper | Lower) ATerm x Uppper");
2520 continue;
2521 }
2522
2523 // Note: STerm | ATerm are added to the negated part of the expression by a
2524 // note to the Unicode 5.0 documents.
2525 int p8 = p1;
2526 while (fSpSet->contains(cAt(p8))) {
2527 p8 = moveBack(p8);
2528 }
2529 while (fCloseSet->contains(cAt(p8))) {
2530 p8 = moveBack(p8);
2531 }
2532 if (fATermSet->contains(cAt(p8))) {
2533 p8=p2;
2534 for (;;) {
2535 c = cAt(p8);
2536 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2537 fLowerSet->contains(c) || fSepSet->contains(c) ||
2538 fATermSet->contains(c) || fSTermSet->contains(c)) {
2539
2540 setAppliedRule(p2,
2541 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2542 break;
2543 }
2544 p8 = moveForward(p8);
2545 }
2546 if (fLowerSet->contains(cAt(p8))) {
2547
2548 setAppliedRule(p2,
2549 "SB8 ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2550 continue;
2551 }
2552 }
2553
2554 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2555 p8 = p1;
2556 while (fSpSet->contains(cAt(p8))) {
2557 p8 = moveBack(p8);
2558 }
2559 while (fCloseSet->contains(cAt(p8))) {
2560 p8 = moveBack(p8);
2561 }
2562 c = cAt(p8);
2563 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2564 setAppliedRule(p2, "SB8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2565 continue;
2566 }
2567 }
2568
2569 int p9 = p1;
2570 while (fCloseSet->contains(cAt(p9))) {
2571 p9 = moveBack(p9);
2572 }
2573 c = cAt(p9);
2574 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2575 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2576
2577 setAppliedRule(p2, "SB9 (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)");
2578 continue;
2579 }
2580 }
2581
2582 int p10 = p1;
2583 while (fSpSet->contains(cAt(p10))) {
2584 p10 = moveBack(p10);
2585 }
2586 while (fCloseSet->contains(cAt(p10))) {
2587 p10 = moveBack(p10);
2588 }
2589 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2590 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2591 setAppliedRule(p2, "SB10 (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)");
2592 continue;
2593 }
2594 }
2595
2596 int p11 = p1;
2597 if (fSepSet->contains(cAt(p11))) {
2598 p11 = moveBack(p11);
2599 }
2600 while (fSpSet->contains(cAt(p11))) {
2601 p11 = moveBack(p11);
2602 }
2603 while (fCloseSet->contains(cAt(p11))) {
2604 p11 = moveBack(p11);
2605 }
2606 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2607 setAppliedRule(p2, "SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>");
2608 break;
2609 }
2610
2611 setAppliedRule(p2, "SB12 Any x Any");
2612 }
2613
2614 breakPos = p2;
2615 return breakPos;
2616 }
2617
~RBBISentMonkey()2618 RBBISentMonkey::~RBBISentMonkey() {
2619 delete fSets;
2620 delete fSepSet;
2621 delete fFormatSet;
2622 delete fSpSet;
2623 delete fLowerSet;
2624 delete fUpperSet;
2625 delete fOLetterSet;
2626 delete fNumericSet;
2627 delete fATermSet;
2628 delete fSContinueSet;
2629 delete fSTermSet;
2630 delete fCloseSet;
2631 delete fOtherSet;
2632 delete fExtendSet;
2633 }
2634
2635
2636
2637 //-------------------------------------------------------------------------------------------
2638 //
2639 // RBBILineMonkey
2640 //
2641 //-------------------------------------------------------------------------------------------
2642
// Reference ("monkey") implementation for line break testing.  Implements the
// RBBIMonkeyKind interface (charClasses / setText / next).
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    virtual  UVector *charClasses() override;
    virtual  void     setText(const UnicodeString &s) override;
    virtual  int32_t  next(int32_t i) override;
    // NOTE(review): the definition is outside this view; by its name and signature it
    //               presumably adjusts a position / character pair for line break
    //               rule 9 -- confirm against the implementation.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    // The character classes reported by charClasses().
    UVector      *fSets;

    // Character class sets.  Most are named after a two-letter Line_Break property
    // value and are built from it in the constructor (for example fBK from
    // \p{Line_Break=BK}).  fHH is created there as an empty set.
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHH;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;
    UnicodeSet  *fCJ;
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fXX;
    UnicodeSet  *fEB;
    UnicodeSet  *fEM;
    UnicodeSet  *fZWJ;
    UnicodeSet  *fOP30;
    UnicodeSet  *fCP30;
    UnicodeSet  *fExtPictUnassigned;
    UnicodeSet  *fAK;
    UnicodeSet  *fAP;
    UnicodeSet  *fAS;
    UnicodeSet  *fVF;
    UnicodeSet  *fVI;
    UnicodeSet  *fPi;
    UnicodeSet  *fPf;

    // The members below are set to nullptr by the constructor's initializer list,
    // which names fSets, fCharBI, fText and fNumberMatcher in this declaration order.
    BreakIterator        *fCharBI;
    const UnicodeString  *fText;
    RegexMatcher         *fNumberMatcher;
};
2712
RBBILineMonkey()2713 RBBILineMonkey::RBBILineMonkey() :
2714 RBBIMonkeyKind(),
2715 fSets(nullptr),
2716
2717 fCharBI(nullptr),
2718 fText(nullptr),
2719 fNumberMatcher(nullptr)
2720
2721 {
2722 if (U_FAILURE(deferredStatus)) {
2723 return;
2724 }
2725
2726 UErrorCode status = U_ZERO_ERROR;
2727
2728 fSets = new UVector(status);
2729
2730 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2731 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2732 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2733 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2734 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2735 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2736 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2737 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2738 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2739 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2740 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2741 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2742 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2743 fHH = new UnicodeSet();
2744 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2745 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2746 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2747 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2748 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2749 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2750 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2751 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2752 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2753 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2754 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2755 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2756 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2757 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2758 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2759 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2760 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2761 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2762 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2763 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2764 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2765 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2766 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2767 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2768 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2769 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2770 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2771 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2772 fZWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2773 fOP30 = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2774 fCP30 = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2775 fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2776
2777 fAK = new UnicodeSet(uR"([\p{Line_Break=AK}])", status);
2778 fAP = new UnicodeSet(uR"([\p{Line_Break=AP}])", status);
2779 fAS = new UnicodeSet(uR"([\p{Line_Break=AS}])", status);
2780 fVF = new UnicodeSet(uR"([\p{Line_Break=VF}])", status);
2781 fVI = new UnicodeSet(uR"([\p{Line_Break=VI}])", status);
2782
2783 fPi = new UnicodeSet(uR"([\p{Pi}])", status);
2784 fPf = new UnicodeSet(uR"([\p{Pf}])", status);
2785
2786 if (U_FAILURE(status)) {
2787 deferredStatus = status;
2788 return;
2789 }
2790
2791 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
2792 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
2793 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
2794
2795 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
2796 fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
2797
2798 fHH->add(u'\u2010'); // Hyphen, '‐'
2799
2800 // Sets and names.
2801 fSets->addElement(fBK, status); classNames.emplace_back("fBK");
2802 fSets->addElement(fCR, status); classNames.emplace_back("fCR");
2803 fSets->addElement(fLF, status); classNames.emplace_back("fLF");
2804 fSets->addElement(fCM, status); classNames.emplace_back("fCM");
2805 fSets->addElement(fNL, status); classNames.emplace_back("fNL");
2806 fSets->addElement(fWJ, status); classNames.emplace_back("fWJ");
2807 fSets->addElement(fZW, status); classNames.emplace_back("fZW");
2808 fSets->addElement(fGL, status); classNames.emplace_back("fGL");
2809 fSets->addElement(fCB, status); classNames.emplace_back("fCB");
2810 fSets->addElement(fSP, status); classNames.emplace_back("fSP");
2811 fSets->addElement(fB2, status); classNames.emplace_back("fB2");
2812 fSets->addElement(fBA, status); classNames.emplace_back("fBA");
2813 fSets->addElement(fBB, status); classNames.emplace_back("fBB");
2814 fSets->addElement(fHY, status); classNames.emplace_back("fHY");
2815 fSets->addElement(fH2, status); classNames.emplace_back("fH2");
2816 fSets->addElement(fH3, status); classNames.emplace_back("fH3");
2817 fSets->addElement(fCL, status); classNames.emplace_back("fCL");
2818 fSets->addElement(fCP, status); classNames.emplace_back("fCP");
2819 fSets->addElement(fEX, status); classNames.emplace_back("fEX");
2820 fSets->addElement(fIN, status); classNames.emplace_back("fIN");
2821 fSets->addElement(fJL, status); classNames.emplace_back("fJL");
2822 fSets->addElement(fJT, status); classNames.emplace_back("fJT");
2823 fSets->addElement(fJV, status); classNames.emplace_back("fJV");
2824 fSets->addElement(fNS, status); classNames.emplace_back("fNS");
2825 fSets->addElement(fOP, status); classNames.emplace_back("fOP");
2826 fSets->addElement(fQU, status); classNames.emplace_back("fQU");
2827 fSets->addElement(fIS, status); classNames.emplace_back("fIS");
2828 fSets->addElement(fNU, status); classNames.emplace_back("fNU");
2829 fSets->addElement(fPO, status); classNames.emplace_back("fPO");
2830 fSets->addElement(fPR, status); classNames.emplace_back("fPR");
2831 fSets->addElement(fSY, status); classNames.emplace_back("fSY");
2832 fSets->addElement(fAI, status); classNames.emplace_back("fAI");
2833 fSets->addElement(fAL, status); classNames.emplace_back("fAL");
2834 fSets->addElement(fHL, status); classNames.emplace_back("fHL");
2835 fSets->addElement(fID, status); classNames.emplace_back("fID");
2836 fSets->addElement(fRI, status); classNames.emplace_back("fRI");
2837 fSets->addElement(fSG, status); classNames.emplace_back("fSG");
2838 fSets->addElement(fEB, status); classNames.emplace_back("fEB");
2839 fSets->addElement(fEM, status); classNames.emplace_back("fEM");
2840 fSets->addElement(fZWJ, status); classNames.emplace_back("fZWJ");
2841 // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2842 fSets->addElement(fOP30, status); classNames.emplace_back("fOP30");
2843 fSets->addElement(fCP30, status); classNames.emplace_back("fCP30");
2844 fSets->addElement(fExtPictUnassigned, status); classNames.emplace_back("fExtPictUnassigned");
2845 fSets->addElement(fAK, status); classNames.emplace_back("fAK");
2846 fSets->addElement(fAP, status); classNames.emplace_back("fAP");
2847 fSets->addElement(fAS, status); classNames.emplace_back("fAS");
2848 fSets->addElement(fVF, status); classNames.emplace_back("fVF");
2849 fSets->addElement(fVI, status); classNames.emplace_back("fVI");
2850
2851
2852 UnicodeString CMx {uR"([[\p{Line_Break=CM}]\u200d])"};
2853 UnicodeString rules;
2854 rules = rules + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?"
2855 + u"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(" + CMx + u")*)?"
2856 + u"((\\p{Line_Break=IS})(" + CMx + u")*)?"
2857 + u"\\p{Line_Break=NU}(" + CMx + u")*"
2858 + u"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(" + CMx + u")*)*"
2859 + u"((\\p{Line_Break=CL}|\\p{Line_Break=CP})(" + CMx + u")*)?"
2860 + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?";
2861
2862 fNumberMatcher = new RegexMatcher(rules, 0, status);
2863
2864 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2865
2866 if (U_FAILURE(status)) {
2867 deferredStatus = status;
2868 }
2869
2870 }
2871
2872
setText(const UnicodeString & s)2873 void RBBILineMonkey::setText(const UnicodeString &s) {
2874 fText = &s;
2875 fCharBI->setText(s);
2876 prepareAppliedRules(s.length());
2877 fNumberMatcher->reset(s);
2878 }
2879
2880 //
2881 // rule9Adjust
2882 // Line Break TR rules 9 and 10 implementation.
//    This deals with combining marks and other sequences
//    that must be treated as if they were something other than what they actually are.
2885 //
2886 // This is factored out into a separate function because it must be applied twice for
2887 // each potential break, once to the chars before the position being checked, then
2888 // again to the text following the possible break.
2889 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2890 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2891 if (pos == -1) {
2892 // Invalid initial position. Happens during the warmup iteration of the
2893 // main loop in next().
2894 return;
2895 }
2896
2897 int32_t nPos = *nextPos;
2898
2899 // LB 9 Keep combining sequences together.
2900 // advance over any CM class chars. Note that Line Break CM is different
2901 // from the normal Grapheme Extend property.
2902 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2903 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2904 for (;;) {
2905 *nextChar = fText->char32At(nPos);
2906 if (!fCM->contains(*nextChar)) {
2907 break;
2908 }
2909 nPos = fText->moveIndex32(nPos, 1);
2910 }
2911 }
2912
2913
2914 // LB 9 Treat X CM* as if it were x.
2915 // No explicit action required.
2916
2917 // LB 10 Treat any remaining combining mark as AL
2918 if (fCM->contains(*posChar)) {
2919 *posChar = u'A';
2920 }
2921
2922 // Push the updated nextPos and nextChar back to our caller.
2923 // This only makes a difference if posChar got bigger by consuming a
2924 // combining sequence.
2925 *nextPos = nPos;
2926 *nextChar = fText->char32At(nPos);
2927 }
2928
2929
2930
// Reference implementation of "find the next line break", used to check the
// rule-based break iterator.
//
// Starting at startPos, walks the text set by setText() one position at a time
// and tests the line break rules in order. For every position examined it calls
// setAppliedRule() with the text of the rule that decided that position.
//
// Returns -1 if deferredStatus is a failure or startPos is at or beyond the end
// of the text; otherwise returns the position at which the loop stopped.
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    // Inside the loop, "break" leaves the loop so that pos is returned as the
    // boundary; "continue" means no boundary at pos, move on to the next position.
    for (;;) {
        // Shift the sliding window: prev -> prevX2, this -> prev, next -> this.
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;


        if (pos >= fText->length()) {
            setAppliedRule(pos, "LB2 - Break at end of text.");
            break;
        }


        // We do this one out-of-order because the adjustment does not change anything
        // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        // be applied.
        rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos, &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        // -1 positions out of prevPos yet - loop back to advance the
        // position in the input without any further looking for breaks.
        if (prevPos == -1) {
            setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
            continue;
        }


        if (fBK->contains(prevChar)) {
            setAppliedRule(pos, "LB 4 Always break after hard line breaks");
            break;
        }


        if (prevChar == 0x0d && thisChar == 0x0a) {
            setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85) {
            setAppliedRule(pos, "LB 5 Break after CR, LF, NL, but not inside CR LF");
            break;
        }


        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
            setAppliedRule(pos, "LB 6 Don't break before hard line breaks");
            continue;
        }


        if (fSP->contains(thisChar)) {
            setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
            continue;
        }

        // !!! ??? Is this the right text for the applied rule?
        if (fZW->contains(thisChar)) {
            setAppliedRule(pos, "LB 7 Don't break before spaces or zero-width space.");
            continue;
        }


        //       ZW SP* ÷
        //       Scan backwards from prevChar for SP* ZW
        tPos = prevPos;
        while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
            tPos = fText->moveIndex32(tPos, -1);
        }
        if (fZW->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 8 Break after zero width space");
            break;
        }


        //    Move this test up, before LB8a, because numbers can match a longer sequence that would
        //    also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            if (U_FAILURE(status)) {
                setAppliedRule(pos, "LB 25 Numbers");
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up from the end of the number to its last non-CM character.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                setAppliedRule(pos, "LB 25 Numbers");
                continue;
            }
        }


        //       The monkey test's way of ignoring combining characters doesn't work
        //       for this rule. ZJ is also a CM. Need to get the actual character
        //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
        {
            int32_t prevIdx = fText->moveIndex32(pos, -1);
            UChar32 prevC = fText->char32At(prevIdx);
            if (fZWJ->contains(prevC)) {
                setAppliedRule(pos, "LB 8a ZWJ x");
                continue;
            }
        }


        // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
        //


        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            setAppliedRule(pos, "LB 11 Do not break before or after WORD JOINER and related characters.");
            continue;
        }


        if (fGL->contains(prevChar)) {
            setAppliedRule(pos, "LB 12 GL x");
            continue;
        }


        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            setAppliedRule(pos, "LB 12a [^SP BA HY] x GL");
            continue;
        }


        if (fCL->contains(thisChar) ||
                fCP->contains(thisChar) ||
                fEX->contains(thisChar) ||
                fSY->contains(thisChar)) {
            setAppliedRule(pos, "LB 13 Don't break before closings.");
            continue;
        }


        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 9 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            setAppliedRule(pos, "LB 14 Don't break after OP SP*");
            continue;
        }

        // Same as LB 14, scan backward for
        // (sot | BK | CR | LF | NL | OP CM*| QU CM* | GL CM* | SP) [\p{Pi}&QU] CM* SP*.
        tPos = prevPos;
        // SP* (with the aforementioned Twist).
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
        }
        // CM*.
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos = fText->moveIndex32(tPos, -1);
        }
        // [\p{Pi}&QU].
        if (fPi->contains(fText->char32At(tPos)) && fQU->contains(fText->char32At(tPos))) {
            if (tPos == 0) {
                setAppliedRule(pos, "LB 15a sot [\\p{Pi}&QU] SP* x");
                continue;
            } else {
                // Look at the character immediately before the initial quote.
                tPos = fText->moveIndex32(tPos, -1);
                if (fBK->contains(fText->char32At(tPos)) || fCR->contains(fText->char32At(tPos)) ||
                    fLF->contains(fText->char32At(tPos)) || fNL->contains(fText->char32At(tPos)) ||
                    fSP->contains(fText->char32At(tPos)) || fZW->contains(fText->char32At(tPos))) {
                    setAppliedRule(pos, "LB 15a (BK | CR | LF | NL | SP | ZW) [\\p{Pi}&QU] SP* x");
                    continue;
                }
            }
            // CM*.
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fOP->contains(fText->char32At(tPos)) || fQU->contains(fText->char32At(tPos)) ||
                fGL->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 15a (OP | QU | GL) [\\p{Pi}&QU] SP* x");
                continue;
            }
        }

        if (fPf->contains(thisChar) && fQU->contains(thisChar)) {
            UChar32 nextChar = fText->char32At(nextPos);
            if (nextPos == fText->length() || fSP->contains(nextChar) || fGL->contains(nextChar) ||
                fWJ->contains(nextChar) || fCL->contains(nextChar) || fQU->contains(nextChar) ||
                fCP->contains(nextChar) || fEX->contains(nextChar) || fIS->contains(nextChar) ||
                fSY->contains(nextChar) || fBK->contains(nextChar) || fCR->contains(nextChar) ||
                fLF->contains(nextChar) || fNL->contains(nextChar) || fZW->contains(nextChar)) {
                setAppliedRule(pos, "LB 15b x [\\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS | SY "
                                    "| BK | CR | LF | NL | ZW | eot)");
                continue;
            }
        }

        if (nextPos < fText->length()) {
            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
            //       from a legit ffff noncharacter. So test length separately.
            UChar32 nextChar = fText->char32At(nextPos);
            if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
                setAppliedRule(pos,
                               "LB 15c Break before an IS that begins a number and follows a space");
                break;
            }
        }

        if (fIS->contains(thisChar)) {
            setAppliedRule(pos, "LB 15d Do not break before numeric separators, even after spaces.");
            continue;
        }

        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // NOTE(review): this declaration shadows the function-scope tPos declared above.
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 16 (CL | CP) SP* x NS");
                continue;
            }
        }


        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                setAppliedRule(pos, "LB 17 B2 SP* x B2");
                continue;
            }
        }


        if (fSP->contains(prevChar)) {
            setAppliedRule(pos, "LB 18 break after space");
            break;
        }

        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            setAppliedRule(pos, "LB 19");
            continue;
        }

        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            setAppliedRule(pos, "LB 20 Break around a CB");
            break;
        }

        // Don't break between Hyphens and letters if a break precedes the hyphen.
        // Formerly this was a Finnish tailoring.
        // Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
        //    ^($HY | $HH) $AL;
        // prevPosX2 is still -1 only while prevChar is the first character
        // examined by this call, i.e. the hyphen sits at startPos.
        if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
                prevPosX2 == -1) {
            setAppliedRule(pos, "LB 20.09");
            continue;
        }

        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            setAppliedRule(pos, "LB 21");
            continue;
        }

        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            setAppliedRule(pos, "LB 21a HL (HY | BA) x");
            continue;
        }

        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            setAppliedRule(pos, "LB 21b SY x HL");
            continue;
        }

        if (fIN->contains(thisChar)) {
            setAppliedRule(pos, "LB 22");
            continue;
        }


        //          (AL | HL) x NU
        //          NU x (AL | HL)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
            setAppliedRule(pos, "LB 23");
            continue;
        }
        if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 23");
            continue;
        }

        // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
        //      PR x (ID | EB | EM)
        //     (ID | EB | EM) x PO
        if (fPR->contains(prevChar) &&
                (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
            setAppliedRule(pos, "LB 23a");
            continue;
        }
        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
                fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 23a");
            continue;
        }

        //   Do not break between prefix and letters or ideographs.
        //         (PR | PO) x (AL | HL)
        //         (AL | HL) x (PR | PO)
        if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
                (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
                (fPR->contains(thisChar) || fPO->contains(thisChar))) {
            setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
            continue;
        }

        // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,

        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJT->contains(prevChar) || fH3->contains(prevChar))  &&
            fJT->contains(thisChar)) {
            setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
            continue;
        }

        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
            setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
            continue;
        }


        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 28 Do not break between alphabetics (\"at\").");
            continue;
        }

        if (fAP->contains(prevChar) &&
            (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar))) {
            setAppliedRule(pos, "LB 28a.1 AP x (AK | ◌ | AS)");
            continue;
        }

        if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) &&
            (fVF->contains(thisChar) || fVI->contains(thisChar))) {
            setAppliedRule(pos, "LB 28a.2 (AK | ◌ | AS) x (VF | VI)");
            continue;
        }

        if ((fAK->contains(prevCharX2) || prevCharX2 == U'◌' || fAS->contains(prevCharX2)) &&
            fVI->contains(prevChar) &&
            (fAK->contains(thisChar) || thisChar == U'◌')) {
            setAppliedRule(pos, "LB 28a.3 (AK | ◌ | AS) VI x (AK | ◌)");
            continue;
        }

        if (nextPos < fText->length()) {
            // note: UnicodeString::char32At(length) returns ffff, not distinguishable
            //       from a legit ffff noncharacter. So test length separately.
            UChar32 nextChar = fText->char32At(nextPos);
            if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) &&
                (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar)) &&
                fVF->contains(nextChar)) {
                setAppliedRule(pos, "LB 28a.4 (AK | ◌ | AS) x (AK | ◌ | AS) VF");
                continue;
            }
        }

        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            setAppliedRule(pos, "LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").");
            continue;
        }

        //          (AL | NU) x OP
        //          CP x (AL | NU)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }
        if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            setAppliedRule(pos, "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
            continue;
        }

        //             RI x RI
        if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
            setAppliedRule(pos, "LB30a RI RI : RI");
            break;
        }
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            // Two Regional Indicators have been paired.
            // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
            // following RI. This is a hack.
            thisChar = -1;
            setAppliedRule(pos, "LB30a RI RI : RI");
            continue;
        }

        // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
        if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b Emoji Base x Emoji Modifier");
            continue;
        }

        if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
            setAppliedRule(pos, "LB30b [\\p{Extended_Pictographic}&\\p{Cn}] x EM");
            continue;
        }

        // No rule above prohibited a break here.
        setAppliedRule(pos, "LB 31 Break everywhere else");
        break;
    }

    return pos;
}
3464
3465
// Returns fSets, the vector of character-class UnicodeSets filled in by the
// constructor (nullptr if the constructor returned before allocating it).
// The vector and the sets are deleted by this object's destructor, so callers
// must not delete them.
UVector *RBBILineMonkey::charClasses() {
    return fSets;
}
3469
3470
~RBBILineMonkey()3471 RBBILineMonkey::~RBBILineMonkey() {
3472 delete fSets;
3473
3474 delete fBK;
3475 delete fCR;
3476 delete fLF;
3477 delete fCM;
3478 delete fNL;
3479 delete fWJ;
3480 delete fZW;
3481 delete fGL;
3482 delete fCB;
3483 delete fSP;
3484 delete fB2;
3485 delete fBA;
3486 delete fBB;
3487 delete fHH;
3488 delete fHY;
3489 delete fH2;
3490 delete fH3;
3491 delete fCL;
3492 delete fCP;
3493 delete fEX;
3494 delete fIN;
3495 delete fJL;
3496 delete fJV;
3497 delete fJT;
3498 delete fNS;
3499 delete fOP;
3500 delete fQU;
3501 delete fIS;
3502 delete fNU;
3503 delete fPO;
3504 delete fPR;
3505 delete fSY;
3506 delete fAI;
3507 delete fAL;
3508 delete fCJ;
3509 delete fHL;
3510 delete fID;
3511 delete fRI;
3512 delete fSG;
3513 delete fXX;
3514 delete fEB;
3515 delete fEM;
3516 delete fZWJ;
3517 delete fOP30;
3518 delete fCP30;
3519 delete fExtPictUnassigned;
3520 delete fAK;
3521 delete fAP;
3522 delete fAS;
3523 delete fVF;
3524 delete fVI;
3525 delete fPi;
3526 delete fPf;
3527
3528 delete fCharBI;
3529 delete fNumberMatcher;
3530 }
3531
3532
3533 //-------------------------------------------------------------------------------------------
3534 //
3535 // TestMonkey
3536 //
3537 // params
3538 // seed=nnnnn Random number starting seed.
3539 // Setting the seed allows errors to be reproduced.
3540 // loop=nnn Looping count. Controls running time.
3541 // -1: run forever.
3542 // 0 or greater: run length.
3543 //
3544 // type = char | word | line | sent | title
3545 //
3546 // export = (path) Export test cases to (path)_(type).txt in the UCD
3547 // test case format.
3548 //
3549 // Example:
3550 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3551 //
3552 //-------------------------------------------------------------------------------------------
3553
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3554 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3555 int32_t val = defaultVal;
3556 name.append(" *= *(-?\\d+)");
3557 UErrorCode status = U_ZERO_ERROR;
3558 RegexMatcher m(name, params, 0, status);
3559 if (m.find()) {
3560 // The param exists. Convert the string to an int.
3561 char valString[100];
3562 int32_t paramLength = m.end(1, status) - m.start(1, status);
3563 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3564 paramLength = (int32_t)(sizeof(valString)-2);
3565 }
3566 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3567 val = strtol(valString, nullptr, 10);
3568
3569 // Delete this parameter from the params string.
3570 m.reset();
3571 params = m.replaceFirst("", status);
3572 }
3573 U_ASSERT(U_SUCCESS(status));
3574 return val;
3575 }
3576 #endif
3577
3578 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3579 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3580 BreakIterator *bi,
3581 int expected[],
3582 int expectedcount)
3583 {
3584 int count = 0;
3585 int i = 0;
3586 int forward[50];
3587 bi->setText(ustr);
3588 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3589 forward[count] = i;
3590 if (count < expectedcount && expected[count] != i) {
3591 test->errln("%s:%d break forward test failed: expected %d but got %d",
3592 __FILE__, __LINE__, expected[count], i);
3593 break;
3594 }
3595 count ++;
3596 }
3597 if (count != expectedcount) {
3598 printStringBreaks(ustr, expected, expectedcount);
3599 test->errln("%s:%d break forward test failed: missed %d match",
3600 __FILE__, __LINE__, expectedcount - count);
3601 return;
3602 }
3603 // testing boundaries
3604 for (i = 1; i < expectedcount; i ++) {
3605 int j = expected[i - 1];
3606 if (!bi->isBoundary(j)) {
3607 printStringBreaks(ustr, expected, expectedcount);
3608 test->errln("%s:%d isBoundary() failed. Expected boundary at position %d",
3609 __FILE__, __LINE__, j);
3610 return;
3611 }
3612 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3613 if (bi->isBoundary(j)) {
3614 printStringBreaks(ustr, expected, expectedcount);
3615 test->errln("%s:%d isBoundary() failed. Not expecting boundary at position %d",
3616 __FILE__, __LINE__, j);
3617 return;
3618 }
3619 }
3620 }
3621
3622 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3623 count --;
3624 if (forward[count] != i) {
3625 printStringBreaks(ustr, expected, expectedcount);
3626 test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3627 __FILE__, __LINE__, forward[count], i);
3628 break;
3629 }
3630 }
3631 if (count != 0) {
3632 printStringBreaks(ustr, expected, expectedcount);
3633 test->errln("break test previous() failed: missed a match");
3634 return;
3635 }
3636
3637 // testing preceding
3638 for (i = 0; i < expectedcount - 1; i ++) {
3639 // int j = expected[i] + 1;
3640 int j = ustr.moveIndex32(expected[i], 1);
3641 for (; j <= expected[i + 1]; j ++) {
3642 int32_t expectedPreceding = expected[i];
3643 int32_t actualPreceding = bi->preceding(j);
3644 if (actualPreceding != expectedPreceding) {
3645 printStringBreaks(ustr, expected, expectedcount);
3646 test->errln("%s:%d preceding(%d): expected %d, got %d",
3647 __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3648 return;
3649 }
3650 }
3651 }
3652 }
3653 #endif
3654
TestWordBreaks()3655 void RBBITest::TestWordBreaks()
3656 {
3657 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3658
3659 Locale locale("en");
3660 UErrorCode status = U_ZERO_ERROR;
3661 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3662 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3663 // Replaced any C+J characters in a row with a random sequence of characters
3664 // of the same length to make our C+J segmentation not get in the way.
3665 static const char *strlist[] =
3666 {
3667 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3668 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3669 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3670 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3671 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3672 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3673 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3674 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3675 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3676 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3677 "\\u2027\\U000e0067\\u0a47\\u00b7",
3678 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3679 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3680 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3681 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3682 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3683 "\\u0027\\u11af\\U000e0057\\u0602",
3684 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3685 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3686 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3687 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3688 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3689 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3690 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3691 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3692 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3693 "\\u18f4\\U000e0049\\u20e7\\u2027",
3694 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3695 "\\ua183\\u102d\\u0bec\\u003a",
3696 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3697 "\\u003a\\u0e57\\u0fad\\u002e",
3698 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3699 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3700 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3701 "\\u003a\\u0664\\u00b7\\u1fba",
3702 "\\u003b\\u0027\\u00b7\\u47a3",
3703 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3704 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3705 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3706 };
3707 int loop;
3708 if (U_FAILURE(status)) {
3709 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3710 return;
3711 }
3712 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3713 // printf("looping %d\n", loop);
3714 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3715 // RBBICharMonkey monkey;
3716 RBBIWordMonkey monkey;
3717
3718 int expected[50];
3719 int expectedcount = 0;
3720
3721 monkey.setText(ustr);
3722 int i;
3723 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3724 expected[expectedcount ++] = i;
3725 }
3726
3727 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3728 }
3729 delete bi;
3730 #endif
3731 }
3732
TestWordBoundary()3733 void RBBITest::TestWordBoundary()
3734 {
3735 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3736 Locale locale("en");
3737 UErrorCode status = U_ZERO_ERROR;
3738 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3739 LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3740 if (U_FAILURE(status)) {
3741 errcheckln(status, "%s:%d Creation of break iterator failed %s",
3742 __FILE__, __LINE__, u_errorName(status));
3743 return;
3744 }
3745 char16_t str[50];
3746 static const char *strlist[] =
3747 {
3748 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3749 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3750 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3751 "\\u2027\\U000e0067\\u0a47\\u00b7",
3752 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3753 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3754 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3755 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3756 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3757 "\\u0027\\u11af\\U000e0057\\u0602",
3758 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3759 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3760 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3761 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3762 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3763 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3764 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3765 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3766 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3767 "\\u58f4\\U000e0049\\u20e7\\u2027",
3768 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3769 "\\ua183\\u102d\\u0bec\\u003a",
3770 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3771 "\\u003a\\u0e57\\u0fad\\u002e",
3772 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3773 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3774 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3775 "\\u003a\\u0664\\u00b7\\u1fba",
3776 "\\u003b\\u0027\\u00b7\\u47a3",
3777 };
3778 int loop;
3779 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3780 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3781 UnicodeString ustr(str);
3782 int forward[50];
3783 int count = 0;
3784
3785 bi->setText(ustr);
3786 int prev = -1;
3787 for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3788 ++count;
3789 if (count >= UPRV_LENGTHOF(forward)) {
3790 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3791 __FILE__, __LINE__, loop, count, boundary);
3792 return;
3793 }
3794 forward[count] = boundary;
3795 if (boundary <= prev) {
3796 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3797 __FILE__, __LINE__, loop, prev, boundary);
3798 break;
3799 }
3800 for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3801 if (bi->isBoundary(nonBoundary)) {
3802 printStringBreaks(ustr, forward, count);
3803 errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3804 __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3805 return;
3806 }
3807 }
3808 if (!bi->isBoundary(boundary)) {
3809 printStringBreaks(ustr, forward, count);
3810 errln("%s:%d happy boundary test failed: expected %d a boundary",
3811 __FILE__, __LINE__, boundary);
3812 return;
3813 }
3814 prev = boundary;
3815 }
3816 }
3817 }
3818
TestLineBreaks()3819 void RBBITest::TestLineBreaks()
3820 {
3821 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3822 Locale locale("en");
3823 UErrorCode status = U_ZERO_ERROR;
3824 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3825 const int32_t STRSIZE = 50;
3826 char16_t str[STRSIZE];
3827 static const char *strlist[] =
3828 {
3829 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3830 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3831 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3832 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3833 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3834 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3835 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3836 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3837 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3838 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3839 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3840 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3841 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3842 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3843 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3844 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3845 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3846 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3847 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3848 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3849 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3850 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3851 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3852 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3853 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3854 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3855 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3856 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3857 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3858 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3859 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3860 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3861 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3862 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3863 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3864 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3865 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3866 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3867 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3868 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3869 };
3870 int loop;
3871 TEST_ASSERT_SUCCESS(status);
3872 if (U_FAILURE(status)) {
3873 return;
3874 }
3875 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3876 // printf("looping %d\n", loop);
3877 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3878 if (t >= STRSIZE) {
3879 TEST_ASSERT(false);
3880 continue;
3881 }
3882
3883
3884 UnicodeString ustr(str);
3885 RBBILineMonkey monkey;
3886 if (U_FAILURE(monkey.deferredStatus)) {
3887 continue;
3888 }
3889
3890 const int EXPECTEDSIZE = 50;
3891 int expected[EXPECTEDSIZE];
3892 int expectedcount = 0;
3893
3894 monkey.setText(ustr);
3895
3896 int i;
3897 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3898 if (expectedcount >= EXPECTEDSIZE) {
3899 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3900 return;
3901 }
3902 expected[expectedcount ++] = i;
3903 }
3904
3905 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3906 }
3907 delete bi;
3908 #endif
3909 }
3910
TestSentBreaks()3911 void RBBITest::TestSentBreaks()
3912 {
3913 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3914 Locale locale("en");
3915 UErrorCode status = U_ZERO_ERROR;
3916 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3917 char16_t str[200];
3918 static const char *strlist[] =
3919 {
3920 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3921 "This\n",
3922 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3923 "\"Sentence ending with a quote.\" Bye.",
3924 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3925 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3926 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3927 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3928 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3929 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3930 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3931 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3932 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3933 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3934 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3935 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3936 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3937 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3938 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3939 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3940 };
3941 int loop;
3942 if (U_FAILURE(status)) {
3943 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3944 return;
3945 }
3946 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3947 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3948 UnicodeString ustr(str);
3949
3950 RBBISentMonkey monkey;
3951 if (U_FAILURE(monkey.deferredStatus)) {
3952 continue;
3953 }
3954
3955 const int EXPECTEDSIZE = 50;
3956 int expected[EXPECTEDSIZE];
3957 int expectedcount = 0;
3958
3959 monkey.setText(ustr);
3960
3961 int i;
3962 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3963 if (expectedcount >= EXPECTEDSIZE) {
3964 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3965 return;
3966 }
3967 expected[expectedcount ++] = i;
3968 }
3969
3970 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3971 }
3972 delete bi;
3973 #endif
3974 }
3975
TestMonkey()3976 void RBBITest::TestMonkey() {
3977 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3978
3979 UErrorCode status = U_ZERO_ERROR;
3980 int32_t loopCount = 500;
3981 int32_t seed = 1;
3982 UnicodeString breakType = "all";
3983 Locale locale("en");
3984 UBool useUText = false;
3985 UBool scalarsOnly = false;
3986 std::string exportPath;
3987
3988 if (quick == false) {
3989 loopCount = 10000;
3990 }
3991
3992 if (fTestParams) {
3993 UnicodeString p(fTestParams);
3994 loopCount = getIntParam("loop", p, loopCount);
3995 seed = getIntParam("seed", p, seed);
3996
3997 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3998 if (m.find()) {
3999 breakType = m.group(1, status);
4000 m.reset();
4001 p = m.replaceFirst("", status);
4002 }
4003
4004 RegexMatcher u(" *utext", p, 0, status);
4005 if (u.find()) {
4006 useUText = true;
4007 u.reset();
4008 p = u.replaceFirst("", status);
4009 }
4010
4011 RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
4012 if (pathMatcher.find()) {
4013 pathMatcher.group(1, status).toUTF8String(exportPath);
4014 pathMatcher.reset();
4015 p = pathMatcher.replaceFirst("", status);
4016 }
4017
4018 RegexMatcher s(" *scalars_only", p, 0, status);
4019 if (s.find()) {
4020 scalarsOnly = true;
4021 s.reset();
4022 p = s.replaceFirst("", status);
4023 }
4024
4025 // m.reset(p);
4026 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4027 // Each option is stripped out of the option string as it is processed.
4028 // All options have been checked. The option string should have been completely emptied..
4029 char buf[100];
4030 p.extract(buf, sizeof(buf), nullptr, status);
4031 buf[sizeof(buf)-1] = 0;
4032 errln("Unrecognized or extra parameter: %s\n", buf);
4033 return;
4034 }
4035
4036 }
4037
4038 if (breakType == "char" || breakType == "all") {
4039 FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
4040 RBBICharMonkey m;
4041 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4042 if (U_SUCCESS(status)) {
4043 RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
4044 if (breakType == "all" && useUText==false) {
4045 // Also run a quick test with UText when "all" is specified
4046 RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
4047 }
4048 }
4049 else {
4050 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4051 }
4052 delete bi;
4053 if (file != nullptr) {
4054 fclose(file);
4055 }
4056 }
4057
4058 if (breakType == "word" || breakType == "all") {
4059 logln("Word Break Monkey Test");
4060 FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
4061 RBBIWordMonkey m;
4062 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4063 if (U_SUCCESS(status)) {
4064 RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
4065 }
4066 else {
4067 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4068 }
4069 delete bi;
4070 if (file != nullptr) {
4071 fclose(file);
4072 }
4073 }
4074
4075 if (breakType == "line" || breakType == "all") {
4076 logln("Line Break Monkey Test");
4077 FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
4078 RBBILineMonkey m;
4079 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4080 if (loopCount >= 10) {
4081 loopCount = loopCount / 5; // Line break runs slower than the others.
4082 }
4083 if (U_SUCCESS(status)) {
4084 RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
4085 }
4086 else {
4087 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4088 }
4089 delete bi;
4090 if (file != nullptr) {
4091 fclose(file);
4092 }
4093 }
4094
4095 if (breakType == "sent" || breakType == "all" ) {
4096 logln("Sentence Break Monkey Test");
4097 FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
4098 RBBISentMonkey m;
4099 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4100 if (loopCount >= 10) {
4101 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4102 }
4103 if (U_SUCCESS(status)) {
4104 RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
4105 }
4106 else {
4107 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4108 }
4109 delete bi;
4110 if (file != nullptr) {
4111 fclose(file);
4112 }
4113 }
4114
4115 #endif
4116 }
4117
4118 //
4119 // Run a RBBI monkey test. Common routine, for all break iterator types.
4120 // Parameters:
4121 // bi - the break iterator to use
4122 // mk - MonkeyKind, abstraction for obtaining expected results
4123 // name - Name of test (char, word, etc.) for use in error messages
4124 // seed - Seed for starting random number generator (parameter from user)
4125 // numIterations
4126 // exportFile - Pointer to a file to which the test cases will be written in
4127 // UCD format. May be null.
4128 // scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
4129 // arbitrary sequences of code points (including unpaired surrogates)
4130 // are tested.
4131 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText,FILE * exportFile,UBool scalarsOnly)4132 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4133 int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {
4134
4135 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4136
4137 const int32_t TESTSTRINGLEN = 500;
4138 UnicodeString testText;
4139 int32_t numCharClasses;
4140 UVector *chClasses;
4141 int expectedCount = 0;
4142 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4143 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4144 char reverseBreaks[TESTSTRINGLEN*2+1];
4145 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4146 char followingBreaks[TESTSTRINGLEN*2+1];
4147 char precedingBreaks[TESTSTRINGLEN*2+1];
4148 int i;
4149 int loopCount = 0;
4150
4151
4152 m_seed = seed;
4153
4154 numCharClasses = mk.charClasses()->size();
4155 chClasses = mk.charClasses();
4156
4157 // Check for errors that occurred during the construction of the MonkeyKind object.
4158 // Can't report them where they occurred because errln() is a method coming from intlTest,
4159 // and is not visible outside of RBBITest :-(
4160 if (U_FAILURE(mk.deferredStatus)) {
4161 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4162 return;
4163 }
4164
4165 // Verify that the character classes all have at least one member.
4166 for (i=0; i<numCharClasses; i++) {
4167 UnicodeSet *s = static_cast<UnicodeSet *>(chClasses->elementAt(i));
4168 if (s == nullptr || s->size() == 0) {
4169 errln("Character Class #%d is null or of zero size.", i);
4170 return;
4171 }
4172 }
4173
4174 // For minimizing width of class name output.
4175 int classNameSize = mk.maxClassNameSize();
4176
4177 while (loopCount < numIterations || numIterations == -1) {
4178 if (numIterations == -1 && loopCount % 10 == 0) {
4179 // If test is running in an infinite loop, display a periodic tic so
4180 // we can tell that it is making progress.
4181 fprintf(stderr, ".");
4182 }
4183 // Save current random number seed, so that we can recreate the random numbers
4184 // for this loop iteration in event of an error.
4185 seed = m_seed;
4186
4187 // Populate a test string with data.
4188 testText.truncate(0);
4189 for (i=0; i<TESTSTRINGLEN; i++) {
4190 int32_t aClassNum = m_rand() % numCharClasses;
4191 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4192 int32_t charIdx = m_rand() % classSet->size();
4193 UChar32 c = classSet->charAt(charIdx);
4194 if (c < 0) { // TODO: deal with sets containing strings.
4195 errln("%s:%d c < 0", __FILE__, __LINE__);
4196 break;
4197 }
4198 if (scalarsOnly && U16_IS_SURROGATE(c)) {
4199 continue;
4200 }
4201 // Do not assemble a supplementary character from randomly generated separate surrogates.
4202 // (It could be a dictionary character)
4203 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4204 continue;
4205 }
4206
4207 testText.append(c);
4208 }
4209
4210 // Calculate the expected results for this test string and reset applied rules.
4211 mk.setText(testText);
4212
4213 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4214 expectedBreaks[0] = 1;
4215 int32_t breakPos = 0;
4216 expectedCount = 0;
4217 for (;;) {
4218 breakPos = mk.next(breakPos);
4219 if (breakPos == -1) {
4220 break;
4221 }
4222 if (breakPos > testText.length()) {
4223 errln("breakPos > testText.length()");
4224 }
4225 expectedBreaks[breakPos] = 1;
4226 expectedCount++;
4227 U_ASSERT(expectedCount<testText.length());
4228 (void)expectedCount; // Used by U_ASSERT().
4229 }
4230
4231 // Find the break positions using forward iteration
4232 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4233 if (useUText) {
4234 UErrorCode status = U_ZERO_ERROR;
4235 UText *testUText = utext_openReplaceable(nullptr, &testText, &status);
4236 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4237 bi->setText(testUText, status);
4238 TEST_ASSERT_SUCCESS(status);
4239 utext_close(testUText); // The break iterator does a shallow clone of the UText
4240 // This UText can be closed immediately, so long as the
4241 // testText string continues to exist.
4242 } else {
4243 bi->setText(testText);
4244 }
4245
4246 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4247 if (i < 0 || i > testText.length()) {
4248 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4249 break;
4250 }
4251 forwardBreaks[i] = 1;
4252 }
4253
4254 // Find the break positions using reverse iteration
4255 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4256 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4257 if (i < 0 || i > testText.length()) {
4258 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4259 break;
4260 }
4261 reverseBreaks[i] = 1;
4262 }
4263
4264 // Find the break positions using isBoundary() tests.
4265 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4266 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4267 for (i=0; i<=testText.length(); i++) {
4268 isBoundaryBreaks[i] = bi->isBoundary(i);
4269 }
4270
4271
4272 // Find the break positions using the following() function.
4273 // printf(".");
4274 memset(followingBreaks, 0, sizeof(followingBreaks));
4275 int32_t lastBreakPos = 0;
4276 followingBreaks[0] = 1;
4277 for (i=0; i<testText.length(); i++) {
4278 breakPos = bi->following(i);
4279 if (breakPos <= i ||
4280 breakPos < lastBreakPos ||
4281 breakPos > testText.length() ||
4282 (breakPos > lastBreakPos && lastBreakPos > i)) {
4283 errln("%s break monkey test: "
4284 "Out of range value returned by BreakIterator::following().\n"
4285 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4286 name, seed, i, breakPos, lastBreakPos);
4287 break;
4288 }
4289 followingBreaks[breakPos] = 1;
4290 lastBreakPos = breakPos;
4291 }
4292
4293 // Find the break positions using the preceding() function.
4294 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4295 lastBreakPos = testText.length();
4296 precedingBreaks[testText.length()] = 1;
4297 for (i=testText.length(); i>0; i--) {
4298 breakPos = bi->preceding(i);
4299 if (breakPos >= i ||
4300 breakPos > lastBreakPos ||
4301 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4302 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4303 errln("%s break monkey test: "
4304 "Out of range value returned by BreakIterator::preceding().\n"
4305 "index=%d; prev returned %d; lastBreak=%d" ,
4306 name, i, breakPos, lastBreakPos);
4307 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4308 precedingBreaks[i] = 2; // Forces an error.
4309 }
4310 } else {
4311 if (breakPos >= 0) {
4312 precedingBreaks[breakPos] = 1;
4313 }
4314 lastBreakPos = breakPos;
4315 }
4316 }
4317
4318 if (exportFile != nullptr) {
4319 for (i = 0; i < testText.length();) {
4320 fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
4321 char32_t const c = testText.char32At(i);
4322 fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
4323 i += U16_LENGTH(c);
4324 }
4325 fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # \n" : "× # \n");
4326 }
4327
4328 // Compare the expected and actual results.
4329 for (i=0; i<=testText.length(); i++) {
4330 const char *errorType = nullptr;
4331 const char* currentBreakData = nullptr;
4332 if (forwardBreaks[i] != expectedBreaks[i]) {
4333 errorType = "next()";
4334 currentBreakData = forwardBreaks;
4335 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4336 errorType = "previous()";
4337 currentBreakData = reverseBreaks;
4338 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4339 errorType = "isBoundary()";
4340 currentBreakData = isBoundaryBreaks;
4341 } else if (followingBreaks[i] != expectedBreaks[i]) {
4342 errorType = "following()";
4343 currentBreakData = followingBreaks;
4344 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4345 errorType = "preceding()";
4346 currentBreakData = precedingBreaks;
4347 }
4348
4349 if (errorType != nullptr) {
4350 // Format a range of the test text that includes the failure as
4351 // a data item that can be included in the rbbi test data file.
4352
4353 // Start of the range is the last point where expected and actual results
4354 // both agreed that there was a break position.
4355
4356 int startContext = i;
4357 int32_t count = 0;
4358 for (;;) {
4359 if (startContext==0) { break; }
4360 startContext --;
4361 if (expectedBreaks[startContext] != 0) {
4362 if (count == 2) break;
4363 count ++;
4364 }
4365 }
4366
4367 // End of range is two expected breaks past the start position.
4368 int endContext = i + 1;
4369 int ci;
4370 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4371 for (;;) {
4372 if (endContext >= testText.length()) {break;}
4373 if (expectedBreaks[endContext-1] != 0) {
4374 if (count == 0) break;
4375 count --;
4376 }
4377 endContext ++;
4378 }
4379 }
4380
4381 // Formatting of each line includes:
4382 // character code
4383 // reference break: '|' -> a break, '.' -> no break
4384 // actual break: '|' -> a break, '.' -> no break
4385 // (name of character clase)
4386 // Unicode name of character
4387 // '-->' indicates location of the difference.
4388
4389 MONKEY_ERROR(
4390 (expectedBreaks[i] ? "Break expected but not found" :
4391 "Break found but not expected"),
4392 name, i, seed);
4393
4394 for (ci = startContext;; (ci = testText.moveIndex32(ci, 1))) {
4395 UChar32 c;
4396 c = testText.char32At(ci);
4397
4398 std::string currentLineFlag = " ";
4399 if (ci == i) {
4400 currentLineFlag = "-->"; // Error position
4401 }
4402
4403 // BMP or SMP character in hex
4404 char hexCodePoint[12];
4405 std::string format = " \\u%04x";
4406 if (c >= 0x10000) {
4407 format = "\\U%08x";
4408 }
4409 snprintf(hexCodePoint, sizeof(hexCodePoint), format.c_str(), c);
4410
4411 // Get the class name and character name for the character.
4412 char cName[200];
4413 UErrorCode status = U_ZERO_ERROR;
4414 u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4415
4416 char buffer[200];
4417 auto ret = snprintf(buffer, sizeof(buffer),
4418 "%4s %3i : %1s %1s %10s %-*s %-40s %-40s",
4419 currentLineFlag.c_str(),
4420 ci,
4421 expectedBreaks[ci] == 0 ? "." : "|", // Reference break
4422 currentBreakData[ci] == 0 ? "." : "|", // Actual break
4423 hexCodePoint,
4424 classNameSize,
4425 mk.classNameFromCodepoint(c).c_str(),
4426 mk.getAppliedRule(ci).c_str(), cName);
4427 (void)ret;
4428 U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4429
4430 // Output the error
4431 if (ci == i) {
4432 errln(buffer);
4433 } else {
4434 infoln(buffer);
4435 }
4436
4437 if (ci >= endContext) { break; }
4438 }
4439 break;
4440 }
4441 }
4442
4443 loopCount++;
4444 }
4445 #endif
4446 }
4447
4448
4449 // Bug 5532. UTF-8 based UText fails in dictionary code.
4450 // This test checks the initial patch,
4451 // which is to just keep it from crashing. Correct word boundaries
4452 // await a proper fix to the dictionary code.
4453 //
TestBug5532()4454 void RBBITest::TestBug5532() {
4455 // Text includes a mixture of Thai and Latin.
4456 const unsigned char utf8Data[] = {
4457 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4458 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4459 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4460 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4461 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4462 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4463 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4464 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4465 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4466 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4467 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4468
4469 UErrorCode status = U_ZERO_ERROR;
4470 UText utext=UTEXT_INITIALIZER;
4471 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4472 TEST_ASSERT_SUCCESS(status);
4473
4474 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4475 TEST_ASSERT_SUCCESS(status);
4476 if (U_SUCCESS(status)) {
4477 bi->setText(&utext, status);
4478 TEST_ASSERT_SUCCESS(status);
4479
4480 int32_t breakCount = 0;
4481 int32_t previousBreak = -1;
4482 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4483 // For now, just make sure that the break iterator doesn't hang.
4484 TEST_ASSERT(previousBreak < bi->current());
4485 previousBreak = bi->current();
4486 }
4487 TEST_ASSERT(breakCount > 0);
4488 }
4489 delete bi;
4490 utext_close(&utext);
4491 }
4492
4493
TestBug9983()4494 void RBBITest::TestBug9983() {
4495 UnicodeString text = UnicodeString("\\u002A" // * Other
4496 "\\uFF65" // Other
4497 "\\u309C" // Katakana
4498 "\\uFF9F" // Extend
4499 "\\uFF65" // Other
4500 "\\u0020" // Other
4501 "\\u0000").unescape();
4502
4503 UErrorCode status = U_ZERO_ERROR;
4504 LocalPointer<RuleBasedBreakIterator> brkiter(dynamic_cast<RuleBasedBreakIterator *>(
4505 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4506 TEST_ASSERT_SUCCESS(status);
4507 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(dynamic_cast<RuleBasedBreakIterator *>(
4508 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4509 TEST_ASSERT_SUCCESS(status);
4510 if (U_FAILURE(status)) {
4511 return;
4512 }
4513 int32_t offset, rstatus, iterationCount;
4514
4515 brkiter->setText(text);
4516 brkiter->last();
4517 iterationCount = 0;
4518 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4519 iterationCount++;
4520 rstatus = brkiter->getRuleStatus();
4521 (void)rstatus; // Suppress set but not used warning.
4522 if (iterationCount >= 10) {
4523 break;
4524 }
4525 }
4526 TEST_ASSERT(iterationCount == 6);
4527
4528 brkiterPOSIX->setText(text);
4529 brkiterPOSIX->last();
4530 iterationCount = 0;
4531 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4532 iterationCount++;
4533 rstatus = brkiterPOSIX->getRuleStatus();
4534 (void)rstatus; // Suppress set but not used warning.
4535 if (iterationCount >= 10) {
4536 break;
4537 }
4538 }
4539 TEST_ASSERT(iterationCount == 6);
4540 }
4541
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4543 //
TestBug7547()4544 void RBBITest::TestBug7547() {
4545 UnicodeString rules;
4546 UErrorCode status = U_ZERO_ERROR;
4547 UParseError parseError;
4548 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4549 if (status != U_BRK_RULE_SYNTAX) {
4550 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4551 }
4552 if (parseError.line != 1 || parseError.offset != 0) {
4553 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4554 }
4555 }
4556
4557
TestBug12797()4558 void RBBITest::TestBug12797() {
4559 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4560 UErrorCode status = U_ZERO_ERROR;
4561 UParseError parseError;
4562 RuleBasedBreakIterator bi(rules, parseError, status);
4563 if (U_FAILURE(status)) {
4564 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4565 return;
4566 }
4567 UnicodeString text = "abc";
4568 bi.setText(text);
4569 bi.first();
4570 int32_t boundary = bi.next();
4571 if (boundary != 3) {
4572 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4573 }
4574 }
4575
TestBug12918()4576 void RBBITest::TestBug12918() {
4577 // This test triggers an assertion failure in dictbe.cpp
4578 const char16_t *crasherString = u"\u3325\u4a16";
4579 UErrorCode status = U_ZERO_ERROR;
4580 UBreakIterator* iter = ubrk_open(UBRK_WORD, nullptr, crasherString, -1, &status);
4581 if (U_FAILURE(status)) {
4582 dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4583 return;
4584 }
4585 ubrk_first(iter);
4586 int32_t pos = 0;
4587 int32_t lastPos = -1;
4588 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4589 if (pos <= lastPos) {
4590 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4591 break;
4592 }
4593 }
4594 ubrk_close(iter);
4595 }
4596
TestBug12932()4597 void RBBITest::TestBug12932() {
4598 // Node Stack overflow in the RBBI rule parser caused a seg fault.
4599 UnicodeString ruleStr(
4600 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4601 "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4602 "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4603 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4604 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4605 ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4606
4607 UErrorCode status = U_ZERO_ERROR;
4608 UParseError parseError;
4609 RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4610 if (status != U_BRK_RULE_SYNTAX) {
4611 errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4612 __FILE__, __LINE__, u_errorName(status));
4613 }
4614 }
4615
4616
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
// remain undivided by ICU char, word and line break.
TestEmoji()4619 void RBBITest::TestEmoji() {
4620 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4621 UErrorCode status = U_ZERO_ERROR;
4622
4623 CharString testFileName;
4624 testFileName.append(IntlTest::getSourceTestData(status), status);
4625 testFileName.appendPathPart("emoji-test.txt", status);
4626 if (U_FAILURE(status)) {
4627 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4628 return;
4629 }
4630 logln("Opening data file %s\n", testFileName.data());
4631
4632 int len;
4633 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4634 if (U_FAILURE(status) || testFile == nullptr) {
4635 errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4636 return;
4637 }
4638 UnicodeString testFileAsString(testFile, len);
4639 delete [] testFile;
4640
4641 RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4642 RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4643 // hexMatcher group(1) is a hex number, or empty string if no hex number present.
4644 int32_t lineNumber = 0;
4645
4646 LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4647 LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4648 LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4649 if (U_FAILURE(status)) {
4650 dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4651 return;
4652 }
4653
4654 while (lineMatcher.find()) {
4655 ++lineNumber;
4656 UnicodeString line = lineMatcher.group(status);
4657 hexMatcher.reset(line);
4658 UnicodeString testString; // accumulates the emoji sequence.
4659 while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4660 UnicodeString hex = hexMatcher.group(1, status);
4661 if (hex.length() > 8) {
4662 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4663 break;
4664 }
4665 CharString hex8;
4666 hex8.appendInvariantChars(hex, status);
4667 UChar32 c = (UChar32)strtol(hex8.data(), nullptr, 16);
4668 if (c<=0x10ffff) {
4669 testString.append(c);
4670 } else {
4671 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4672 __FILE__, __LINE__, lineNumber, hex8.data());
4673 break;
4674 }
4675 }
4676
4677 if (testString.length() > 1) {
4678 charBreaks->setText(testString);
4679 charBreaks->first();
4680 int32_t firstBreak = charBreaks->next();
4681 if (testString.length() != firstBreak) {
4682 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4683 __FILE__, __LINE__, lineNumber, firstBreak);
4684 }
4685 wordBreaks->setText(testString);
4686 wordBreaks->first();
4687 firstBreak = wordBreaks->next();
4688 if (testString.length() != firstBreak) {
4689 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4690 __FILE__, __LINE__, lineNumber, firstBreak);
4691 }
4692 lineBreaks->setText(testString);
4693 lineBreaks->first();
4694 firstBreak = lineBreaks->next();
4695 if (testString.length() != firstBreak) {
4696 errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
4697 __FILE__, __LINE__, lineNumber, firstBreak);
4698 }
4699 }
4700 }
4701 #endif
4702 }
4703
4704
4705 // TestBug12519 - Correct handling of Locales by assignment / copy / clone
4706
TestBug12519()4707 void RBBITest::TestBug12519() {
4708 UErrorCode status = U_ZERO_ERROR;
4709 LocalPointer<RuleBasedBreakIterator> biEn(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4710 LocalPointer<RuleBasedBreakIterator> biFr(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getFrance(), status)));
4711 if (!assertSuccess(WHERE, status)) {
4712 dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4713 return;
4714 }
4715 assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4716
4717 assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4718 assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4719
4720 LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4721 assertTrue(WHERE, *biEn == *cloneEn);
4722 assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4723
4724 LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4725 assertTrue(WHERE, *biFr == *cloneFr);
4726 assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4727
4728 LocalPointer<RuleBasedBreakIterator>biDe(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getGerman(), status)));
4729 UnicodeString text("Hallo Welt");
4730 biDe->setText(text);
4731 assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4732 *biDe = *biFr;
4733 assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4734 }
4735
TestBug12677()4736 void RBBITest::TestBug12677() {
4737 // Check that stripping of comments from rules for getRules() is not confused by
4738 // the presence of '#' characters in the rules that do not introduce comments.
4739 UnicodeString rules(u"!!forward; \n"
4740 "$x = [ab#]; # a set with a # literal. \n"
4741 " # .; # a comment that looks sort of like a rule. \n"
4742 " '#' '?'; # a rule with a quoted # \n"
4743 );
4744
4745 UErrorCode status = U_ZERO_ERROR;
4746 UParseError pe;
4747 RuleBasedBreakIterator bi(rules, pe, status);
4748 assertSuccess(WHERE, status);
4749 UnicodeString rtRules = bi.getRules();
4750 assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules);
4751 }
4752
4753
TestTableRedundancies()4754 void RBBITest::TestTableRedundancies() {
4755 UErrorCode status = U_ZERO_ERROR;
4756
4757 LocalPointer<RuleBasedBreakIterator> bi (
4758 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4759 assertSuccess(WHERE, status);
4760 if (U_FAILURE(status)) return;
4761
4762 RBBIDataWrapper *dw = bi->fData;
4763 const RBBIStateTable *fwtbl = dw->fForwardTable;
4764 UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4765 int32_t numCharClasses = dw->fHeader->fCatCount;
4766 // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
4767
4768 // Check for duplicate columns (character categories)
4769
4770 std::vector<UnicodeString> columns;
4771 for (int32_t column = 0; column < numCharClasses; column++) {
4772 UnicodeString s;
4773 for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4774 RBBIStateTableRow *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>(fwtbl->fTableData + (fwtbl->fRowLen * r)));
4775 s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4776 }
4777 columns.push_back(s);
4778 }
4779 // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4780 for (int c1=1; c1<numCharClasses; c1++) {
4781 int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4782 for (int c2 = c1+1; c2 < limit; c2++) {
4783 if (columns.at(c1) == columns.at(c2)) {
4784 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4785 goto out;
4786 }
4787 }
4788 }
4789 out:
4790
4791 // Check for duplicate states
4792 std::vector<UnicodeString> rows;
4793 for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4794 UnicodeString s;
4795 RBBIStateTableRow *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>((fwtbl->fTableData + (fwtbl->fRowLen * r))));
4796 if (in8Bits) {
4797 s.append(row->r8.fAccepting);
4798 s.append(row->r8.fLookAhead);
4799 s.append(row->r8.fTagsIdx);
4800 for (int32_t column = 0; column < numCharClasses; column++) {
4801 s.append(row->r8.fNextState[column]);
4802 }
4803 } else {
4804 s.append(row->r16.fAccepting);
4805 s.append(row->r16.fLookAhead);
4806 s.append(row->r16.fTagsIdx);
4807 for (int32_t column = 0; column < numCharClasses; column++) {
4808 s.append(row->r16.fNextState[column]);
4809 }
4810 }
4811 rows.push_back(s);
4812 }
4813 for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4814 for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4815 if (rows.at(r1) == rows.at(r2)) {
4816 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4817 return;
4818 }
4819 }
4820 }
4821 }
4822
4823 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4824 // even after next() has returned DONE.
4825
TestBug13447()4826 void RBBITest::TestBug13447() {
4827 UErrorCode status = U_ZERO_ERROR;
4828 LocalPointer<RuleBasedBreakIterator> bi(
4829 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4830 assertSuccess(WHERE, status);
4831 if (U_FAILURE(status)) return;
4832 UnicodeString data(u"1234");
4833 bi->setText(data);
4834 assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4835 assertEquals(WHERE, 4, bi->next());
4836 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4837 assertEquals(WHERE, UBRK_DONE, bi->next());
4838 assertEquals(WHERE, 4, bi->current());
4839 assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4840 }
4841
4842 // TestReverse exercises both the synthesized safe reverse rules and the logic
4843 // for filling the break iterator cache when starting from random positions
4844 // in the text.
4845 //
4846 // It's a monkey test, working on random data, with the expected data obtained
4847 // from forward iteration (no safe rules involved), comparing with results
4848 // when indexing into the interior of the string (safe rules needed).
4849
TestReverse()4850 void RBBITest::TestReverse() {
4851 UErrorCode status = U_ZERO_ERROR;
4852
4853 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4854 BreakIterator::createCharacterInstance(Locale::getEnglish(), status))));
4855 assertSuccess(WHERE, status, true);
4856 status = U_ZERO_ERROR;
4857 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4858 BreakIterator::createWordInstance(Locale::getEnglish(), status))));
4859 assertSuccess(WHERE, status, true);
4860 status = U_ZERO_ERROR;
4861 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4862 BreakIterator::createLineInstance(Locale::getEnglish(), status))));
4863 assertSuccess(WHERE, status, true);
4864 status = U_ZERO_ERROR;
4865 TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4866 BreakIterator::createSentenceInstance(Locale::getEnglish(), status))));
4867 assertSuccess(WHERE, status, true);
4868 }
4869
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4870 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4871 if (!bi) {
4872 return;
4873 }
4874
4875 // From the mapping trie in the break iterator's internal data, create a
4876 // vector of UnicodeStrings, one for each character category, containing
4877 // all of the code points that map to that category. Unicode planes 0 and 1 only,
4878 // to avoid an execess of unassigned code points.
4879
4880 RBBIDataWrapper *data = bi->fData;
4881 int32_t categoryCount = data->fHeader->fCatCount;
4882 UCPTrie *trie = data->fTrie;
4883 bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4884 uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4885
4886 std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4887 for (int cp=0; cp<0x1fff0; ++cp) {
4888 int cat = ucptrie_get(trie, cp);
4889 cat &= ~dictBit; // And off the dictionary bit from the category.
4890 assertTrue(WHERE, cat < categoryCount && cat >= 0);
4891 if (cat < 0 || cat >= categoryCount) return;
4892 strings[cat].append(cp);
4893 }
4894
4895 icu_rand randomGen;
4896 const int testStringLength = 10000;
4897 UnicodeString testString;
4898
4899 for (int i=0; i<testStringLength; ++i) {
4900 int charClass = randomGen() % categoryCount;
4901 if (strings[charClass].length() > 0) {
4902 int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4903 testString.append(cp);
4904 }
4905 }
4906
4907 typedef std::pair<UBool, int32_t> Result;
4908 std::vector<Result> expectedResults;
4909 bi->setText(testString);
4910 for (int i=0; i<testString.length(); ++i) {
4911 bool isboundary = bi->isBoundary(i);
4912 int ruleStatus = bi->getRuleStatus();
4913 expectedResults.emplace_back(isboundary, ruleStatus);
4914 }
4915
4916 for (int i=testString.length()-1; i>=0; --i) {
4917 bi->setText(testString); // clears the internal break cache
4918 Result expected = expectedResults[i];
4919 assertEquals(WHERE, expected.first, bi->isBoundary(i));
4920 assertEquals(WHERE, expected.second, bi->getRuleStatus());
4921 }
4922 }
4923
4924
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run more than fifteen minutes, which is to say, the failure was noticeable.
4928
TestBug13692()4929 void RBBITest::TestBug13692() {
4930 UErrorCode status = U_ZERO_ERROR;
4931 LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4932 BreakIterator::createWordInstance(Locale::getEnglish(), status)), status);
4933 if (!assertSuccess(WHERE, status, true)) {
4934 return;
4935 }
4936 constexpr int32_t LENGTH = 1000000;
4937 UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4938 for (int i=0; i<20; i+=2) {
4939 longNumber.setCharAt(i, u' ');
4940 }
4941 bi->setText(longNumber);
4942 assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4943 assertSuccess(WHERE, status);
4944 }
4945
4946
TestProperties()4947 void RBBITest::TestProperties() {
4948 UErrorCode errorCode = U_ZERO_ERROR;
4949 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4950 if (!prependSet.isEmpty()) {
4951 errln(
4952 "[:GCB=Prepend:] is not empty any more. "
4953 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4954 "change this test to the opposite condition.");
4955 }
4956 }
4957
4958
4959 //
4960 // TestDebug - A place-holder test for debugging purposes.
4961 // For putting in fragments of other tests that can be invoked
4962 // for tracing without a lot of unwanted extra stuff happening.
4963 //
TestDebug()4964 void RBBITest::TestDebug() {
4965 UErrorCode status = U_ZERO_ERROR;
4966 LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4967 BreakIterator::createCharacterInstance(Locale::getEnglish(), status)), status);
4968 if (!assertSuccess(WHERE, status, true)) {
4969 return;
4970 }
4971 const UnicodeString &rules = bi->getRules();
4972 UParseError pe;
4973 LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4974 assertSuccess(WHERE, status);
4975 }
4976
4977
4978 //
4979 // TestDebugRules A stub test for use in debugging rule compilation problems.
4980 // Can be freely altered as needed or convenient.
//                  Leave disabled - #ifdef'ed out - when not actively debugging. The rule source
4982 // data files may not be available in all environments.
4983 // Any permanent test cases should be moved to rbbitst.txt
4984 // (see Bug 20303 in that file, for example), or to another test function in this file.
4985 //
TestDebugRules()4986 void RBBITest::TestDebugRules() {
4987 #if 0
4988 const char16_t *rules = u""
4989 "!!quoted_literals_only; \n"
4990 "!!chain; \n"
4991 "!!lookAheadHardBreak; \n"
4992 " \n"
4993 // "[a] / ; \n"
4994 "[a] [b] / [c] [d]; \n"
4995 "[a] [b] / [c] [d] {100}; \n"
4996 "[x] [a] [b] / [c] [d] {100}; \n"
4997 "[a] [b] [c] / [d] {100}; \n"
4998 //" [c] [d] / [e] [f]; \n"
4999 //"[a] [b] / [c]; \n"
5000 ;
5001
5002 UErrorCode status = U_ZERO_ERROR;
5003 CharString path(pathToDataDirectory(), status);
5004 path.appendPathPart("brkitr", status);
5005 path.appendPathPart("rules", status);
5006 path.appendPathPart("line.txt", status);
5007 int len;
5008 std::unique_ptr<char16_t []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
5009 if (!assertSuccess(WHERE, status)) {
5010 return;
5011 }
5012
5013 UParseError pe;
5014 // rules = testFile.get();
5015 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
5016
5017 if (!assertSuccess(WHERE, status)) {
5018 delete bi;
5019 return;
5020 }
5021 // bi->dumpTables();
5022
5023 delete bi;
5024 #endif
5025 }
5026
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)5027 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
5028 UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
5029 int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
5030 // Text are duplicate characters from U+4E00 to U+4FFF
5031 UnicodeString text;
5032 for (char16_t c = 0x4e00; c < 0x5000; c++) {
5033 text.append(c).append(c);
5034 }
5035 // Generate rule which will caused length+4 character classes and
5036 // length+3 states
5037 UnicodeString rules(u"!!quoted_literals_only;");
5038 for (char16_t c = 0x4e00; c < 0x4e00 + numChar; c++) {
5039 rules.append(u'\'').append(c).append(c).append(u"';");
5040 }
5041 rules.append(u".;");
5042 UErrorCode status = U_ZERO_ERROR;
5043 UParseError parseError;
5044 RuleBasedBreakIterator bi(rules, parseError, status);
5045
5046 assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
5047 assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
5048 assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
5049 assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
5050 assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
5051
5052 bi.setText(text);
5053
5054 int32_t pos;
5055 int32_t i = 0;
5056 while ((pos = bi.next()) > 0) {
5057 // The first numChar should not break between the pair
5058 if (i++ < numChar) {
5059 assertEquals(WHERE, i * 2, pos);
5060 } else {
5061 // After the first numChar next(), break on each character.
5062 assertEquals(WHERE, i + numChar, pos);
5063 }
5064 }
5065 while ((pos = bi.previous()) > 0) {
5066 // The first numChar should not break between the pair
5067 if (--i < numChar) {
5068 assertEquals(WHERE, i * 2, pos);
5069 } else {
5070 // After the first numChar next(), break on each character.
5071 assertEquals(WHERE, i + numChar, pos);
5072 }
5073 }
5074 }
5075
Test8BitsTrieWith8BitStateTable()5076 void RBBITest::Test8BitsTrieWith8BitStateTable() {
5077 testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
5078 }
5079
Test16BitsTrieWith8BitStateTable()5080 void RBBITest::Test16BitsTrieWith8BitStateTable() {
5081 testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
5082 }
5083
Test16BitsTrieWith16BitStateTable()5084 void RBBITest::Test16BitsTrieWith16BitStateTable() {
5085 testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
5086 }
5087
Test8BitsTrieWith16BitStateTable()5088 void RBBITest::Test8BitsTrieWith16BitStateTable() {
5089 // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
5090 // create state table in 16 bits.
5091
5092 // Generate 510 'a' as text
5093 UnicodeString text;
5094 for (int32_t i = 0; i < 510; i++) {
5095 text.append(u'a');
5096 }
5097
5098 UnicodeString rules(u"!!quoted_literals_only;'");
5099 // 254 'a' in the rule will cause 256 states
5100 for (int32_t i = 0; i < 254; i++) {
5101 rules.append(u'a');
5102 }
5103 rules.append(u"';.;");
5104
5105 UErrorCode status = U_ZERO_ERROR;
5106 UParseError parseError;
5107 LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
5108
5109 assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
5110 assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
5111 assertEquals(WHERE,
5112 false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5113 bi->setText(text);
5114
5115 // break positions:
5116 // 254, 508, 509, ... 510
5117 assertEquals("next()", 254, bi->next());
5118 int32_t i = 0;
5119 int32_t pos;
5120 while ((pos = bi->next()) > 0) {
5121 assertEquals(WHERE, 508 + i , pos);
5122 i++;
5123 }
5124 i = 0;
5125 while ((pos = bi->previous()) > 0) {
5126 i++;
5127 if (pos >= 508) {
5128 assertEquals(WHERE, 510 - i , pos);
5129 } else {
5130 assertEquals(WHERE, 254 , pos);
5131 }
5132 }
5133 }
5134
5135 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5136 // that there are no problems with rules at the size that transitions between the two.
5137 //
5138 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5139 // one character class per character in the string. So we can make a rule to tickle the
5140 // boundaries by using literal strings of various lengths.
5141 //
5142 // For both the number of states and the number of character classes, the eight bit format
5143 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
5144 // leaving 120 something available. This test runs the string over the range of 120 - 130,
5145 // which allows some margin for changes to the number of values reserved by the rule builder
5146 // without breaking the test.
5147
TestTable_8_16_Bits()5148 void RBBITest::TestTable_8_16_Bits() {
5149
5150 // testStr serves as both the source of the rule string (truncated to the desired length)
5151 // and as test data to check matching behavior. A break rule consisting of the first 120
5152 // characters of testStr will match the first 120 chars of the full-length testStr.
5153 UnicodeString testStr;
5154 for (char16_t c=0x3000; c<0x3200; ++c) {
5155 testStr.append(c);
5156 }
5157
5158 const int32_t startLength = 120; // The shortest rule string to test.
5159 const int32_t endLength = 260; // The longest rule string to test
5160 const int32_t increment = this->quick ? endLength - startLength : 1;
5161
5162 for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5163 UParseError parseError;
5164 UErrorCode status = U_ZERO_ERROR;
5165
5166 UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5167 ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5168 RuleBasedBreakIterator bi(ruleString, parseError, status);
5169 if (!assertSuccess(WHERE, status)) {
5170 errln(ruleString);
5171 break;
5172 }
5173 // bi.dumpTables();
5174
5175 // Verify that the break iterator is functioning - that the first boundary found
5176 // in testStr is at the length of the rule string.
5177 bi.setText(testStr);
5178 assertEquals(WHERE, ruleLen, bi.next());
5179
5180 // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5181 // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5182 bi.setText(testStr);
5183 int32_t result = bi.preceding(ruleLen);
5184 assertEquals(WHERE, 0, result);
5185
5186 // Verify that the range of rule lengths being tested cover the translations
5187 // from 8 to 16 bit data.
5188 bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5189 bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5190
5191 if (ruleLen == startLength) {
5192 assertEquals(WHERE, true, has8BitRowData);
5193 assertEquals(WHERE, true, has8BitsTrie);
5194 }
5195 if (ruleLen == endLength) {
5196 assertEquals(WHERE, false, has8BitRowData);
5197 assertEquals(WHERE, false, has8BitsTrie);
5198 }
5199 }
5200 }
5201
5202 /* Test handling of a large number of look-ahead rules.
5203 * The number of rules in the test exceeds the implementation limits prior to the
5204 * improvements introduced with #13590.
5205 *
5206 * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5207 * The text being matched is sequential, "ABCDEFGHI..."
5208 *
5209 * The upshot is that the look-ahead rules all match on their preceding context,
5210 * and consequently must save a potential result, but then fail to match on their
5211 * trailing context, so that they don't actually cause a boundary.
5212 *
5213 * Additionally, add a ".*" rule, so there are no boundaries unless a
5214 * look-ahead hard-break rule forces one.
5215 */
TestBug13590()5216 void RBBITest::TestBug13590() {
5217 UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5218
5219 const int NUM_LOOKAHEAD_RULES = 50;
5220 const char16_t STARTING_CHAR = u'\u5000';
5221 char16_t firstChar;
5222 for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5223 firstChar = STARTING_CHAR + ruleNum*2;
5224 rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5225 .append(u' ') .append(u'/') .append(u' ')
5226 .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5227 .append(u';') .append(u'\n');
5228 }
5229
5230 // Change the last rule added from the form "UV / WY" to "UV / WX".
5231 // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5232 rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5233
5234 UErrorCode status = U_ZERO_ERROR;
5235 UParseError parseError;
5236 RuleBasedBreakIterator bi(rules, parseError, status);
5237 if (!assertSuccess(WHERE, status)) {
5238 errln(rules);
5239 return;
5240 }
5241 // bi.dumpTables();
5242
5243 UnicodeString testString;
5244 for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5245 testString.append(c);
5246 }
5247 bi.setText(testString);
5248
5249 int breaksFound = 0;
5250 while (bi.next() != UBRK_DONE) {
5251 ++breaksFound;
5252 }
5253
5254 // Two matches are expected, one from the last rule that was explicitly modified,
5255 // and one at the end of the text.
5256 assertEquals(WHERE, 2, breaksFound);
5257 }
5258
5259
5260 #if U_ENABLE_TRACING
// Records captured by the trace callbacks defined below. The callbacks only record
// function numbers in the range UTRACE_UBRK_START..UTRACE_UBRK_LIMIT, and
// SetupTestTrace() empties all four vectors before each test.
// String argument read from each recorded data callback (parallel to gDataFn).
static std::vector<std::string> gData;
// Function number of each recorded entry callback.
static std::vector<int32_t> gEntryFn;
// Function number of each recorded exit callback.
static std::vector<int32_t> gExitFn;
// Function number of each recorded data callback.
static std::vector<int32_t> gDataFn;
5265
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5266 static void U_CALLCONV traceData(
5267 const void*,
5268 int32_t fnNumber,
5269 int32_t,
5270 const char *,
5271 va_list args) {
5272 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5273 const char* data = va_arg(args, const char*);
5274 gDataFn.push_back(fnNumber);
5275 gData.push_back(data);
5276 }
5277 }
5278
traceEntry(const void *,int32_t fnNumber)5279 static void traceEntry(const void *, int32_t fnNumber) {
5280 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5281 gEntryFn.push_back(fnNumber);
5282 }
5283 }
5284
traceExit(const void *,int32_t fnNumber,const char *,va_list)5285 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5286 if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5287 gExitFn.push_back(fnNumber);
5288 }
5289 }
5290
5291
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5292 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5293 assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5294 assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5295 assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5296 assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5297
5298 if (expectedData == nullptr) {
5299 assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5300 assertEquals("utrace_data should not be called ", 0, gData.size());
5301 } else {
5302 assertEquals("utrace_data should be called ", 1, gDataFn.size());
5303 assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5304 assertEquals("utrace_data should be called ", 1, gData.size());
5305 assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5306 }
5307 }
5308
SetupTestTrace()5309 void SetupTestTrace() {
5310 gEntryFn.clear();
5311 gExitFn.clear();
5312 gDataFn.clear();
5313 gData.clear();
5314
5315 const void* context = nullptr;
5316 utrace_setFunctions(context, traceEntry, traceExit, traceData);
5317 utrace_setLevel(UTRACE_INFO);
5318 }
5319
TestTraceCreateCharacter()5320 void RBBITest::TestTraceCreateCharacter() {
5321 SetupTestTrace();
5322 IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5323 LocalPointer<BreakIterator> brkitr(
5324 BreakIterator::createCharacterInstance("zh-CN", status));
5325 status.errIfFailureAndReset();
5326 assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5327 }
5328
TestTraceCreateTitle()5329 void RBBITest::TestTraceCreateTitle() {
5330 SetupTestTrace();
5331 IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5332 LocalPointer<BreakIterator> brkitr(
5333 BreakIterator::createTitleInstance("zh-CN", status));
5334 status.errIfFailureAndReset();
5335 assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5336 }
5337
TestTraceCreateSentence()5338 void RBBITest::TestTraceCreateSentence() {
5339 SetupTestTrace();
5340 IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5341 LocalPointer<BreakIterator> brkitr(
5342 BreakIterator::createSentenceInstance("zh-CN", status));
5343 status.errIfFailureAndReset();
5344 assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5345 }
5346
TestTraceCreateWord()5347 void RBBITest::TestTraceCreateWord() {
5348 SetupTestTrace();
5349 IcuTestErrorCode status(*this, "TestTraceCreateWord");
5350 LocalPointer<BreakIterator> brkitr(
5351 BreakIterator::createWordInstance("zh-CN", status));
5352 status.errIfFailureAndReset();
5353 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5354 }
5355
TestTraceCreateLine()5356 void RBBITest::TestTraceCreateLine() {
5357 SetupTestTrace();
5358 IcuTestErrorCode status(*this, "TestTraceCreateLine");
5359 LocalPointer<BreakIterator> brkitr(
5360 BreakIterator::createLineInstance("zh-CN", status));
5361 status.errIfFailureAndReset();
5362 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5363 }
5364
TestTraceCreateLineStrict()5365 void RBBITest::TestTraceCreateLineStrict() {
5366 SetupTestTrace();
5367 IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5368 LocalPointer<BreakIterator> brkitr(
5369 BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5370 status.errIfFailureAndReset();
5371 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5372 }
5373
TestTraceCreateLineNormal()5374 void RBBITest::TestTraceCreateLineNormal() {
5375 SetupTestTrace();
5376 IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5377 LocalPointer<BreakIterator> brkitr(
5378 BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5379 status.errIfFailureAndReset();
5380 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5381 }
5382
TestTraceCreateLineLoose()5383 void RBBITest::TestTraceCreateLineLoose() {
5384 SetupTestTrace();
5385 IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5386 LocalPointer<BreakIterator> brkitr(
5387 BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5388 status.errIfFailureAndReset();
5389 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5390 }
5391
TestTraceCreateLineLoosePhrase()5392 void RBBITest::TestTraceCreateLineLoosePhrase() {
5393 SetupTestTrace();
5394 IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5395 LocalPointer<BreakIterator> brkitr(
5396 BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5397 status.errIfFailureAndReset();
5398 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5399 }
5400
TestTraceCreateLineNormalPhrase()5401 void RBBITest::TestTraceCreateLineNormalPhrase() {
5402 SetupTestTrace();
5403 IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5404 LocalPointer<BreakIterator> brkitr(
5405 BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5406 status.errIfFailureAndReset();
5407 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5408 }
5409
TestTraceCreateLineStrictPhrase()5410 void RBBITest::TestTraceCreateLineStrictPhrase() {
5411 SetupTestTrace();
5412 IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5413 LocalPointer<BreakIterator> brkitr(
5414 BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5415 status.errIfFailureAndReset();
5416 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5417 }
5418
TestTraceCreateLinePhrase()5419 void RBBITest::TestTraceCreateLinePhrase() {
5420 SetupTestTrace();
5421 IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5422 LocalPointer<BreakIterator> brkitr(
5423 BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5424 status.errIfFailureAndReset();
5425 assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5426 }
5427
TestTraceCreateBreakEngine()5428 void RBBITest::TestTraceCreateBreakEngine() {
5429 rbbi_cleanup();
5430 SetupTestTrace();
5431 IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5432 LocalPointer<BreakIterator> brkitr(
5433 BreakIterator::createWordInstance("zh-CN", status));
5434 status.errIfFailureAndReset();
5435 assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5436
5437 // To word break the following text, BreakIterator will create 5 dictionary
5438 // break engine internally.
5439 UnicodeString text(
5440 u"test "
5441 u"測試 " // Hani
5442 u"សាកល្បង " // Khmr
5443 u"ທົດສອບ " // Laoo
5444 u"စမ်းသပ်မှု " // Mymr
5445 u"ทดสอบ " // Thai
5446 u"test "
5447 );
5448 brkitr->setText(text);
5449
5450 // Loop through all the text.
5451 while (brkitr->next() > 0) ;
5452
5453 assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5454 assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5455 assertEquals("utrace_data should be called ", 5, gDataFn.size());
5456
5457 for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5458 assertEquals("utrace_entry should be called ",
5459 UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5460 assertEquals("utrace_exit should be called ",
5461 UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5462 assertEquals("utrace_data should be called ",
5463 UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5464 }
5465
5466 assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5467 assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5468 assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5469 assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5470 assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5471
5472 }
5473 #endif
5474
TestUnpairedSurrogate()5475 void RBBITest::TestUnpairedSurrogate() {
5476 UnicodeString rules(u"ab;");
5477
5478 UErrorCode status = U_ZERO_ERROR;
5479 UParseError pe;
5480 RuleBasedBreakIterator bi1(rules, pe, status);
5481 assertSuccess(WHERE, status);
5482 UnicodeString rtRules = bi1.getRules();
5483 // make sure the simple one work first.
5484 assertEquals(WHERE, rules, rtRules);
5485
5486
5487 rules = UnicodeString(u"a\\ud800b;").unescape();
5488 pe.line = 0;
5489 pe.offset = 0;
5490 RuleBasedBreakIterator bi2(rules, pe, status);
5491 assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5492 if (pe.line != 1 || pe.offset != 1) {
5493 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5494 }
5495
5496 status = U_ZERO_ERROR;
5497 rules = UnicodeString(u"a\\ude00b;").unescape();
5498 pe.line = 0;
5499 pe.offset = 0;
5500 RuleBasedBreakIterator bi3(rules, pe, status);
5501 assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5502 if (pe.line != 1 || pe.offset != 1) {
5503 errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5504 }
5505
5506 // make sure the surrogate one work too.
5507 status = U_ZERO_ERROR;
5508 rules = UnicodeString(u"ab;");
5509 RuleBasedBreakIterator bi4(rules, pe, status);
5510 rtRules = bi4.getRules();
5511 assertEquals(WHERE, rules, rtRules);
5512 }
5513
5514 // Read file generated by
5515 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5516 // as test cases and compare the Output.
5517 // Format of the file
5518 // Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5519 // Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5520 // Input:\t[source text]
5521 // Output:\t[expected output separated by | ]
5522 // Input: ...
5523 // Output: ...
5524
runLSTMTestFromFile(const char * filename,UScriptCode script)5525 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5526 // The expectation in this test depends on LSTM, skip the test if the
5527 // configuration is not build with LSTM data.
5528 if (skipLSTMTest()) {
5529 return;
5530 }
5531 UErrorCode status = U_ZERO_ERROR;
5532 LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5533 if (U_FAILURE(status)) {
5534 errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5535 return;
5536 }
5537 // Open and read the test data file.
5538 const char *testDataDirectory = IntlTest::getSourceTestData(status);
5539 CharString testFileName(testDataDirectory, -1, status);
5540 testFileName.append(filename, -1, status);
5541
5542 int len;
5543 char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5544 if (U_FAILURE(status)) {
5545 errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5546 return;
5547 }
5548
5549 // Put the test data into a UnicodeString
5550 UnicodeString testString(false, testFile, len);
5551
5552 int32_t start = 0;
5553
5554 UnicodeString line;
5555 int32_t end;
5556 std::string actual_sep_str;
5557 int32_t caseNum = 0;
5558 // Iterate through all the lines in the test file.
5559 do {
5560 int32_t cr = testString.indexOf(u'\r', start);
5561 int32_t lf = testString.indexOf(u'\n', start);
5562 end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5563 line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5564 if (line.length() > 0) {
5565 // Separate each line to key and value by TAB.
5566 int32_t tab = line.indexOf(u'\t');
5567 UnicodeString key = line.tempSubString(0, tab);
5568 const UnicodeString value = line.tempSubString(tab+1);
5569
5570 if (key == "Model:") {
5571 // Verify the expectation in the test file match the LSTM model
5572 // we are using now.
5573 const LSTMData* data = CreateLSTMDataForScript(script, status);
5574 if (U_FAILURE(status)) {
5575 dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5576 __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5577 return;
5578 }
5579 UnicodeString name(LSTMDataName(data));
5580 DeleteLSTMData(data);
5581 if (value != name) {
5582 std::string utf8Name, utf8Value;
5583 dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5584 __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5585 name.toUTF8String<std::string>(utf8Name).c_str(),
5586 value.toUTF8String<std::string>(utf8Value).c_str());
5587 return;
5588 }
5589 } else if (key == "Input:") {
5590 UnicodeString input("prefix ");
5591 input += value + " suffix";
5592 std::stringstream ss;
5593
5594 // Construct the UText which is expected by the the engine as
5595 // input from the UnicodeString.
5596 UText ut = UTEXT_INITIALIZER;
5597 utext_openConstUnicodeString(&ut, &input, &status);
5598 if (U_FAILURE(status)) {
5599 dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5600 return;
5601 }
5602
5603 iterator->setText(&ut, status);
5604 if (U_FAILURE(status)) {
5605 errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5606 return;
5607 }
5608
5609 int32_t bp;
5610 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5611 ss << bp;
5612 if (bp != input.length()) {
5613 ss << ", ";
5614 }
5615 }
5616
5617 utext_close(&ut);
5618 // Turn the break points into a string for easy comparison
5619 // output.
5620 actual_sep_str = "{" + ss.str() + "}";
5621 } else if (key == "Output:" && !actual_sep_str.empty()) {
5622 UnicodeString input("prefix| |");
5623 input += value + "| |suffix";
5624 std::string d;
5625 int32_t sep;
5626 int32_t start = 0;
5627 int32_t curr = 0;
5628 std::stringstream ss;
5629 // Include 0 as the break point.
5630 ss << "0, ";
5631 while ((sep = input.indexOf(u'|', start)) >= 0) {
5632 int32_t len = sep - start;
5633 if (len > 0) {
5634 if (curr > 0) {
5635 ss << ", ";
5636 }
5637 curr += len;
5638 ss << curr;
5639 }
5640 start = sep + 1;
5641 }
5642 // Include end of the string as break point.
5643 ss << ", " << curr + input.length() - start;
5644 // Turn the break points into a string for easy comparison
5645 // output.
5646 std::string expected = "{" + ss.str() + "}";
5647 std::string utf8;
5648
5649 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5650 expected.c_str(), actual_sep_str.c_str());
5651 actual_sep_str.clear();
5652 }
5653 }
5654 start = std::max(cr, lf) + 1;
5655 } while (end >= 0);
5656
5657 delete [] testFile;
5658 }
5659
TestLSTMThai()5660 void RBBITest::TestLSTMThai() {
5661 runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5662 }
5663
TestLSTMBurmese()5664 void RBBITest::TestLSTMBurmese() {
5665 runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5666 }
5667
5668
5669 // Test preceding(index) and following(index), with semi-random indexes.
5670 // The random indexes are produced in clusters that are relatively closely spaced,
5671 // to increase the occurrences of hits to the internal break cache.
5672
TestRandomAccess()5673 void RBBITest::TestRandomAccess() {
5674 static constexpr int32_t CACHE_SIZE = 128;
5675
5676 UnicodeString testData;
5677 for (int i=0; i<CACHE_SIZE*2; ++i) {
5678 testData.append(u"aaaa\n");
5679 }
5680
5681 UErrorCode status = U_ZERO_ERROR;
5682 LocalPointer<RuleBasedBreakIterator> bi(
5683 dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)),
5684 status);
5685 if (!assertSuccess(WHERE, status)) { return; };
5686
5687 bi->setText(testData);
5688
5689 auto expectedPreceding = [](int from) {
5690 if (from == 0) {return UBRK_DONE;}
5691 if (from % 5 == 0) {return from - 5;}
5692 return from - (from % 5);
5693 };
5694
5695 auto expectedFollow = [testData](int from) {
5696 if (from >= testData.length()) {return UBRK_DONE;}
5697 if (from % 5 == 0) {return from + 5;}
5698 return from + (5 - (from % 5));
5699 };
5700
5701 auto randomStringIndex = [testData]() {
5702 static icu_rand randomGenerator; // produces random uint32_t values.
5703 static int lastNum;
5704 static int clusterCount;
5705 static constexpr int CLUSTER_SIZE = 100;
5706 static constexpr int CLUSTER_LENGTH = 10;
5707
5708 if (clusterCount < CLUSTER_LENGTH) {
5709 ++clusterCount;
5710 lastNum += (randomGenerator() % CLUSTER_SIZE);
5711 lastNum -= CLUSTER_SIZE / 2;
5712 lastNum = std::max(0, lastNum);
5713 // Deliberately test indexes > testData.length.
5714 lastNum = std::min(testData.length() + 5, lastNum);
5715 } else {
5716 clusterCount = 0;
5717 lastNum = randomGenerator() % testData.length();
5718 }
5719 return lastNum;
5720 };
5721
5722 for (int i=0; i<5000; ++i) {
5723 int idx = randomStringIndex();
5724 assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5725 idx = randomStringIndex();
5726 assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5727 }
5728 }
5729
5730 // A Fake Tai Le break engine which handle Unicode Tai Le (Tale) block
5731 // https://unicode.org/charts/PDF/U1950.pdf
5732 // U+1950 - U+197F and always break after Tone letters (U+1970-U+1974)
5733 class FakeTaiLeBreakEngine : public ExternalBreakEngine {
5734 public:
FakeTaiLeBreakEngine()5735 FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) {
5736 }
~FakeTaiLeBreakEngine()5737 virtual ~FakeTaiLeBreakEngine() {
5738 }
isFor(UChar32 c,const char *) const5739 virtual bool isFor(UChar32 c, const char* /* locale */) const override {
5740 // We implmement this for any locale, not return false for some langauge
5741 // here.
5742 return handles(c);
5743 }
handles(UChar32 c) const5744 virtual bool handles(UChar32 c) const override {
5745 return block.contains(c);
5746 }
fillBreaks(UText * text,int32_t start,int32_t end,int32_t * foundBreaks,int32_t foundBreaksCapacity,UErrorCode & status) const5747 virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
5748 int32_t* foundBreaks, int32_t foundBreaksCapacity,
5749 UErrorCode& status) const override {
5750 if (U_FAILURE(status)) return 0;
5751 int32_t i = 0;
5752 // Save the state of the utext
5753 int64_t savedIndex = utext_getNativeIndex(text);
5754 if (savedIndex != start) {
5755 utext_setNativeIndex(text, start);
5756 }
5757 int32_t current;
5758 while((current = (int32_t)utext_getNativeIndex(text)) < end) {
5759 UChar32 c = utext_current32(text);
5760 // Break after tone marks as a fake break point.
5761 if (tones.contains(c)) {
5762 if (i >= foundBreaksCapacity) {
5763 status = U_BUFFER_OVERFLOW_ERROR;
5764 utext_setNativeIndex(text, savedIndex);
5765 return i;
5766 }
5767 foundBreaks[i++] = current;
5768 }
5769 UTEXT_NEXT32(text);
5770 }
5771 // Restore the utext
5772 if (savedIndex != current) {
5773 utext_setNativeIndex(text, savedIndex);
5774 }
5775 return i;
5776 }
5777
5778 private:
5779 UnicodeSet block;
5780 UnicodeSet tones;
5781 };
5782
5783 // A Fake Yue Break Engine which handle CJK Unified Ideographs
5784 // block (U+4E00-U+9FFF) when locale start with 'yue' and break
5785 // after every character.
5786 class FakeYueBreakEngine : public ExternalBreakEngine {
5787 public:
FakeYueBreakEngine()5788 FakeYueBreakEngine() : block(0x4e00, 0x9FFF) {
5789 }
~FakeYueBreakEngine()5790 virtual ~FakeYueBreakEngine() {
5791 }
isFor(UChar32 c,const char * locale) const5792 virtual bool isFor(UChar32 c, const char* locale) const override {
5793 // We implmement this for any locale starts with "yue" such as
5794 // "yue", "yue-CN", "yue-Hant-CN", etc.
5795 return handles(c) && uprv_strncmp("yue", locale, 3) == 0;
5796 }
handles(UChar32 c) const5797 virtual bool handles(UChar32 c) const override {
5798 return block.contains(c);
5799 }
fillBreaks(UText * text,int32_t start,int32_t end,int32_t * foundBreaks,int32_t foundBreaksCapacity,UErrorCode & status) const5800 virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
5801 int32_t* foundBreaks, int32_t foundBreaksCapacity,
5802 UErrorCode& status) const override {
5803 (void)text;
5804 if (U_FAILURE(status)) return 0;
5805 int32_t i = 0;
5806 int32_t current = start;
5807 while (current++ < end) {
5808 // A fake word segmentation by breaking every two Unicode.
5809 if ((current - start) % 2 == 0) {
5810 if (i >= foundBreaksCapacity) {
5811 status = U_BUFFER_OVERFLOW_ERROR;
5812 return i;
5813 }
5814 foundBreaks[i++] = current;
5815 }
5816 }
5817 return i;
5818 }
5819
5820 private:
5821 UnicodeSet block;
5822 };
5823
TestExternalBreakEngineWithFakeYue()5824 void RBBITest::TestExternalBreakEngineWithFakeYue() {
5825 UErrorCode status = U_ZERO_ERROR;
5826 UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn");
5827
5828 std::vector<int32_t> actual1;
5829 {
5830 LocalPointer<BreakIterator> bi1(
5831 BreakIterator::createWordInstance(Locale::getRoot(), status),
5832 status);
5833 bi1->setText(text);
5834 assertTrue(WHERE "BreakIterator::createWordInstance( root )",
5835 U_SUCCESS(status));
5836
5837 do {
5838 actual1.push_back(bi1->current());
5839 } while(bi1->next() != BreakIterator::DONE);
5840 }
5841
5842 std::vector<int32_t> expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15,
5843 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}});
5844 assertTrue("root break Yue as Chinese", expected1 == actual1);
5845
5846 status = U_ZERO_ERROR;
5847 RuleBasedBreakIterator::registerExternalBreakEngine(
5848 new FakeYueBreakEngine(), status);
5849 assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine",
5850 U_SUCCESS(status));
5851
5852 std::vector<int32_t> actual2;
5853 {
5854 status = U_ZERO_ERROR;
5855 LocalPointer<BreakIterator> bi2(
5856 BreakIterator::createWordInstance(Locale("yue"), status), status);
5857 assertTrue(WHERE "BreakIterator::createWordInstance( yue )",
5858 U_SUCCESS(status));
5859 bi2->setText(text);
5860 do {
5861 actual2.push_back(bi2->current());
5862 } while(bi2->next() != BreakIterator::DONE);
5863 }
5864 std::vector<int32_t> expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20,
5865 22, 23, 24, 26, 27, 30}});
5866 assertTrue(WHERE "break Yue by Fake external breaker",
5867 expected2 == actual2);
5868 }
5869
TestExternalBreakEngineWithFakeTaiLe()5870 void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
5871 UErrorCode status = U_ZERO_ERROR;
5872 UnicodeString text(
5873 u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ"
5874 u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn");
5875
5876 std::vector<int32_t> actual1;
5877 {
5878 LocalPointer<BreakIterator> bi1(
5879 BreakIterator::createLineInstance(Locale::getRoot(), status),
5880 status);
5881 bi1->setText(text);
5882 assertTrue(WHERE "BreakIterator::createLineInstance( root )",
5883 U_SUCCESS(status));
5884
5885 do {
5886 actual1.push_back(bi1->current());
5887 } while(bi1->next() != BreakIterator::DONE);
5888 }
5889
5890 std::vector<int32_t> expected1({{
5891 0, 2, 5, 86, 89, 92 }});
5892 assertTrue(WHERE "root break Tai Le", expected1 == actual1);
5893
5894 RuleBasedBreakIterator::registerExternalBreakEngine(
5895 new FakeTaiLeBreakEngine(), status);
5896 assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine",
5897 U_SUCCESS(status));
5898
5899 std::vector<int32_t> actual2;
5900 {
5901 status = U_ZERO_ERROR;
5902 LocalPointer<BreakIterator> bi2(
5903 BreakIterator::createLineInstance(Locale("tdd"), status), status);
5904 assertTrue(WHERE "BreakIterator::createLineInstance( tdd )",
5905 U_SUCCESS(status));
5906 bi2->setText(text);
5907 do {
5908 actual2.push_back(bi2->current());
5909 } while(bi2->next() != BreakIterator::DONE);
5910 }
5911 std::vector<int32_t> expected2({{
5912 0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
5913 70, 73, 76, 80, 86, 89, 92}});
5914 assertTrue("break Tai Le by Fake external breaker",
5915 expected2 == actual2);
5916 }
5917
5918 // Test a single unpaired unpaired char (either surrogate low or high) in
5919 // an Unicode set will not cause infinity loop.
TestBug22585()5920 void RBBITest::TestBug22585() {
5921 UnicodeString rule = u"$a=[";
5922 rule.append(0xdecb) // an unpaired surrogate high
5923 .append("];");
5924 UParseError pe {};
5925 UErrorCode ec {U_ZERO_ERROR};
5926 RuleBasedBreakIterator bi(rule, pe, ec);
5927
5928 rule = u"$a=[";
5929 rule.append(0xd94e) // an unpaired surrogate low
5930 .append("];");
5931 ec = U_ZERO_ERROR;
5932 RuleBasedBreakIterator bi2(rule, pe, ec);
5933 }
5934
5935 // Test a long string with a ; in the end will not cause stack overflow.
TestBug22602()5936 void RBBITest::TestBug22602() {
5937 UnicodeString rule(25000, (UChar32)'A', 25000-1);
5938 rule.append(u";");
5939 UParseError pe {};
5940 UErrorCode ec {U_ZERO_ERROR};
5941 RuleBasedBreakIterator bi(rule, pe, ec);
5942 }
5943
TestBug22636()5944 void RBBITest::TestBug22636() {
5945 UParseError pe {};
5946 UErrorCode ec {U_ZERO_ERROR};
5947 RuleBasedBreakIterator bi(u"A{77777777777777};", pe, ec);
5948 assertEquals(WHERE, ec, U_BRK_RULE_SYNTAX);
5949 ec = U_ZERO_ERROR;
5950 RuleBasedBreakIterator bi2(u"A{2147483648};", pe, ec);
5951 assertEquals(WHERE, ec, U_BRK_RULE_SYNTAX);
5952 ec = U_ZERO_ERROR;
5953 RuleBasedBreakIterator bi3(u"A{2147483647};", pe, ec);
5954 assertEquals(WHERE, ec, U_ZERO_ERROR);
5955 }
5956
TestBug22584()5957 void RBBITest::TestBug22584() {
5958 // Creating a break iterator from a rule consisting of a very long
5959 // literal input string caused a stack overflow when deleting the
5960 // parse tree for the input during the rule building process.
5961
5962 // Failure of this test showed as a crash during the break iterator construction.
5963
5964 UnicodeString ruleStr(100000, (UChar32)0, 100000);
5965 UParseError pe {};
5966 UErrorCode ec {U_ZERO_ERROR};
5967
5968 RuleBasedBreakIterator bi(ruleStr, pe, ec);
5969 ec = U_ZERO_ERROR;
5970 ruleStr = u"a/b;c";
5971 RuleBasedBreakIterator bi2(ruleStr, pe, ec);
5972 }
5973
TestBug22579()5974 void RBBITest::TestBug22579() {
5975 // Test not causing null deref in cloneTree
5976 UnicodeString ruleStr = u"[{ab}];";
5977 UParseError pe {};
5978 UErrorCode ec {U_ZERO_ERROR};
5979
5980 RuleBasedBreakIterator bi(ruleStr, pe, ec);
5981 }
TestBug22581()5982 void RBBITest::TestBug22581() {
5983 // Test duplicate variable setting will not leak the rule compilation
5984 UnicodeString ruleStr = u"$foo=[abc]; $foo=[xyz]; $foo;";
5985 UParseError pe {};
5986 UErrorCode ec {U_ZERO_ERROR};
5987
5988 RuleBasedBreakIterator bi(ruleStr, pe, ec);
5989 }
5990 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5991