xref: /aosp_15_r20/external/icu/icu4c/source/test/intltest/rbbitst.cpp (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <algorithm>
18 #include <set>
19 #include <sstream>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <utility>
24 #include <vector>
25 
26 #include "unicode/brkiter.h"
27 #include "unicode/localpointer.h"
28 #include "unicode/numfmt.h"
29 #include "unicode/rbbi.h"
30 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
31 #include "unicode/regex.h"
32 #endif
33 #include "unicode/schriter.h"
34 #include "unicode/uchar.h"
35 #include "unicode/utf16.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uscript.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utrace.h"
42 
43 #include "charstr.h"
44 #include "cmemory.h"
45 #include "cstr.h"
46 #include "cstring.h"
47 #include "intltest.h"
48 #include "lstmbe.h"
49 #include "rbbitst.h"
50 #include "rbbidata.h"
51 #include "utypeinfo.h"  // for 'typeid' to work
52 #include "uvector.h"
53 #include "uvectr32.h"
54 
55 
56 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
57 #include "unicode/filteredbrk.h"
58 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
59 
// Report a test failure through errln(), tagged with the source file and line,
// when the expression x evaluates to false. The expansion calls errln() unqualified,
// so it can only be used where that name is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a test failure through errcheckln() when errcode is a failing UErrorCode.
// The message carries the source file, the line, and the symbolic name of the error.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
} UPRV_BLOCK_MACRO_END

// Report a failure through IntlTest::gTest->errln(). The message contains the source
// file and line, the text msg, the index, and a "type=... seed=... loop=1" parameter
// string built from fRuleFileName and seed for reproducing the failure.
// Note: unlike the two macros above, the expansion is a plain brace block.
#define MONKEY_ERROR(msg, fRuleFileName, index, seed) { \
    IntlTest::gTest->errln("\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
                    __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
}
76 
77 //---------------------------------------------
78 // runIndexedTest
79 //---------------------------------------------
80 
81 
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can,
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
87 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)88 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
89 {
90     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
91     fTestParams = params;
92 
93     TESTCASE_AUTO_BEGIN;
94 #if !UCONFIG_NO_FILE_IO
95     TESTCASE_AUTO(TestBug4153072);
96 #endif
97 #if !UCONFIG_NO_FILE_IO
98     TESTCASE_AUTO(TestUnicodeFiles);
99 #endif
100     TESTCASE_AUTO(TestGetAvailableLocales);
101     TESTCASE_AUTO(TestGetDisplayName);
102 #if !UCONFIG_NO_FILE_IO
103     TESTCASE_AUTO(TestEndBehaviour);
104     TESTCASE_AUTO(TestWordBreaks);
105     TESTCASE_AUTO(TestWordBoundary);
106     TESTCASE_AUTO(TestLineBreaks);
107     TESTCASE_AUTO(TestSentBreaks);
108     TESTCASE_AUTO(TestExtended);
109 #endif
110 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
111     TESTCASE_AUTO(TestMonkey);
112 #endif
113 #if !UCONFIG_NO_FILE_IO
114     TESTCASE_AUTO(TestBug3818);
115 #endif
116     TESTCASE_AUTO(TestDebug);
117 #if !UCONFIG_NO_FILE_IO
118     TESTCASE_AUTO(TestBug5775);
119 #endif
120     TESTCASE_AUTO(TestBug9983);
121     TESTCASE_AUTO(TestDictRules);
122     TESTCASE_AUTO(TestBug5532);
123     TESTCASE_AUTO(TestBug7547);
124     TESTCASE_AUTO(TestBug12797);
125     TESTCASE_AUTO(TestBug12918);
126     TESTCASE_AUTO(TestBug12932);
127     TESTCASE_AUTO(TestEmoji);
128     TESTCASE_AUTO(TestBug12519);
129     TESTCASE_AUTO(TestBug12677);
130     TESTCASE_AUTO(TestTableRedundancies);
131     TESTCASE_AUTO(TestBug13447);
132     TESTCASE_AUTO(TestReverse);
133     TESTCASE_AUTO(TestBug13692);
134     TESTCASE_AUTO(TestDebugRules);
135     TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable);
136     TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable);
137     TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable);
138     TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable);
139     TESTCASE_AUTO(TestTable_8_16_Bits);
140     TESTCASE_AUTO(TestBug13590);
141     TESTCASE_AUTO(TestUnpairedSurrogate);
142     TESTCASE_AUTO(TestLSTMThai);
143     TESTCASE_AUTO(TestLSTMBurmese);
144     TESTCASE_AUTO(TestRandomAccess);
145     TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
146     TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
147     TESTCASE_AUTO(TestBug22579);
148     TESTCASE_AUTO(TestBug22581);
149     TESTCASE_AUTO(TestBug22584);
150     TESTCASE_AUTO(TestBug22585);
151     TESTCASE_AUTO(TestBug22602);
152     TESTCASE_AUTO(TestBug22636);
153 
154 #if U_ENABLE_TRACING
155     TESTCASE_AUTO(TestTraceCreateCharacter);
156     TESTCASE_AUTO(TestTraceCreateWord);
157     TESTCASE_AUTO(TestTraceCreateSentence);
158     TESTCASE_AUTO(TestTraceCreateTitle);
159     TESTCASE_AUTO(TestTraceCreateLine);
160     TESTCASE_AUTO(TestTraceCreateLineNormal);
161     TESTCASE_AUTO(TestTraceCreateLineLoose);
162     TESTCASE_AUTO(TestTraceCreateLineStrict);
163     TESTCASE_AUTO(TestTraceCreateLineNormalPhrase);
164     TESTCASE_AUTO(TestTraceCreateLineLoosePhrase);
165     TESTCASE_AUTO(TestTraceCreateLineStrictPhrase);
166     TESTCASE_AUTO(TestTraceCreateLinePhrase);
167     TESTCASE_AUTO(TestTraceCreateBreakEngine);
168 #endif
169 
170     TESTCASE_AUTO_END;
171 }
172 
173 
174 //--------------------------------------------------------------------------------------
175 //
176 //    RBBITest    constructor and destructor
177 //
178 //--------------------------------------------------------------------------------------
179 
RBBITest()180 RBBITest::RBBITest() {
181     fTestParams = nullptr;
182 }
183 
184 
~RBBITest()185 RBBITest::~RBBITest() {
186 }
187 
188 
printStringBreaks(UText * tstr,int expected[],int expectedCount)189 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
190     UErrorCode status = U_ZERO_ERROR;
191     char name[100];
192     printf("code    alpha extend alphanum type word sent line name\n");
193     int nextExpectedIndex = 0;
194     utext_setNativeIndex(tstr, 0);
195     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
196         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
197             printf("------------------------------------------------ %d\n", j);
198             ++nextExpectedIndex;
199         }
200 
201         UChar32 c = utext_next32(tstr);
202         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
203         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
204                            u_isUAlphabetic(c),
205                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
206                            u_isalnum(c),
207                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
208                                                   u_charType(c),
209                                                   U_SHORT_PROPERTY_NAME),
210                            u_getPropertyValueName(UCHAR_WORD_BREAK,
211                                                   u_getIntPropertyValue(c,
212                                                           UCHAR_WORD_BREAK),
213                                                   U_SHORT_PROPERTY_NAME),
214                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
215                                    u_getIntPropertyValue(c,
216                                            UCHAR_SENTENCE_BREAK),
217                                    U_SHORT_PROPERTY_NAME),
218                            u_getPropertyValueName(UCHAR_LINE_BREAK,
219                                    u_getIntPropertyValue(c,
220                                            UCHAR_LINE_BREAK),
221                                    U_SHORT_PROPERTY_NAME),
222                            name);
223     }
224 }
225 
226 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)227 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
228    UErrorCode status = U_ZERO_ERROR;
229    UText *tstr = nullptr;
230    tstr = utext_openConstUnicodeString(nullptr, &ustr, &status);
231    if (U_FAILURE(status)) {
232        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
233        return;
234     }
235    printStringBreaks(tstr, expected, expectedCount);
236    utext_close(tstr);
237 }
238 
239 
TestBug3818()240 void RBBITest::TestBug3818() {
241     UErrorCode  status = U_ZERO_ERROR;
242 
243     // Four Thai words...
244     static const char16_t thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
245                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
246     UnicodeString  thaiStr(thaiWordData);
247 
248     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
249     if (U_FAILURE(status) || bi == nullptr) {
250         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
251         return;
252     }
253     bi->setText(thaiStr);
254 
255     int32_t  startOfSecondWord = bi->following(1);
256     if (startOfSecondWord != 4) {
257         errln("Fail at file %s, line %d expected start of word at 4, got %d",
258             __FILE__, __LINE__, startOfSecondWord);
259     }
260     startOfSecondWord = bi->following(0);
261     if (startOfSecondWord != 4) {
262         errln("Fail at file %s, line %d expected start of word at 4, got %d",
263             __FILE__, __LINE__, startOfSecondWord);
264     }
265     delete bi;
266 }
267 
268 
269 //---------------------------------------------
270 //
271 //     other tests
272 //
273 //---------------------------------------------
274 
TestGetAvailableLocales()275 void RBBITest::TestGetAvailableLocales()
276 {
277     int32_t locCount = 0;
278     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
279 
280     if (locCount == 0)
281         dataerrln("getAvailableLocales() returned an empty list!");
282     // Just make sure that it's returning good memory.
283     int32_t i;
284     for (i = 0; i < locCount; ++i) {
285         logln(locList[i].getName());
286     }
287 }
288 
289 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()290 void RBBITest::TestGetDisplayName()
291 {
292     UnicodeString   result;
293 
294     BreakIterator::getDisplayName(Locale::getUS(), result);
295     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
296         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
297                 + result);
298 
299     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
300     if (result != "French (France)")
301         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
302                 + result);
303 }
304 /**
305  * Test End Behaviour
306  * @bug 4068137
307  */
TestEndBehaviour()308 void RBBITest::TestEndBehaviour()
309 {
310     UErrorCode status = U_ZERO_ERROR;
311     UnicodeString testString("boo.");
312     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
313     if (U_FAILURE(status))
314     {
315         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
316         return;
317     }
318     wb->setText(testString);
319 
320     if (wb->first() != 0)
321         errln("Didn't get break at beginning of string.");
322     if (wb->next() != 3)
323         errln("Didn't get break before period in \"boo.\"");
324     if (wb->current() != 4 && wb->next() != 4)
325         errln("Didn't get break at end of string.");
326     delete wb;
327 }
328 /*
329  * @bug 4153072
330  */
TestBug4153072()331 void RBBITest::TestBug4153072() {
332     UErrorCode status = U_ZERO_ERROR;
333     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
334     if (U_FAILURE(status))
335     {
336         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
337         return;
338     }
339     UnicodeString str("...Hello, World!...");
340     int32_t begin = 3;
341     int32_t end = str.length() - 3;
342     UBool onBoundary;
343 
344     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
345     iter->adoptText(textIterator);
346     int index;
347     // Note: with the switch to UText, there is no way to restrict the
348     //       iteration range to begin at an index other than zero.
349     //       String character iterators created with a non-zero bound are
350     //         treated by RBBI as being empty.
351     for (index = -1; index < begin + 1; ++index) {
352         onBoundary = iter->isBoundary(index);
353         if (index == 0?  !onBoundary : onBoundary) {
354             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
355                             " and begin index = " + begin);
356         }
357     }
358     delete iter;
359 }
360 
361 
362 //
363 // Test for problem reported by Ashok Matoria on 9 July 2007
364 //    One.<kSoftHyphen><kSpace>Two.
365 //
//    Sentence break at start (0) and then on calling next() it breaks at
//   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
369 //
TestBug5775()370 void RBBITest::TestBug5775() {
371     UErrorCode status = U_ZERO_ERROR;
372     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
373     TEST_ASSERT_SUCCESS(status);
374     if (U_FAILURE(status)) {
375         return;
376     }
377 // Check for status first for better handling of no data errors.
378     TEST_ASSERT(bi != nullptr);
379     if (bi == nullptr) {
380         return;
381     }
382 
383     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
384     //               01234      56789
385     s = s.unescape();
386     bi->setText(s);
387     int pos = bi->next();
388     TEST_ASSERT(pos == 6);
389     pos = bi->next();
390     TEST_ASSERT(pos == 10);
391     pos = bi->previous();
392     TEST_ASSERT(pos == 6);
393     delete bi;
394 }
395 
396 
397 
398 //------------------------------------------------------------------------------
399 //
400 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
401 //
402 //------------------------------------------------------------------------------
403 
404 struct TestParams {
405     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
406                                            //   Changed out whenever test data changes break type.
407 
408     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
409     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
410     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
411     UVector32       *srcCol;
412 
413     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
414     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
415     CharString       utf8String;           // UTF-8 form of text to break.
416 
TestParamsTestParams417     TestParams(UErrorCode &status) : dataToBreak() {
418         bi               = nullptr;
419         expectedBreaks   = new UVector32(status);
420         srcLine          = new UVector32(status);
421         srcCol           = new UVector32(status);
422         textToBreak      = nullptr;
423         textMap          = new UVector32(status);
424     }
425 
~TestParamsTestParams426     ~TestParams() {
427         delete bi;
428         delete expectedBreaks;
429         delete srcLine;
430         delete srcCol;
431         utext_close(textToBreak);
432         delete textMap;
433     }
434 
435     int32_t getSrcLine(int32_t bp);
436     int32_t getExpectedBreak(int32_t bp);
437     int32_t getSrcCol(int32_t bp);
438 
439     void setUTF16(UErrorCode &status);
440     void setUTF8(UErrorCode &status);
441 };
442 
443 // Append a UnicodeString to a CharString with UTF-8 encoding.
444 // Substitute any invalid chars.
445 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)446 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
447     if (U_FAILURE(status)) {
448         return;
449     }
450     int32_t utf8Length;
451     u_strToUTF8WithSub(nullptr, 0, &utf8Length,         // Output Buffer, nullptr for preflight.
452                        src.getBuffer(), src.length(),   // UTF-16 data
453                        0xfffd, nullptr,                 // Substitution char, number of subs.
454                        &status);
455     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
456         return;
457     }
458     status = U_ZERO_ERROR;
459     int32_t capacity;
460     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
461     u_strToUTF8WithSub(buffer, utf8Length, nullptr,
462                        src.getBuffer(), src.length(),
463                        0xfffd, nullptr, &status);
464     dest.append(buffer, utf8Length, status);
465 }
466 
467 
setUTF16(UErrorCode & status)468 void TestParams::setUTF16(UErrorCode &status) {
469     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
470     textMap->removeAllElements();
471     for (int32_t i=0; i<dataToBreak.length(); i++) {
472         if (i == dataToBreak.getChar32Start(i)) {
473             textMap->addElement(i, status);
474         } else {
475             textMap->addElement(-1, status);
476         }
477     }
478     textMap->addElement(dataToBreak.length(), status);
479     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
480 }
481 
482 
setUTF8(UErrorCode & status)483 void TestParams::setUTF8(UErrorCode &status) {
484     if (U_FAILURE(status)) {
485         return;
486     }
487     utf8String.clear();
488     CharStringAppend(utf8String, dataToBreak, status);
489     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
490     if (U_FAILURE(status)) {
491         return;
492     }
493 
494     textMap->removeAllElements();
495     int32_t utf16Index = 0;
496     for (;;) {
497         textMap->addElement(utf16Index, status);
498         UChar32 c32 = utext_current32(textToBreak);
499         if (c32 < 0) {
500             break;
501         }
502         utf16Index += U16_LENGTH(c32);
503         utext_next32(textToBreak);
504         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
505             textMap->addElement(-1, status);
506         }
507     }
508     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
509 }
510 
511 
getSrcLine(int32_t bp)512 int32_t TestParams::getSrcLine(int32_t bp) {
513     if (bp >= textMap->size()) {
514         bp = textMap->size() - 1;
515     }
516     int32_t i = 0;
517     for(; bp >= 0 ; --bp) {
518         // Move to a character boundary if we are not on one already.
519         i = textMap->elementAti(bp);
520         if (i >= 0) {
521             break;
522         }
523     }
524     return srcLine->elementAti(i);
525 }
526 
527 
getExpectedBreak(int32_t bp)528 int32_t TestParams::getExpectedBreak(int32_t bp) {
529     if (bp >= textMap->size()) {
530         return 0;
531     }
532     int32_t i = textMap->elementAti(bp);
533     int32_t retVal = 0;
534     if (i >= 0) {
535         retVal = expectedBreaks->elementAti(i);
536     }
537     return retVal;
538 }
539 
540 
getSrcCol(int32_t bp)541 int32_t TestParams::getSrcCol(int32_t bp) {
542     if (bp >= textMap->size()) {
543         bp = textMap->size() - 1;
544     }
545     int32_t i = 0;
546     for(; bp >= 0; --bp) {
547         // Move bp to a character boundary if we are not on one already.
548         i = textMap->elementAti(bp);
549         if (i >= 0) {
550             break;
551         }
552     }
553     return srcCol->elementAti(i);
554 }
555 
556 
executeTest(TestParams * t,UErrorCode & status)557 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
558     int32_t    bp;
559     int32_t    prevBP;
560     int32_t    i;
561 
562     TEST_ASSERT_SUCCESS(status);
563     if (U_FAILURE(status)) {
564         return;
565     }
566 
567     if (t->bi == nullptr) {
568         return;
569     }
570 
571     t->bi->setText(t->textToBreak, status);
572     //
573     //  Run the iterator forward
574     //
575     prevBP = -1;
576     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
577         if (prevBP ==  bp) {
578             // Fail for lack of forward progress.
579             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
580                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
581             break;
582         }
583 
584         // Check that there we didn't miss an expected break between the last one
585         //  and this one.
586         for (i=prevBP+1; i<bp; i++) {
587             if (t->getExpectedBreak(i) != 0) {
588                 int expected[] = {0, i};
589                 printStringBreaks(t->dataToBreak, expected, 2);
590                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
591                       i, t->getSrcLine(i), t->getSrcCol(i));
592             }
593         }
594 
595         // Check that the break we did find was expected
596         if (t->getExpectedBreak(bp) == 0) {
597             int expected[] = {0, bp};
598             printStringBreaks(t->textToBreak, expected, 2);
599             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
600                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
601         } else {
602             // The break was expected.
603             //   Check that the {nnn} tag value is correct.
604             int32_t expectedTagVal = t->getExpectedBreak(bp);
605             if (expectedTagVal == -1) {
606                 expectedTagVal = 0;
607             }
608             int32_t line = t->getSrcLine(bp);
609             int32_t rs = t->bi->getRuleStatus();
610             if (rs != expectedTagVal) {
611                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
612                       "          Actual, Expected status = %4d, %4d",
613                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
614             }
615         }
616 
617         prevBP = bp;
618     }
619 
620     // Verify that there were no missed expected breaks after the last one found
621     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
622         if (t->getExpectedBreak(i) != 0) {
623             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
624                       i, t->getSrcLine(i), t->getSrcCol(i));
625         }
626     }
627 
628     //
629     //  Run the iterator backwards, verify that the same breaks are found.
630     //
631     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
632     bp = t->bi->last();
633     while (bp != BreakIterator::DONE) {
634         if (prevBP ==  bp) {
635             // Fail for lack of progress.
636             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
637                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
638             break;
639         }
640 
641         // Check that we didn't miss an expected break between the last one
642         //  and this one.  (UVector returns zeros for index out of bounds.)
643         for (i=prevBP-1; i>bp; i--) {
644             if (t->getExpectedBreak(i) != 0) {
645                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
646                       i, t->getSrcLine(i), t->getSrcCol(i));
647             }
648         }
649 
650         // Check that the break we did find was expected
651         if (t->getExpectedBreak(bp) == 0) {
652             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
653                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
654         } else {
655             // The break was expected.
656             //   Check that the {nnn} tag value is correct.
657             int32_t expectedTagVal = t->getExpectedBreak(bp);
658             if (expectedTagVal == -1) {
659                 expectedTagVal = 0;
660             }
661             int line = t->getSrcLine(bp);
662             int32_t rs = t->bi->getRuleStatus();
663             if (rs != expectedTagVal) {
664                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
665                       "          Actual, Expected status = %4d, %4d",
666                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
667             }
668         }
669 
670         prevBP = bp;
671         bp = t->bi->previous();
672     }
673 
674     // Verify that there were no missed breaks prior to the last one found
675     for (i=prevBP-1; i>=0; i--) {
676         if (t->getExpectedBreak(i) != 0) {
677             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
678                       i, t->getSrcLine(i), t->getSrcCol(i));
679         }
680     }
681 
682     // Check isBoundary()
683     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
684         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
685         UBool boundaryFound    = t->bi->isBoundary(i);
686         if (boundaryExpected != boundaryFound) {
687             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
688                   "        Expected, Actual= %s, %s",
689                   i, t->getSrcLine(i), t->getSrcCol(i),
690                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
691         }
692     }
693 
694     // Check following()
695     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
696         int32_t actualBreak = t->bi->following(i);
697         int32_t expectedBreak = BreakIterator::DONE;
698         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
699             if (t->getExpectedBreak(j) != 0) {
700                 expectedBreak = j;
701                 break;
702             }
703         }
704         if (expectedBreak != actualBreak) {
705             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
706                   "        Expected, Actual= %d, %d",
707                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
708         }
709     }
710 
711     // Check preceding()
712     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
713         int32_t actualBreak = t->bi->preceding(i);
714         int32_t expectedBreak = BreakIterator::DONE;
715 
716         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
717         // preceding(trailing byte) will return the index of some preceding code point,
718         // not the lead byte of the current code point, even though that has a smaller index.
719         // Therefore, start looking at the expected break data not at i-1, but at
720         // the start of code point index - 1.
721         utext_setNativeIndex(t->textToBreak, i);
722         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
723         for (; j >= 0; j--) {
724             if (t->getExpectedBreak(j) != 0) {
725                 expectedBreak = j;
726                 break;
727             }
728         }
729         if (expectedBreak != actualBreak) {
730             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
731                   "        Expected, Actual= %d, %d",
732                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
733         }
734     }
735 }
736 
TestExtended()737 void RBBITest::TestExtended() {
738      // The expectations in this test heavily depends on the Thai dictionary.
739      // Therefore, we skip this test under the LSTM configuration.
740      if (skipDictionaryTest()) {
741          return;
742      }
743   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
744   // data driven test closely entangles filtered and regular data.
745 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
746     UErrorCode      status  = U_ZERO_ERROR;
747     Locale          locale("");
748 
749     TestParams          tp(status);
750 
751     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
752     if (U_FAILURE(status)) {
753         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
754     }
755 
756     //
757     //  Open and read the test data file.
758     //
759     const char *testDataDirectory = IntlTest::getSourceTestData(status);
760     CharString testFileName(testDataDirectory, -1, status);
761     testFileName.append("rbbitst.txt", -1, status);
762 
763     int    len;
764     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
765     if (U_FAILURE(status)) {
766         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
767         return;
768     }
769 
770     bool skipTest = false; // Skip this test?
771 
772     //
773     //  Put the test data into a UnicodeString
774     //
775     UnicodeString testString(false, testFile, len);
776 
777     enum EParseState{
778         PARSE_COMMENT,
779         PARSE_TAG,
780         PARSE_DATA,
781         PARSE_NUM,
782         PARSE_RULES
783     }
784     parseState = PARSE_TAG;
785 
786     EParseState savedState = PARSE_TAG;
787 
788     int32_t    lineNum  = 1;
789     int32_t    colStart = 0;
790     int32_t    column   = 0;
791     int32_t    charIdx  = 0;
792 
793     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
794 
795     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
796     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
797 
798     for (charIdx = 0; charIdx < len; ) {
799         status = U_ZERO_ERROR;
800         char16_t  c = testString.charAt(charIdx);
801         charIdx++;
802         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
803             // treat CRLF as a unit
804             c = u'\n';
805             charIdx++;
806         }
807         if (c == u'\n' || c == u'\r') {
808             lineNum++;
809             colStart = charIdx;
810         }
811         column = charIdx - colStart + 1;
812 
813         switch (parseState) {
814         case PARSE_COMMENT:
815             if (c == u'\n' || c == u'\r') {
816                 parseState = savedState;
817             }
818             break;
819 
820         case PARSE_TAG:
821             {
822             if (c == u'#') {
823                 parseState = PARSE_COMMENT;
824                 savedState = PARSE_TAG;
825                 break;
826             }
827             if (u_isUWhiteSpace(c)) {
828                 break;
829             }
830             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
831                 delete tp.bi;
832                 tp.bi = BreakIterator::createWordInstance(locale,  status);
833                 skipTest = false;
834                 charIdx += 5;
835                 break;
836             }
837             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
838                 delete tp.bi;
839                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
840                 skipTest = false;
841                 charIdx += 5;
842                 break;
843             }
844             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
845                 delete tp.bi;
846                 tp.bi = BreakIterator::createLineInstance(locale,  status);
847                 skipTest = false;
848 #if UCONFIG_USE_ML_PHRASE_BREAKING
849                 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
850                     // skip <line> test cases of JP's phrase breaking when ML is enabled.
851                     skipTest = true;
852                 }
853 #endif
854                 charIdx += 5;
855                 break;
856             }
857             if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
858                 delete tp.bi;
859                 tp.bi = BreakIterator::createLineInstance(locale,  status);
860                 skipTest = false;
861 #if !UCONFIG_USE_ML_PHRASE_BREAKING
862                 if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
863                     // skip <lineML> test cases of JP's phrase breaking when ML is disabled.
864                     skipTest = true;
865                 }
866 #endif
867                 charIdx += 7;
868                 break;
869             }
870             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
871                 delete tp.bi;
872                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
873                 skipTest = false;
874                 charIdx += 5;
875                 break;
876             }
877             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
878                 delete tp.bi;
879                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
880                 charIdx += 6;
881                 break;
882             }
883 
884             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
885                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
886                 charIdx = testString.indexOf(u'>', charIdx) + 1;
887                 parseState = PARSE_RULES;
888                 rules.remove();
889                 rulesFirstLine = lineNum;
890                 break;
891             }
892 
893             // <locale  loc_name>
894             localeMatcher.reset(testString);
895             if (localeMatcher.lookingAt(charIdx-1, status)) {
896                 UnicodeString localeName = localeMatcher.group(1, status);
897                 char localeName8[100];
898                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), nullptr);
899                 locale = Locale::createFromName(localeName8);
900                 charIdx += localeMatcher.group(0, status).length() - 1;
901                 TEST_ASSERT_SUCCESS(status);
902                 break;
903             }
904             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
905                 parseState = PARSE_DATA;
906                 charIdx += 5;
907                 tp.dataToBreak = "";
908                 tp.expectedBreaks->removeAllElements();
909                 tp.srcCol ->removeAllElements();
910                 tp.srcLine->removeAllElements();
911                 break;
912             }
913 
914             errln("line %d: Tag expected in test file.", lineNum);
915             parseState = PARSE_COMMENT;
916             savedState = PARSE_DATA;
917             goto end_test; // Stop the test.
918             }
919             break;
920 
921         case PARSE_RULES:
922             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
923                 charIdx += 7;
924                 parseState = PARSE_TAG;
925                 delete tp.bi;
926                 UParseError pe;
927                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
928                 skipTest = U_FAILURE(status);
929                 if (U_FAILURE(status)) {
930                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
931                         rulesFirstLine + pe.line - 1, u_errorName(status));
932                 }
933             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
934                 charIdx += 10;
935                 parseState = PARSE_TAG;
936                 UErrorCode ec = U_ZERO_ERROR;
937                 UParseError pe;
938                 RuleBasedBreakIterator bi(rules, pe, ec);
939                 if (U_SUCCESS(ec)) {
940                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
941                         rulesFirstLine + pe.line - 1);
942                 }
943             } else {
944                 rules.append(c);
945             }
946             break;
947 
948         case PARSE_DATA:
949             if (c == u'•') {
950                 int32_t  breakIdx = tp.dataToBreak.length();
951                 if (tp.expectedBreaks->size() > breakIdx) {
952                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
953                           lineNum, column);
954                 }
955                 tp.expectedBreaks->setSize(breakIdx+1);
956                 tp.expectedBreaks->setElementAt(-1, breakIdx);
957                 tp.srcLine->setSize(breakIdx+1);
958                 tp.srcLine->setElementAt(lineNum, breakIdx);
959                 tp.srcCol ->setSize(breakIdx+1);
960                 tp.srcCol ->setElementAt(column, breakIdx);
961                 break;
962             }
963 
964             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
965                 // Add final entry to mappings from break location to source file position.
966                 //  Need one extra because last break position returned is after the
967                 //    last char in the data, not at the last char.
968                 tp.srcLine->addElement(lineNum, status);
969                 tp.srcCol ->addElement(column, status);
970 
971                 parseState = PARSE_TAG;
972                 charIdx += 6;
973 
974                 if (!skipTest) {
975                     // RUN THE TEST!
976                     status = U_ZERO_ERROR;
977                     tp.setUTF16(status);
978                     executeTest(&tp, status);
979                     TEST_ASSERT_SUCCESS(status);
980 
981                     // Run again, this time with UTF-8 text wrapped in a UText.
982                     status = U_ZERO_ERROR;
983                     tp.setUTF8(status);
984                     TEST_ASSERT_SUCCESS(status);
985                     executeTest(&tp, status);
986                 }
987                 break;
988             }
989 
990             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
991                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
992                 // Get the code point from the name and insert it into the test data.
993                 //   (Damn, no API takes names in Unicode  !!!
994                 //    we've got to take it back to char *)
995                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
996                 int32_t nameLength = nameEndIdx - (charIdx+2);
997                 char charNameBuf[200];
998                 UChar32 theChar = -1;
999                 if (nameEndIdx != -1) {
1000                     UErrorCode status = U_ZERO_ERROR;
1001                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1002                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1003                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1004                     if (U_FAILURE(status)) {
1005                         theChar = -1;
1006                     }
1007                 }
1008                 if (theChar == -1) {
1009                     errln("Error in named character in test file at line %d, col %d",
1010                         lineNum, column);
1011                 } else {
1012                     // Named code point was recognized.  Insert it
1013                     //   into the test data.
1014                     tp.dataToBreak.append(theChar);
1015                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1016                         tp.srcLine->addElement(lineNum, status);
1017                         tp.srcCol ->addElement(column, status);
1018                     }
1019                 }
1020                 if (nameEndIdx > charIdx) {
1021                     charIdx = nameEndIdx+1;
1022 
1023                 }
1024                 break;
1025             }
1026 
1027 
1028 
1029             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
1030                 charIdx++;
1031                 int32_t  breakIdx = tp.dataToBreak.length();
1032                 tp.expectedBreaks->setSize(breakIdx+1);
1033                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1034                 tp.srcLine->setSize(breakIdx+1);
1035                 tp.srcLine->setElementAt(lineNum, breakIdx);
1036                 tp.srcCol ->setSize(breakIdx+1);
1037                 tp.srcCol ->setElementAt(column, breakIdx);
1038                 break;
1039             }
1040 
1041             if (c == u'<') {
1042                 tagValue   = 0;
1043                 parseState = PARSE_NUM;
1044                 break;
1045             }
1046 
1047             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1048                 parseState = PARSE_COMMENT;
1049                 savedState = PARSE_DATA;
1050                 break;
1051             }
1052 
1053             if (c == u'\\') {
1054                 // Check for \ at end of line, a line continuation.
1055                 //     Advance over (discard) the newline
1056                 UChar32 cp = testString.char32At(charIdx);
1057                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1058                     // We have a CR LF
1059                     //  Need an extra increment of the input ptr to move over both of them
1060                     charIdx++;
1061                 }
1062                 if (cp == u'\n' || cp == u'\r') {
1063                     lineNum++;
1064                     colStart = charIdx;
1065                     charIdx++;
1066                     break;
1067                 }
1068 
1069                 // Let unescape handle the back slash.
1070                 cp = testString.unescapeAt(charIdx);
1071                 if (cp != -1) {
1072                     // Escape sequence was recognized.  Insert the char
1073                     //   into the test data.
1074                     tp.dataToBreak.append(cp);
1075                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1076                         tp.srcLine->addElement(lineNum, status);
1077                         tp.srcCol ->addElement(column, status);
1078                     }
1079                     break;
1080                 }
1081 
1082 
1083                 // Not a recognized backslash escape sequence.
1084                 // Take the next char as a literal.
1085                 //  TODO:  Should this be an error?
1086                 c = testString.charAt(charIdx);
1087                 charIdx = testString.moveIndex32(charIdx, 1);
1088             }
1089 
1090             // Normal, non-escaped data char.
1091             tp.dataToBreak.append(c);
1092 
1093             // Save the mapping from offset in the data to line/column numbers in
1094             //   the original input file.  Will be used for better error messages only.
1095             //   If there's an expected break before this char, the slot in the mapping
1096             //     vector will already be set for this char; don't overwrite it.
1097             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1098                 tp.srcLine->addElement(lineNum, status);
1099                 tp.srcCol ->addElement(column, status);
1100             }
1101             break;
1102 
1103 
1104         case PARSE_NUM:
1105             // We are parsing an expected numeric tag value, like <1234>,
1106             //   within a chunk of data.
1107             if (u_isUWhiteSpace(c)) {
1108                 break;
1109             }
1110 
1111             if (c == u'>') {
1112                 // Finished the number.  Add the info to the expected break data,
1113                 //   and switch parse state back to doing plain data.
1114                 parseState = PARSE_DATA;
1115                 if (tagValue == 0) {
1116                     tagValue = -1;
1117                 }
1118                 int32_t  breakIdx = tp.dataToBreak.length();
1119                 if (tp.expectedBreaks->size() > breakIdx) {
1120                     errln("rbbitst.txt:%d:%d adjacent expected breaks with no intervening test text",
1121                           lineNum, column);
1122                 }
1123                 tp.expectedBreaks->setSize(breakIdx+1);
1124                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1125                 tp.srcLine->setSize(breakIdx+1);
1126                 tp.srcLine->setElementAt(lineNum, breakIdx);
1127                 tp.srcCol ->setSize(breakIdx+1);
1128                 tp.srcCol ->setElementAt(column, breakIdx);
1129                 break;
1130             }
1131 
1132             if (u_isdigit(c)) {
1133                 tagValue = tagValue*10 + u_charDigitValue(c);
1134                 break;
1135             }
1136 
1137             errln("Syntax Error in test file at line %d, col %d",
1138                 lineNum, column);
1139             parseState = PARSE_COMMENT;
1140             goto end_test; // Stop the test
1141             break;
1142         }
1143 
1144 
1145         if (U_FAILURE(status)) {
1146             dataerrln("ICU Error %s while parsing test file at line %d.",
1147                 u_errorName(status), lineNum);
1148             status = U_ZERO_ERROR;
1149             goto end_test; // Stop the test
1150         }
1151 
1152     }
1153 
1154     // Reached end of test file. Raise an error if parseState indicates that we are
1155     //   within a block that should have been terminated.
1156 
1157     if (parseState == PARSE_RULES) {
1158         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1159             lineNum, rulesFirstLine);
1160     }
1161     if (parseState == PARSE_DATA) {
1162         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1163     }
1164 
1165 
1166 end_test:
1167     delete [] testFile;
1168 #endif
1169 }
1170 
//-------------------------------------------------------------------------------
//
//  TestDictRules   create a break iterator from source rules that includes a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
//
//-------------------------------------------------------------------------------
TestDictRules()1179 void RBBITest::TestDictRules() {
1180     const char *rules =  "$dictionary = [a-z]; \n"
1181                          "!!forward; \n"
1182                          "$dictionary $dictionary; \n"
1183                          "!!reverse; \n"
1184                          "$dictionary $dictionary; \n";
1185     const char *text = "aa";
1186     UErrorCode status = U_ZERO_ERROR;
1187     UParseError parseError;
1188 
1189     RuleBasedBreakIterator bi(rules, parseError, status);
1190     if (U_SUCCESS(status)) {
1191         UnicodeString utext = text;
1192         bi.setText(utext);
1193         int32_t position;
1194         int32_t loops;
1195         for (loops = 0; loops<10; loops++) {
1196             position = bi.next();
1197             if (position == RuleBasedBreakIterator::DONE) {
1198                 break;
1199             }
1200         }
1201         TEST_ASSERT(loops == 1);
1202     } else {
1203         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1204     }
1205 }
1206 
1207 
1208 
1209 //--------------------------------------------------------------------------------------------
1210 //
1211 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1212 //
1213 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1214 void RBBITest::TestUnicodeFiles() {
1215     RuleBasedBreakIterator  *bi;
1216     UErrorCode               status = U_ZERO_ERROR;
1217 
1218     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createCharacterInstance(Locale::getEnglish(), status));
1219     TEST_ASSERT_SUCCESS(status);
1220     if (U_SUCCESS(status)) {
1221         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1222     }
1223     delete bi;
1224 
1225     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status));
1226     TEST_ASSERT_SUCCESS(status);
1227     if (U_SUCCESS(status)) {
1228         runUnicodeTestData("WordBreakTest.txt", bi);
1229     }
1230     delete bi;
1231 
1232     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
1233     TEST_ASSERT_SUCCESS(status);
1234     if (U_SUCCESS(status)) {
1235         runUnicodeTestData("SentenceBreakTest.txt", bi);
1236     }
1237     delete bi;
1238 
1239     bi =  dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status));
1240     TEST_ASSERT_SUCCESS(status);
1241     if (U_SUCCESS(status)) {
1242         runUnicodeTestData("LineBreakTest.txt", bi);
1243     }
1244     delete bi;
1245 }
1246 
1247 
1248 // Check for test cases from the Unicode test data files that are known to fail
1249 // and should be skipped as known issues because ICU does not fully implement
1250 // the Unicode specifications, or because ICU includes tailorings that differ from
1251 // the Unicode standard.
1252 //
1253 // Test cases are identified by the test data sequence, which tends to be more stable
1254 // across Unicode versions than the test file line numbers.
1255 //
1256 // The test case with ticket "10666" is a dummy, included as an example.
1257 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1258 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1259     static struct TestCase {
1260         const char *fTicketNum;
1261         const char *fFileName;
1262         const char16_t *fString;
1263     } badTestCases[] = {
1264         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1265         // The following tests were originally for
1266         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1267         // However, that ticket has been closed as fixed but these tests still fail, so
1268         // ICU-21097 has been created to investigate and address these remaining issues.
1269         {"21097",  "LineBreakTest.txt", u"-#"},
1270         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1271         {"21097",  "LineBreakTest.txt", u"\u002d\u00a7"},
1272         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1273         {"21097",  "LineBreakTest.txt", u"\u002d\U00050005"},
1274         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1275         {"21097",  "LineBreakTest.txt", u"\u002d\u0e01"},
1276         {"21097",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1277 
1278         // The following tests were originally for
1279         // Issue ICU-12017 Improve line break around numbers.
1280         // However, that ticket has been closed as fixed but these tests still fail, so
1281         // ICU-21097 has been created to investigate and address these remaining issues.
1282         {"21097", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1283         {"21097", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1284         {"21097", "LineBreakTest.txt", u"equals .35 cents"},
1285         {"21097", "LineBreakTest.txt", u"a.2 "},
1286         {"21097", "LineBreakTest.txt", u"a.2 \u0915"},
1287         {"21097", "LineBreakTest.txt", u"a.2 \u672C"},
1288         {"21097", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1289         {"21097", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1290         {"21097", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1291         {"21097", "LineBreakTest.txt", u"A.1 \uBABB"},
1292         {"21097", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1293         {"21097", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1294         {"21097", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1295         {"21097", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1296 
1297         // ICU-22127 until UAX #29 wordbreak is update for the colon changes in ICU-22112,
1298         // need to skip some tests in WordBreakTest.txt
1299         {"22127", "WordBreakTest.txt", u"a:"},
1300         {"22127", "WordBreakTest.txt", u"A:"},
1301     };
1302 
1303     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1304         const TestCase &badCase = badTestCases[n];
1305         if (!strcmp(fileName, badCase.fFileName) &&
1306                 testCase.startsWith(UnicodeString(badCase.fString))) {
1307             return logKnownIssue(badCase.fTicketNum);
1308         }
1309     }
1310     return false;
1311 }
1312 
1313 
1314 //--------------------------------------------------------------------------------------------
1315 //
1316 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1317 //
1318 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1319 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1320 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1321     UErrorCode  status = U_ZERO_ERROR;
1322 
1323     //
1324     //  Open and read the test data file, put it into a UnicodeString.
1325     //
1326     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1327     char testFileName[1000];
1328     if (testDataDirectory == nullptr || strlen(testDataDirectory) >= sizeof(testFileName)) {
1329         dataerrln("Can't open test data.  Path too long.");
1330         return;
1331     }
1332     strcpy(testFileName, testDataDirectory);
1333     strcat(testFileName, fileName);
1334 
1335     logln("Opening data file %s\n", fileName);
1336 
1337     int    len;
1338     char16_t *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1339     if (status != U_FILE_ACCESS_ERROR) {
1340         TEST_ASSERT_SUCCESS(status);
1341         TEST_ASSERT(testFile != nullptr);
1342     }
1343     if (U_FAILURE(status) || testFile == nullptr) {
1344         return; /* something went wrong, error already output */
1345     }
1346     UnicodeString testFileAsString(true, testFile, len);
1347 
1348     //
1349     //  Parse the test data file using a regular expression.
1350     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1351     //     is identified by which group had a match.
1352     //
1353     //    Capture Group  #                  1          2            3            4           5
1354     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1355     //
1356     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1357     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1358     UnicodeString   testString;
1359     UVector32       breakPositions(status);
1360     int             lineNumber = 1;
1361     TEST_ASSERT_SUCCESS(status);
1362     if (U_FAILURE(status)) {
1363         return;
1364     }
1365 
1366     //
1367     //  Scan through each test case, building up the string to be broken in testString,
1368     //   and the positions that should be boundaries in the breakPositions vector.
1369     //
1370     int spin = 0;
1371     while (tokenMatcher.find()) {
1372         if(tokenMatcher.hitEnd()) {
1373           /* Shouldn't Happen(TM).  This means we didn't find the symbols we were looking for.
1374              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1375              and caused an infinite loop here on EBCDIC systems!
1376           */
1377           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1378           //       return;
1379         }
1380         if (tokenMatcher.start(1, status) >= 0) {
1381             // Scanned a divide sign, indicating a break position in the test data.
1382             if (testString.length()>0) {
1383                 breakPositions.addElement(testString.length(), status);
1384             }
1385         }
1386         else if (tokenMatcher.start(2, status) >= 0) {
1387             // Scanned an 'x', meaning no break at this position in the test data
1388             //   Nothing to be done here.
1389             }
1390         else if (tokenMatcher.start(3, status) >= 0) {
1391             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1392             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1393             int length = hexNumber.length();
1394             if (length<=8) {
1395                 char buf[10];
1396                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1397                 UChar32 c = (UChar32)strtol(buf, nullptr, 16);
1398                 if (c<=0x10ffff) {
1399                     testString.append(c);
1400                 } else {
1401                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1402                        fileName, lineNumber);
1403                 }
1404             } else {
1405                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1406                        fileName, lineNumber);
1407              }
1408         }
1409         else if (tokenMatcher.start(4, status) >= 0) {
1410             // Scanned to end of a line, possibly skipping over a comment in the process.
1411             //   If the line from the file contained test data, run the test now.
1412             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1413                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1414             }
1415 
1416             // Clear out this test case.
1417             //    The string and breakPositions vector will be refilled as the next
1418             //       test case is parsed.
1419             testString.remove();
1420             breakPositions.removeAllElements();
1421             lineNumber++;
1422         } else {
1423             // Scanner catchall.  Something unrecognized appeared on the line.
1424             char token[16];
1425             UnicodeString uToken = tokenMatcher.group(0, status);
1426             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1427             token[sizeof(token)-1] = 0;
1428             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1429 
1430             // Clean up, in preparation for continuing with the next line.
1431             testString.remove();
1432             breakPositions.removeAllElements();
1433             lineNumber++;
1434         }
1435         TEST_ASSERT_SUCCESS(status);
1436         if (U_FAILURE(status)) {
1437             break;
1438         }
1439     }
1440 
1441     delete [] testFile;
1442  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1443 }
1444 
1445 //--------------------------------------------------------------------------------------------
1446 //
1447 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1448 //                            test data files.  Do only a simple, forward-only check -
1449 //                            this test is mostly to check that ICU and the Unicode
1450 //                            data agree with each other.
1451 //
1452 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1453 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1454                          const UnicodeString &testString,   // Text data to be broken
1455                          UVector32 *breakPositions,         // Positions where breaks should be found.
1456                          RuleBasedBreakIterator *bi) {
1457     int32_t pos;                 // Break Position in the test string
1458     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1459     int32_t expectedPos;         // Expected break position (index into test string)
1460 
1461     bi->setText(testString);
1462     pos = bi->first();
1463     pos = bi->next();
1464 
1465     bool error = false;
1466     std::set<int32_t> actualBreaks;
1467     std::set<int32_t> expectedBreaks;
1468     while (pos != BreakIterator::DONE) {
1469         actualBreaks.insert(pos);
1470         if (expectedI >= breakPositions->size()) {
1471             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1472                 testFileName, lineNumber, pos);
1473             error = true;
1474             break;
1475         }
1476         expectedPos = breakPositions->elementAti(expectedI);
1477         expectedBreaks.insert(expectedPos);
1478         if (pos < expectedPos) {
1479             errln("Test file \"%s\", line %d, unexpected break found at position %d", testFileName,
1480                   lineNumber, pos);
1481             error = true;
1482             break;
1483         }
1484         if (pos > expectedPos) {
1485             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1486                   testFileName, lineNumber, expectedPos);
1487             error = true;
1488             break;
1489         }
1490         pos = bi->next();
1491         expectedI++;
1492     }
1493 
1494     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1495         errln("Test file \"%s\", line %d, failed to find expected break at position %d", testFileName,
1496               lineNumber, breakPositions->elementAti(expectedI));
1497         error = true;
1498     }
1499 
1500     if (error) {
1501         for (; pos != BreakIterator::DONE; pos = bi->next()) {
1502             actualBreaks.insert(pos);
1503         }
1504         for (; expectedI < breakPositions->size(); ++expectedI) {
1505             expectedBreaks.insert(breakPositions->elementAti(expectedI));
1506         }
1507         UnicodeString expected;
1508         UnicodeString actual;
1509         for (int32_t i = 0; i < testString.length();) {
1510             const UChar32 c = testString.char32At(i);
1511             i += U16_LENGTH(c);
1512             expected += expectedBreaks.count(i) == 1 ? u"÷" : u"×";
1513             actual += actualBreaks.count(i) == 1 ? u"÷" : u"×";
1514             expected += c;
1515             actual += c;
1516         }
1517         expected += expectedBreaks.count(testString.length()) == 1 ? u"÷" : u"×";
1518         actual += actualBreaks.count(testString.length()) == 1 ? u"÷" : u"×";
1519         errln("Expected : " + expected);
1520         errln("Actual   : " + actual);
1521     }
1522 }
1523 
1524 
1525 
1526 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1527 //---------------------------------------------------------------------------------------
1528 //
1529 //   class RBBIMonkeyKind
1530 //
1531 //      Monkey Test for Break Iteration
1532 //      Abstract interface class.   Concrete derived classes independently
1533 //      implement the break rules for different iterator types.
1534 //
1535 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1536 //      testing, but works purely in terms of the interface defined here.
1537 //
1538 //---------------------------------------------------------------------------------------
1539 class RBBIMonkeyKind {
1540 public:
1541     // Return a UVector of UnicodeSets, representing the character classes used
1542     //   for this type of iterator.
1543     virtual  UVector  *charClasses() = 0;
1544 
1545     // Set the test text on which subsequent calls to next() will operate
1546     virtual  void      setText(const UnicodeString &s) = 0;
1547 
1548     // Find the next break position, starting from the prev break position, or from zero.
1549     // Return -1 after reaching end of string.
1550     virtual  int32_t   next(int32_t i) = 0;
1551 
1552     // Name of each character class, parallel with charClasses. Used for debugging output
1553     // of characters.
1554     virtual  std::vector<std::string>&     characterClassNames();
1555 
1556     void setAppliedRule(int32_t position, const char* value);
1557 
1558     std::string getAppliedRule(int32_t position);
1559 
1560     virtual ~RBBIMonkeyKind();
1561     UErrorCode deferredStatus;
1562 
1563     std::string classNameFromCodepoint(const UChar32 c);
1564     unsigned int maxClassNameSize();
1565 
1566  protected:
1567      RBBIMonkeyKind();
1568      std::vector<std::string> classNames;
1569      std::vector<std::string> appliedRules;
1570 
1571     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1572     void prepareAppliedRules(int32_t size );
1573 
1574  private:
1575 
1576 };
1577 
RBBIMonkeyKind()1578 RBBIMonkeyKind::RBBIMonkeyKind() {
1579     deferredStatus = U_ZERO_ERROR;
1580 }
1581 
~RBBIMonkeyKind()1582 RBBIMonkeyKind::~RBBIMonkeyKind() {
1583 }
1584 
characterClassNames()1585 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1586     return classNames;
1587 }
1588 
prepareAppliedRules(int32_t size)1589 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1590     // Remove all the information in the `appliedRules`.
1591     appliedRules.clear();
1592     appliedRules.resize(size + 1);
1593 }
1594 
setAppliedRule(int32_t position,const char * value)1595 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1596     appliedRules[position] = value;
1597 }
1598 
getAppliedRule(int32_t position)1599 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1600     return appliedRules[position];
1601 }
1602 
classNameFromCodepoint(const UChar32 c)1603 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1604     // Simply iterate through charClasses to find character's class
1605     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1606         UnicodeSet *classSet = static_cast<UnicodeSet *>(charClasses()->elementAt(aClassNum));
1607         if (classSet->contains(c)) {
1608             return classNames[aClassNum];
1609         }
1610     }
1611     U_ASSERT(false);  // This should not happen.
1612     return "bad class name";
1613 }
1614 
maxClassNameSize()1615 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1616     unsigned int maxSize = 0;
1617     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1618         auto aClassNumSize = static_cast<unsigned int>(classNames[aClassNum].size());
1619         if (aClassNumSize > maxSize) {
1620             maxSize = aClassNumSize;
1621         }
1622     }
1623     return maxSize;
1624 }
1625 
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value in the
// range 0..32767, taken from bits 16..30 of the new seed.
static uint32_t m_rand()
{
    constexpr uint32_t kMultiplier = 1103515245;
    constexpr uint32_t kIncrement = 12345;
    m_seed = m_seed * kMultiplier + kIncrement;   // unsigned arithmetic wraps modulo 2^32
    return (m_seed >> 16) & 0x7FFF;
}
1641 
1642 
1643 //------------------------------------------------------------------------------------------
1644 //
1645 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1646 //                             of RBBIMonkeyKind.
1647 //
1648 //------------------------------------------------------------------------------------------
1649 class RBBICharMonkey: public RBBIMonkeyKind {
1650 public:
1651     RBBICharMonkey();
1652     virtual          ~RBBICharMonkey();
1653     virtual  UVector *charClasses() override;
1654     virtual  void     setText(const UnicodeString &s) override;
1655     virtual  int32_t  next(int32_t i) override;
1656 private:
1657     UVector   *fSets;
1658 
1659     UnicodeSet  *fCRLFSet;
1660     UnicodeSet  *fControlSet;
1661     UnicodeSet  *fExtendSet;
1662     UnicodeSet  *fZWJSet;
1663     UnicodeSet  *fRegionalIndicatorSet;
1664     UnicodeSet  *fPrependSet;
1665     UnicodeSet  *fSpacingSet;
1666     UnicodeSet  *fLSet;
1667     UnicodeSet  *fVSet;
1668     UnicodeSet  *fTSet;
1669     UnicodeSet  *fLVSet;
1670     UnicodeSet  *fLVTSet;
1671     UnicodeSet  *fHangulSet;
1672     UnicodeSet  *fExtendedPictSet;
1673     UnicodeSet  *fViramaSet;
1674     UnicodeSet  *fLinkingConsonantSet;
1675     UnicodeSet  *fExtCccZwjSet;
1676     UnicodeSet  *fAnySet;
1677 
1678     const UnicodeString *fText;
1679 };
1680 
1681 
RBBICharMonkey()1682 RBBICharMonkey::RBBICharMonkey() {
1683     UErrorCode  status = U_ZERO_ERROR;
1684 
1685     fText = nullptr;
1686 
1687     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1688     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1689     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1690     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1691     fRegionalIndicatorSet =
1692                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1693     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1694     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1695     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1696     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1697     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1698     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1699     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1700     fHangulSet  = new UnicodeSet();
1701     fHangulSet->addAll(*fLSet);
1702     fHangulSet->addAll(*fVSet);
1703     fHangulSet->addAll(*fTSet);
1704     fHangulSet->addAll(*fLVSet);
1705     fHangulSet->addAll(*fLVTSet);
1706 
1707     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1708     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1709                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1710     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1711                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1712     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1713     fAnySet           = new UnicodeSet(0, 0x10ffff);
1714 
1715     // Create sets of characters, and add the names of the above character sets.
1716     // In each new ICU release, add new names corresponding to the sets above.
1717     fSets             = new UVector(status);
1718 
1719     // Important: Keep class names the same as the class contents.
1720     fSets->addElement(fCRLFSet, status); classNames.emplace_back("CRLF");
1721     fSets->addElement(fControlSet, status); classNames.emplace_back("Control");
1722     fSets->addElement(fExtendSet, status); classNames.emplace_back("Extended");
1723     fSets->addElement(fRegionalIndicatorSet, status); classNames.emplace_back("RegionalIndicator");
1724     if (!fPrependSet->isEmpty()) {
1725         fSets->addElement(fPrependSet, status); classNames.emplace_back("Prepend");
1726     }
1727     fSets->addElement(fSpacingSet, status); classNames.emplace_back("Spacing");
1728     fSets->addElement(fHangulSet, status); classNames.emplace_back("Hangul");
1729     fSets->addElement(fZWJSet, status); classNames.emplace_back("ZWJ");
1730     fSets->addElement(fExtendedPictSet, status); classNames.emplace_back("ExtendedPict");
1731     fSets->addElement(fViramaSet, status); classNames.emplace_back("Virama");
1732     fSets->addElement(fLinkingConsonantSet, status); classNames.emplace_back("LinkingConsonant");
1733     fSets->addElement(fExtCccZwjSet, status); classNames.emplace_back("ExtCcccZwj");
1734     fSets->addElement(fAnySet, status); classNames.emplace_back("Any");
1735 
1736     if (U_FAILURE(status)) {
1737         deferredStatus = status;
1738     }
1739 }
1740 
1741 
setText(const UnicodeString & s)1742 void RBBICharMonkey::setText(const UnicodeString &s) {
1743     fText = &s;
1744     prepareAppliedRules(s.length());
1745 }
1746 
1747 
1748 
next(int32_t prevPos)1749 int32_t RBBICharMonkey::next(int32_t prevPos) {
1750     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1751                               //   break position being tested.  The candidate break
1752                               //   location is before p2.
1753 
1754     int     breakPos = -1;
1755 
1756     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1757     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1758 
1759     if (U_FAILURE(deferredStatus)) {
1760         return -1;
1761     }
1762 
1763     // Previous break at end of string.  return DONE.
1764     if (prevPos >= fText->length()) {
1765         return -1;
1766     }
1767 
1768     p0 = p1 = p2 = p3 = prevPos;
1769     c3 =  fText->char32At(prevPos);
1770     c0 = c1 = c2 = cBase = 0;
1771     (void)p0;   // suppress set but not used warning.
1772     (void)c0;
1773 
1774     // Loop runs once per "significant" character position in the input text.
1775     for (;;) {
1776         // Move all of the positions forward in the input string.
1777         p0 = p1;  c0 = c1;
1778         p1 = p2;  c1 = c2;
1779         p2 = p3;  c2 = c3;
1780 
1781         // Advance p3 by one codepoint
1782         p3 = fText->moveIndex32(p3, 1);
1783         c3 = fText->char32At(p3);
1784 
1785         if (p1 == p2) {
1786             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1787             continue;
1788         }
1789 
1790         if (p2 == fText->length()) {
1791             setAppliedRule(p2, "End of String");
1792             break;
1793         }
1794 
1795         //     No Extend or Format characters may appear between the CR and LF,
1796         //     which requires the additional check for p2 immediately following p1.
1797         //
1798         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1799           setAppliedRule(p2, "GB3   CR x LF");
1800           continue;
1801         }
1802 
1803         if (fControlSet->contains(c1) ||
1804             c1 == 0x0D ||
1805             c1 == 0x0A)  {
1806           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1807           break;
1808         }
1809 
1810         if (fControlSet->contains(c2) ||
1811             c2 == 0x0D ||
1812             c2 == 0x0A)  {
1813             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1814             break;
1815         }
1816 
1817         if (fLSet->contains(c1) &&
1818                (fLSet->contains(c2)  ||
1819                 fVSet->contains(c2)  ||
1820                 fLVSet->contains(c2) ||
1821                 fLVTSet->contains(c2))) {
1822             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1823             continue;
1824         }
1825 
1826         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1827             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1828             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1829             continue;
1830         }
1831 
1832         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1833             fTSet->contains(c2))  {
1834             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1835             continue;
1836         }
1837 
1838         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1839             if (!fExtendSet->contains(c1)) {
1840                 cBase = c1;
1841             }
1842             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1843             continue;
1844         }
1845 
1846         if (fSpacingSet->contains(c2)) {
1847             setAppliedRule(p2, "GB9a  x  SpacingMark");
1848             continue;
1849         }
1850 
1851         if (fPrependSet->contains(c1)) {
1852             setAppliedRule(p2, "GB9b  Prepend x");
1853             continue;
1854         }
1855 
1856         //   Note: Viramas are also included in the ExtCccZwj class.
1857         if (fLinkingConsonantSet->contains(c2)) {
1858             int pi = p1;
1859             bool sawVirama = false;
1860             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1861                 if (fViramaSet->contains(fText->char32At(pi))) {
1862                     sawVirama = true;
1863                 }
1864                 pi = fText->moveIndex32(pi, -1);
1865             }
1866             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1867               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
1868               continue;
1869             }
1870         }
1871 
1872         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1873           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1874           continue;
1875         }
1876 
1877         //                   Note: The first if condition is a little tricky. We only need to force
1878         //                      a break if there are three or more contiguous RIs. If there are
1879         //                      only two, a break following will occur via other rules, and will include
1880         //                      any trailing extend characters, which is needed behavior.
1881         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1882                 && fRegionalIndicatorSet->contains(c2)) {
1883           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1884           break;
1885         }
1886         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1887           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1888           continue;
1889         }
1890 
1891         setAppliedRule(p2, "GB999 Any <break> Any");
1892         break;
1893     }
1894 
1895     breakPos = p2;
1896     return breakPos;
1897 }
1898 
1899 
1900 
charClasses()1901 UVector  *RBBICharMonkey::charClasses() {
1902     return fSets;
1903 }
1904 
~RBBICharMonkey()1905 RBBICharMonkey::~RBBICharMonkey() {
1906     delete fSets;
1907     delete fCRLFSet;
1908     delete fControlSet;
1909     delete fExtendSet;
1910     delete fRegionalIndicatorSet;
1911     delete fPrependSet;
1912     delete fSpacingSet;
1913     delete fLSet;
1914     delete fVSet;
1915     delete fTSet;
1916     delete fLVSet;
1917     delete fLVTSet;
1918     delete fHangulSet;
1919     delete fAnySet;
1920     delete fZWJSet;
1921     delete fExtendedPictSet;
1922     delete fViramaSet;
1923     delete fLinkingConsonantSet;
1924     delete fExtCccZwjSet;
1925 }
1926 
1927 //------------------------------------------------------------------------------------------
1928 //
1929 //   class RBBIWordMonkey      Word Break specific implementation
1930 //                             of RBBIMonkeyKind.
1931 //
1932 //------------------------------------------------------------------------------------------
1933 class RBBIWordMonkey: public RBBIMonkeyKind {
1934 public:
1935     RBBIWordMonkey();
1936     virtual          ~RBBIWordMonkey();
1937     virtual  UVector *charClasses() override;
1938     virtual  void     setText(const UnicodeString &s) override;
1939     virtual int32_t   next(int32_t i) override;
1940 private:
1941     UVector      *fSets;
1942 
1943     UnicodeSet  *fCRSet;
1944     UnicodeSet  *fLFSet;
1945     UnicodeSet  *fNewlineSet;
1946     UnicodeSet  *fRegionalIndicatorSet;
1947     UnicodeSet  *fKatakanaSet;
1948     UnicodeSet  *fHebrew_LetterSet;
1949     UnicodeSet  *fALetterSet;
1950     UnicodeSet  *fSingle_QuoteSet;
1951     UnicodeSet  *fDouble_QuoteSet;
1952     UnicodeSet  *fMidNumLetSet;
1953     UnicodeSet  *fMidLetterSet;
1954     UnicodeSet  *fMidNumSet;
1955     UnicodeSet  *fNumericSet;
1956     UnicodeSet  *fFormatSet;
1957     UnicodeSet  *fOtherSet = nullptr;
1958     UnicodeSet  *fExtendSet;
1959     UnicodeSet  *fExtendNumLetSet;
1960     UnicodeSet  *fWSegSpaceSet;
1961     UnicodeSet  *fDictionarySet = nullptr;
1962     UnicodeSet  *fZWJSet;
1963     UnicodeSet  *fExtendedPictSet;
1964 
1965     const UnicodeString  *fText;
1966 };
1967 
1968 
RBBIWordMonkey()1969 RBBIWordMonkey::RBBIWordMonkey()
1970 {
1971     UErrorCode  status = U_ZERO_ERROR;
1972 
1973     fSets            = new UVector(status);
1974 
1975     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1976     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1977     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1978     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1979     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1980     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1981     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1982     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1983     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1984     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1985     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\: \\uFE55 \\uFF1A]]",    status);
1986     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1987     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
1988     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1989     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1990     // There are some sc=Hani characters with WB=Extend.
1991     // The break rules need to pick one or the other because
1992     // Extend overlapping with something else is messy.
1993     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
1994     // in $Han (for $dictionary) and out of $Extend.
1995     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
1996     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1997 
1998     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1999     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2000     if(U_FAILURE(status)) {
2001         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2002         deferredStatus = status;
2003         return;
2004     }
2005 
2006     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2007     fDictionarySet->addAll(*fKatakanaSet);
2008     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2009 
2010     fALetterSet->removeAll(*fDictionarySet);
2011 
2012     fOtherSet        = new UnicodeSet();
2013     if(U_FAILURE(status)) {
2014         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2015         deferredStatus = status;
2016         return;
2017     }
2018 
2019     fOtherSet->complement();
2020     fOtherSet->removeAll(*fCRSet);
2021     fOtherSet->removeAll(*fLFSet);
2022     fOtherSet->removeAll(*fNewlineSet);
2023     fOtherSet->removeAll(*fKatakanaSet);
2024     fOtherSet->removeAll(*fHebrew_LetterSet);
2025     fOtherSet->removeAll(*fALetterSet);
2026     fOtherSet->removeAll(*fSingle_QuoteSet);
2027     fOtherSet->removeAll(*fDouble_QuoteSet);
2028     fOtherSet->removeAll(*fMidLetterSet);
2029     fOtherSet->removeAll(*fMidNumSet);
2030     fOtherSet->removeAll(*fNumericSet);
2031     fOtherSet->removeAll(*fExtendNumLetSet);
2032     fOtherSet->removeAll(*fWSegSpaceSet);
2033     fOtherSet->removeAll(*fFormatSet);
2034     fOtherSet->removeAll(*fExtendSet);
2035     fOtherSet->removeAll(*fRegionalIndicatorSet);
2036     fOtherSet->removeAll(*fZWJSet);
2037     fOtherSet->removeAll(*fExtendedPictSet);
2038 
2039     // Inhibit dictionary characters from being tested at all.
2040     fOtherSet->removeAll(*fDictionarySet);
2041 
2042     // Add classes and their names
2043     fSets->addElement(fCRSet, status); classNames.emplace_back("CR");
2044     fSets->addElement(fLFSet, status); classNames.emplace_back("LF");
2045     fSets->addElement(fNewlineSet, status); classNames.emplace_back("Newline");
2046     fSets->addElement(fRegionalIndicatorSet, status); classNames.emplace_back("RegionalIndicator");
2047     fSets->addElement(fHebrew_LetterSet, status); classNames.emplace_back("Hebrew");
2048     fSets->addElement(fALetterSet, status); classNames.emplace_back("ALetter");
2049     fSets->addElement(fSingle_QuoteSet, status); classNames.emplace_back("Single Quote");
2050     fSets->addElement(fDouble_QuoteSet, status); classNames.emplace_back("Double Quote");
2051     // Omit Katakana from fSets, which omits Katakana characters
2052     // from the test data. They are all in the dictionary set,
2053     // which this (old, to be retired) monkey test cannot handle.
2054     //fSets->addElement(fKatakanaSet, status);
2055 
2056     fSets->addElement(fMidLetterSet, status); classNames.emplace_back("MidLetter");
2057     fSets->addElement(fMidNumLetSet, status); classNames.emplace_back("MidNumLet");
2058     fSets->addElement(fMidNumSet, status); classNames.emplace_back("MidNum");
2059     fSets->addElement(fNumericSet, status); classNames.emplace_back("Numeric");
2060     fSets->addElement(fFormatSet, status); classNames.emplace_back("Format");
2061     fSets->addElement(fExtendSet, status); classNames.emplace_back("Extend");
2062     fSets->addElement(fOtherSet, status); classNames.emplace_back("Other");
2063     fSets->addElement(fExtendNumLetSet, status); classNames.emplace_back("ExtendNumLet");
2064     fSets->addElement(fWSegSpaceSet, status); classNames.emplace_back("WSegSpace");
2065 
2066     fSets->addElement(fZWJSet, status); classNames.emplace_back("ZWJ");
2067     fSets->addElement(fExtendedPictSet, status); classNames.emplace_back("ExtendedPict");
2068 
2069     if (U_FAILURE(status)) {
2070         deferredStatus = status;
2071     }
2072 }
2073 
setText(const UnicodeString & s)2074 void RBBIWordMonkey::setText(const UnicodeString &s) {
2075     fText       = &s;
2076     prepareAppliedRules(s.length());
2077 }
2078 
2079 
next(int32_t prevPos)2080 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2081     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2082                               //   break position being tested.  The candidate break
2083                               //   location is before p2.
2084 
2085     int     breakPos = -1;
2086 
2087     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2088 
2089     if (U_FAILURE(deferredStatus)) {
2090         return -1;
2091     }
2092 
2093     // Prev break at end of string.  return DONE.
2094     if (prevPos >= fText->length()) {
2095         return -1;
2096     }
2097     p0 = p1 = p2 = p3 = prevPos;
2098     c3 =  fText->char32At(prevPos);
2099     c0 = c1 = c2 = 0;
2100     (void)p0;       // Suppress set but not used warning.
2101 
2102     // Loop runs once per "significant" character position in the input text.
2103     for (;;) {
2104         // Move all of the positions forward in the input string.
2105         p0 = p1;  c0 = c1;
2106         p1 = p2;  c1 = c2;
2107         p2 = p3;  c2 = c3;
2108 
2109         // Advance p3 by    X(Extend | Format)*   Rule 4
2110         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2111         do {
2112             p3 = fText->moveIndex32(p3, 1);
2113             c3 = fText->char32At(p3);
2114             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2115                break;
2116             }
2117         }
2118         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2119 
2120 
2121         if (p1 == p2) {
2122             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2123             continue;
2124         }
2125 
2126         if (p2 == fText->length()) {
2127             // Reached end of string.  Always a break position.
2128             break;
2129         }
2130 
2131         //     No Extend or Format characters may appear between the CR and LF,
2132         //     which requires the additional check for p2 immediately following p1.
2133         //
2134         if (c1==0x0D && c2==0x0A) {
2135           setAppliedRule(p2, "WB3   CR x LF");
2136           continue;
2137         }
2138 
2139         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2140             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2141             break;
2142         }
2143         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2144             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2145             break;
2146         }
2147 
2148         //              Not ignoring extend chars, so peek into input text to
2149         //              get the potential ZWJ, the character immediately preceding c2.
2150         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2151         //              but char32At will get the full code point.
2152         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2153             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2154             continue;
2155         }
2156 
2157         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2158             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2159             continue;
2160         }
2161 
2162         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2163             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2164             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2165             continue;
2166         }
2167 
2168         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2169              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2170              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2171             setAppliedRule(p2,
2172                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2173             continue;
2174         }
2175 
2176         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2177             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2178             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2179             setAppliedRule(p2,
2180                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2181             continue;
2182         }
2183 
2184         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2185             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2186             continue;
2187         }
2188 
2189           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2190             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2191             continue;
2192         }
2193 
2194         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2195             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2196             continue;
2197         }
2198 
2199         if (fNumericSet->contains(c1) &&
2200             fNumericSet->contains(c2)) {
2201             setAppliedRule(p2, "WB8   Numeric x Numeric");
2202             continue;
2203         }
2204 
2205         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2206             fNumericSet->contains(c2)) {
2207             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2208             continue;
2209         }
2210 
2211         if (fNumericSet->contains(c1) &&
2212             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2213             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2214             continue;
2215         }
2216 
2217           if (fNumericSet->contains(c0) &&
2218             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2219             fNumericSet->contains(c2)) {
2220             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2221             continue;
2222         }
2223 
2224         if (fNumericSet->contains(c1) &&
2225             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2226             fNumericSet->contains(c3)) {
2227             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2228             continue;
2229         }
2230 
2231         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2232         //                  all Katakana are handled by the dictionary breaker.
2233         if (fKatakanaSet->contains(c1) &&
2234             fKatakanaSet->contains(c2))  {
2235             setAppliedRule(p2, "WB13  Katakana x Katakana");
2236             continue;
2237         }
2238 
2239         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2240              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2241              fExtendNumLetSet->contains(c2)) {
2242             setAppliedRule(p2,
2243                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2244             continue;
2245         }
2246 
2247         if (fExtendNumLetSet->contains(c1) &&
2248                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2249                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2250             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2251             continue;
2252         }
2253 
2254         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2255             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2256             break;
2257         }
2258         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2259             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2260             continue;
2261         }
2262 
2263         setAppliedRule(p2, "WB999");
2264         break;
2265     }
2266 
2267     breakPos = p2;
2268     return breakPos;
2269 }
2270 
2271 
charClasses()2272 UVector  *RBBIWordMonkey::charClasses() {
2273     return fSets;
2274 }
2275 
~RBBIWordMonkey()2276 RBBIWordMonkey::~RBBIWordMonkey() {
2277     delete fSets;
2278     delete fCRSet;
2279     delete fLFSet;
2280     delete fNewlineSet;
2281     delete fKatakanaSet;
2282     delete fHebrew_LetterSet;
2283     delete fALetterSet;
2284     delete fSingle_QuoteSet;
2285     delete fDouble_QuoteSet;
2286     delete fMidNumLetSet;
2287     delete fMidLetterSet;
2288     delete fMidNumSet;
2289     delete fNumericSet;
2290     delete fFormatSet;
2291     delete fExtendSet;
2292     delete fExtendNumLetSet;
2293     delete fWSegSpaceSet;
2294     delete fRegionalIndicatorSet;
2295     delete fDictionarySet;
2296     delete fOtherSet;
2297     delete fZWJSet;
2298     delete fExtendedPictSet;
2299 }
2300 
2301 
2302 
2303 
2304 //------------------------------------------------------------------------------------------
2305 //
2306 //   class RBBISentMonkey      Sentence Break specific implementation
2307 //                             of RBBIMonkeyKind.
2308 //
2309 //------------------------------------------------------------------------------------------
2310 class RBBISentMonkey: public RBBIMonkeyKind {
2311 public:
2312     RBBISentMonkey();
2313     virtual          ~RBBISentMonkey();
2314     virtual  UVector *charClasses() override;
2315     virtual  void     setText(const UnicodeString &s) override;
2316     virtual int32_t   next(int32_t i) override;
2317 private:
2318     int               moveBack(int posFrom);
2319     int               moveForward(int posFrom);
2320     UChar32           cAt(int pos);
2321 
2322     UVector      *fSets;
2323 
2324     UnicodeSet  *fSepSet;
2325     UnicodeSet  *fFormatSet;
2326     UnicodeSet  *fSpSet;
2327     UnicodeSet  *fLowerSet;
2328     UnicodeSet  *fUpperSet;
2329     UnicodeSet  *fOLetterSet;
2330     UnicodeSet  *fNumericSet;
2331     UnicodeSet  *fATermSet;
2332     UnicodeSet  *fSContinueSet;
2333     UnicodeSet  *fSTermSet;
2334     UnicodeSet  *fCloseSet;
2335     UnicodeSet  *fOtherSet;
2336     UnicodeSet  *fExtendSet;
2337 
2338     const UnicodeString  *fText;
2339 };
2340 
RBBISentMonkey()2341 RBBISentMonkey::RBBISentMonkey()
2342 {
2343     UErrorCode  status = U_ZERO_ERROR;
2344 
2345     fSets            = new UVector(status);
2346 
2347     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2348     //                       set and made into character classes of their own.  For the monkey impl,
2349     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2350     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2351     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2352     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2353     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2354     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2355     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2356     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2357     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2358     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2359     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2360     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2361     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2362     fOtherSet        = new UnicodeSet();
2363 
2364     if(U_FAILURE(status)) {
2365       deferredStatus = status;
2366       return;
2367     }
2368 
2369     fOtherSet->complement();
2370     fOtherSet->removeAll(*fSepSet);
2371     fOtherSet->removeAll(*fFormatSet);
2372     fOtherSet->removeAll(*fSpSet);
2373     fOtherSet->removeAll(*fLowerSet);
2374     fOtherSet->removeAll(*fUpperSet);
2375     fOtherSet->removeAll(*fOLetterSet);
2376     fOtherSet->removeAll(*fNumericSet);
2377     fOtherSet->removeAll(*fATermSet);
2378     fOtherSet->removeAll(*fSContinueSet);
2379     fOtherSet->removeAll(*fSTermSet);
2380     fOtherSet->removeAll(*fCloseSet);
2381     fOtherSet->removeAll(*fExtendSet);
2382 
2383     fSets->addElement(fSepSet, status); classNames.emplace_back("Sep");
2384     fSets->addElement(fFormatSet, status); classNames.emplace_back("Format");
2385     fSets->addElement(fSpSet, status); classNames.emplace_back("Sp");
2386     fSets->addElement(fLowerSet, status); classNames.emplace_back("Lower");
2387     fSets->addElement(fUpperSet, status); classNames.emplace_back("Upper");
2388     fSets->addElement(fOLetterSet, status); classNames.emplace_back("OLetter");
2389     fSets->addElement(fNumericSet, status); classNames.emplace_back("Numeric");
2390     fSets->addElement(fATermSet, status); classNames.emplace_back("ATerm");
2391     fSets->addElement(fSContinueSet, status); classNames.emplace_back("SContinue");
2392     fSets->addElement(fSTermSet, status); classNames.emplace_back("STerm");
2393     fSets->addElement(fCloseSet, status); classNames.emplace_back("Close");
2394     fSets->addElement(fOtherSet, status); classNames.emplace_back("Other");
2395     fSets->addElement(fExtendSet, status); classNames.emplace_back("Extend");
2396 
2397     if (U_FAILURE(status)) {
2398         deferredStatus = status;
2399     }
2400 }
2401 
2402 
2403 
setText(const UnicodeString & s)2404 void RBBISentMonkey::setText(const UnicodeString &s) {
2405     fText       = &s;
2406     prepareAppliedRules(s.length());
2407 }
2408 
charClasses()2409 UVector  *RBBISentMonkey::charClasses() {
2410     return fSets;
2411 }
2412 
2413 //  moveBack()   Find the "significant" code point preceding the index i.
2414 //               Skips over ($Extend | $Format)* .
2415 //
moveBack(int i)2416 int RBBISentMonkey::moveBack(int i) {
2417     if (i <= 0) {
2418         return -1;
2419     }
2420     UChar32   c;
2421     int32_t   j = i;
2422     do {
2423         j = fText->moveIndex32(j, -1);
2424         c = fText->char32At(j);
2425     }
2426     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2427     return j;
2428 
2429  }
2430 
2431 
moveForward(int i)2432 int RBBISentMonkey::moveForward(int i) {
2433     if (i>=fText->length()) {
2434         return fText->length();
2435     }
2436     UChar32   c;
2437     int32_t   j = i;
2438     do {
2439         j = fText->moveIndex32(j, 1);
2440         c = cAt(j);
2441     }
2442     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2443     return j;
2444 }
2445 
cAt(int pos)2446 UChar32 RBBISentMonkey::cAt(int pos) {
2447     if (pos<0 || pos>=fText->length()) {
2448         return -1;
2449     } else {
2450         return fText->char32At(pos);
2451     }
2452 }
2453 
next(int32_t prevPos)2454 int32_t RBBISentMonkey::next(int32_t prevPos) {
2455     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2456                               //   break position being tested.  The candidate break
2457                               //   location is before p2.
2458 
2459     int     breakPos = -1;
2460 
2461     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2462     UChar32 c;
2463 
2464     if (U_FAILURE(deferredStatus)) {
2465         return -1;
2466     }
2467 
2468     // Prev break at end of string.  return DONE.
2469     if (prevPos >= fText->length()) {
2470         return -1;
2471     }
2472     p0 = p1 = p2 = p3 = prevPos;
2473     c3 =  fText->char32At(prevPos);
2474     c0 = c1 = c2 = 0;
2475     (void)p0;     // Suppress set but not used warning.
2476 
2477     // Loop runs once per "significant" character position in the input text.
2478     for (;;) {
2479         // Move all of the positions forward in the input string.
2480         p0 = p1;  c0 = c1;
2481         p1 = p2;  c1 = c2;
2482         p2 = p3;  c2 = c3;
2483 
2484         // Advance p3 by    X(Extend | Format)*   Rule 4
2485         p3 = moveForward(p3);
2486         c3 = cAt(p3);
2487 
2488         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2489             setAppliedRule(p2, "SB3   CR x LF");
2490             continue;
2491         }
2492 
2493         if (fSepSet->contains(c1)) {
2494             p2 = p1+1;   // Separators don't combine with Extend or Format.
2495 
2496             setAppliedRule(p2, "SB4   Sep  <break>");
2497             break;
2498         }
2499 
2500         if (p2 >= fText->length()) {
2501             // Reached end of string.  Always a break position.
2502             setAppliedRule(p2, "SB4   Sep  <break>");
2503             break;
2504         }
2505 
2506         if (p2 == prevPos) {
2507             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2508             setAppliedRule(p2, "SB4   Sep  <break>");
2509             continue;
2510         }
2511 
2512         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2513             setAppliedRule(p2, "SB6   ATerm x Numeric");
2514             continue;
2515         }
2516 
2517           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2518                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2519             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2520             continue;
2521         }
2522 
2523         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2524         //                  note to the Unicode 5.0 documents.
2525         int p8 = p1;
2526         while (fSpSet->contains(cAt(p8))) {
2527             p8 = moveBack(p8);
2528         }
2529         while (fCloseSet->contains(cAt(p8))) {
2530             p8 = moveBack(p8);
2531         }
2532         if (fATermSet->contains(cAt(p8))) {
2533             p8=p2;
2534             for (;;) {
2535                 c = cAt(p8);
2536                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2537                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2538                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2539 
2540                     setAppliedRule(p2,
2541                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2542                     break;
2543                 }
2544                 p8 = moveForward(p8);
2545             }
2546             if (fLowerSet->contains(cAt(p8))) {
2547 
2548                 setAppliedRule(p2,
2549                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2550                 continue;
2551             }
2552         }
2553 
2554         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2555             p8 = p1;
2556             while (fSpSet->contains(cAt(p8))) {
2557                 p8 = moveBack(p8);
2558             }
2559             while (fCloseSet->contains(cAt(p8))) {
2560                 p8 = moveBack(p8);
2561             }
2562             c = cAt(p8);
2563             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2564                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2565                 continue;
2566             }
2567         }
2568 
2569         int p9 = p1;
2570         while (fCloseSet->contains(cAt(p9))) {
2571             p9 = moveBack(p9);
2572         }
2573         c = cAt(p9);
2574         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2575             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2576 
2577                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2578                 continue;
2579             }
2580         }
2581 
2582         int p10 = p1;
2583         while (fSpSet->contains(cAt(p10))) {
2584             p10 = moveBack(p10);
2585         }
2586         while (fCloseSet->contains(cAt(p10))) {
2587             p10 = moveBack(p10);
2588         }
2589         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2590             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2591                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2592                 continue;
2593             }
2594         }
2595 
2596         int p11 = p1;
2597         if (fSepSet->contains(cAt(p11))) {
2598             p11 = moveBack(p11);
2599         }
2600         while (fSpSet->contains(cAt(p11))) {
2601             p11 = moveBack(p11);
2602         }
2603         while (fCloseSet->contains(cAt(p11))) {
2604             p11 = moveBack(p11);
2605         }
2606         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2607           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2608             break;
2609         }
2610 
2611         setAppliedRule(p2, "SB12  Any x Any");
2612     }
2613 
2614     breakPos = p2;
2615     return breakPos;
2616 }
2617 
~RBBISentMonkey()2618 RBBISentMonkey::~RBBISentMonkey() {
2619     delete fSets;
2620     delete fSepSet;
2621     delete fFormatSet;
2622     delete fSpSet;
2623     delete fLowerSet;
2624     delete fUpperSet;
2625     delete fOLetterSet;
2626     delete fNumericSet;
2627     delete fATermSet;
2628     delete fSContinueSet;
2629     delete fSTermSet;
2630     delete fCloseSet;
2631     delete fOtherSet;
2632     delete fExtendSet;
2633 }
2634 
2635 
2636 
2637 //-------------------------------------------------------------------------------------------
2638 //
2639 //  RBBILineMonkey
2640 //
2641 //-------------------------------------------------------------------------------------------
2642 
2643 class RBBILineMonkey: public RBBIMonkeyKind {
2644 public:
2645     RBBILineMonkey();
2646     virtual          ~RBBILineMonkey();
2647     virtual  UVector *charClasses() override;
2648     virtual  void     setText(const UnicodeString &s) override;
2649     virtual  int32_t  next(int32_t i) override;
2650     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2651 private:
2652     UVector      *fSets;
2653 
2654     UnicodeSet  *fBK;
2655     UnicodeSet  *fCR;
2656     UnicodeSet  *fLF;
2657     UnicodeSet  *fCM;
2658     UnicodeSet  *fNL;
2659     UnicodeSet  *fSG;
2660     UnicodeSet  *fWJ;
2661     UnicodeSet  *fZW;
2662     UnicodeSet  *fGL;
2663     UnicodeSet  *fCB;
2664     UnicodeSet  *fSP;
2665     UnicodeSet  *fB2;
2666     UnicodeSet  *fBA;
2667     UnicodeSet  *fBB;
2668     UnicodeSet  *fHH;
2669     UnicodeSet  *fHY;
2670     UnicodeSet  *fH2;
2671     UnicodeSet  *fH3;
2672     UnicodeSet  *fCL;
2673     UnicodeSet  *fCP;
2674     UnicodeSet  *fEX;
2675     UnicodeSet  *fIN;
2676     UnicodeSet  *fJL;
2677     UnicodeSet  *fJV;
2678     UnicodeSet  *fJT;
2679     UnicodeSet  *fNS;
2680     UnicodeSet  *fOP;
2681     UnicodeSet  *fQU;
2682     UnicodeSet  *fIS;
2683     UnicodeSet  *fNU;
2684     UnicodeSet  *fPO;
2685     UnicodeSet  *fPR;
2686     UnicodeSet  *fSY;
2687     UnicodeSet  *fAI;
2688     UnicodeSet  *fAL;
2689     UnicodeSet  *fCJ;
2690     UnicodeSet  *fHL;
2691     UnicodeSet  *fID;
2692     UnicodeSet  *fRI;
2693     UnicodeSet  *fXX;
2694     UnicodeSet  *fEB;
2695     UnicodeSet  *fEM;
2696     UnicodeSet  *fZWJ;
2697     UnicodeSet  *fOP30;
2698     UnicodeSet  *fCP30;
2699     UnicodeSet  *fExtPictUnassigned;
2700     UnicodeSet  *fAK;
2701     UnicodeSet  *fAP;
2702     UnicodeSet  *fAS;
2703     UnicodeSet  *fVF;
2704     UnicodeSet  *fVI;
2705     UnicodeSet  *fPi;
2706     UnicodeSet  *fPf;
2707 
2708     BreakIterator        *fCharBI;
2709     const UnicodeString  *fText;
2710     RegexMatcher         *fNumberMatcher;
2711 };
2712 
RBBILineMonkey()2713 RBBILineMonkey::RBBILineMonkey() :
2714     RBBIMonkeyKind(),
2715     fSets(nullptr),
2716 
2717     fCharBI(nullptr),
2718     fText(nullptr),
2719     fNumberMatcher(nullptr)
2720 
2721 {
2722     if (U_FAILURE(deferredStatus)) {
2723         return;
2724     }
2725 
2726     UErrorCode  status = U_ZERO_ERROR;
2727 
2728     fSets  = new UVector(status);
2729 
2730     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2731     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2732     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2733     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2734     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2735     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2736     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2737     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2738     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2739     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2740     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2741     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2742     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2743     fHH    = new UnicodeSet();
2744     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2745     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2746     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2747     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2748     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2749     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2750     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2751     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2752     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2753     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2754     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2755     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2756     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2757     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2758     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2759     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2760     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2761     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2762     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2763     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2764     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2765     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2766     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2767     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2768     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2769     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2770     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
2771     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2772     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2773     fOP30  = new UnicodeSet(u"[\\p{Line_break=OP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2774     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2775     fExtPictUnassigned = new UnicodeSet(u"[\\p{Extended_Pictographic}&\\p{Cn}]", status);
2776 
2777     fAK = new UnicodeSet(uR"([\p{Line_Break=AK}])", status);
2778     fAP = new UnicodeSet(uR"([\p{Line_Break=AP}])", status);
2779     fAS = new UnicodeSet(uR"([\p{Line_Break=AS}])", status);
2780     fVF = new UnicodeSet(uR"([\p{Line_Break=VF}])", status);
2781     fVI = new UnicodeSet(uR"([\p{Line_Break=VI}])", status);
2782 
2783     fPi = new UnicodeSet(uR"([\p{Pi}])", status);
2784     fPf = new UnicodeSet(uR"([\p{Pf}])", status);
2785 
2786     if (U_FAILURE(status)) {
2787         deferredStatus = status;
2788         return;
2789     }
2790 
2791     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2792     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2793     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2794 
2795     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2796     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2797 
2798     fHH->add(u'\u2010');   // Hyphen, '‐'
2799 
2800     // Sets and names.
2801     fSets->addElement(fBK, status); classNames.emplace_back("fBK");
2802     fSets->addElement(fCR, status); classNames.emplace_back("fCR");
2803     fSets->addElement(fLF, status); classNames.emplace_back("fLF");
2804     fSets->addElement(fCM, status); classNames.emplace_back("fCM");
2805     fSets->addElement(fNL, status); classNames.emplace_back("fNL");
2806     fSets->addElement(fWJ, status); classNames.emplace_back("fWJ");
2807     fSets->addElement(fZW, status); classNames.emplace_back("fZW");
2808     fSets->addElement(fGL, status); classNames.emplace_back("fGL");
2809     fSets->addElement(fCB, status); classNames.emplace_back("fCB");
2810     fSets->addElement(fSP, status); classNames.emplace_back("fSP");
2811     fSets->addElement(fB2, status); classNames.emplace_back("fB2");
2812     fSets->addElement(fBA, status); classNames.emplace_back("fBA");
2813     fSets->addElement(fBB, status); classNames.emplace_back("fBB");
2814     fSets->addElement(fHY, status); classNames.emplace_back("fHY");
2815     fSets->addElement(fH2, status); classNames.emplace_back("fH2");
2816     fSets->addElement(fH3, status); classNames.emplace_back("fH3");
2817     fSets->addElement(fCL, status); classNames.emplace_back("fCL");
2818     fSets->addElement(fCP, status); classNames.emplace_back("fCP");
2819     fSets->addElement(fEX, status); classNames.emplace_back("fEX");
2820     fSets->addElement(fIN, status); classNames.emplace_back("fIN");
2821     fSets->addElement(fJL, status); classNames.emplace_back("fJL");
2822     fSets->addElement(fJT, status); classNames.emplace_back("fJT");
2823     fSets->addElement(fJV, status); classNames.emplace_back("fJV");
2824     fSets->addElement(fNS, status); classNames.emplace_back("fNS");
2825     fSets->addElement(fOP, status); classNames.emplace_back("fOP");
2826     fSets->addElement(fQU, status); classNames.emplace_back("fQU");
2827     fSets->addElement(fIS, status); classNames.emplace_back("fIS");
2828     fSets->addElement(fNU, status); classNames.emplace_back("fNU");
2829     fSets->addElement(fPO, status); classNames.emplace_back("fPO");
2830     fSets->addElement(fPR, status); classNames.emplace_back("fPR");
2831     fSets->addElement(fSY, status); classNames.emplace_back("fSY");
2832     fSets->addElement(fAI, status); classNames.emplace_back("fAI");
2833     fSets->addElement(fAL, status); classNames.emplace_back("fAL");
2834     fSets->addElement(fHL, status); classNames.emplace_back("fHL");
2835     fSets->addElement(fID, status); classNames.emplace_back("fID");
2836     fSets->addElement(fRI, status); classNames.emplace_back("fRI");
2837     fSets->addElement(fSG, status); classNames.emplace_back("fSG");
2838     fSets->addElement(fEB, status); classNames.emplace_back("fEB");
2839     fSets->addElement(fEM, status); classNames.emplace_back("fEM");
2840     fSets->addElement(fZWJ, status); classNames.emplace_back("fZWJ");
2841     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2842     fSets->addElement(fOP30, status); classNames.emplace_back("fOP30");
2843     fSets->addElement(fCP30, status); classNames.emplace_back("fCP30");
2844     fSets->addElement(fExtPictUnassigned, status); classNames.emplace_back("fExtPictUnassigned");
2845     fSets->addElement(fAK, status); classNames.emplace_back("fAK");
2846     fSets->addElement(fAP, status); classNames.emplace_back("fAP");
2847     fSets->addElement(fAS, status); classNames.emplace_back("fAS");
2848     fSets->addElement(fVF, status); classNames.emplace_back("fVF");
2849     fSets->addElement(fVI, status); classNames.emplace_back("fVI");
2850 
2851 
2852     UnicodeString CMx {uR"([[\p{Line_Break=CM}]\u200d])"};
2853     UnicodeString rules;
2854     rules = rules + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?"
2855                   + u"((\\p{Line_Break=OP}|\\p{Line_Break=HY})(" + CMx + u")*)?"
2856                   + u"((\\p{Line_Break=IS})(" + CMx + u")*)?"
2857                   + u"\\p{Line_Break=NU}(" + CMx + u")*"
2858                   + u"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(" + CMx + u")*)*"
2859                   + u"((\\p{Line_Break=CL}|\\p{Line_Break=CP})(" + CMx + u")*)?"
2860                   + u"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(" + CMx + u")*)?";
2861 
2862     fNumberMatcher = new RegexMatcher(rules, 0, status);
2863 
2864     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2865 
2866     if (U_FAILURE(status)) {
2867         deferredStatus = status;
2868     }
2869 
2870 }
2871 
2872 
setText(const UnicodeString & s)2873 void RBBILineMonkey::setText(const UnicodeString &s) {
2874     fText       = &s;
2875     fCharBI->setText(s);
2876     prepareAppliedRules(s.length());
2877     fNumberMatcher->reset(s);
2878 }
2879 
2880 //
2881 //  rule9Adjust
2882 //     Line Break TR rules 9 and 10 implementation.
2883 //     This deals with combining marks and other sequences that
2884 //     that must be treated as if they were something other than what they actually are.
2885 //
2886 //     This is factored out into a separate function because it must be applied twice for
2887 //     each potential break, once to the chars before the position being checked, then
2888 //     again to the text following the possible break.
2889 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)2890 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2891     if (pos == -1) {
2892         // Invalid initial position.  Happens during the warmup iteration of the
2893         //   main loop in next().
2894         return;
2895     }
2896 
2897     int32_t  nPos = *nextPos;
2898 
2899     // LB 9  Keep combining sequences together.
2900     // advance over any CM class chars.  Note that Line Break CM is different
2901     // from the normal Grapheme Extend property.
2902     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2903           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2904         for (;;) {
2905             *nextChar = fText->char32At(nPos);
2906             if (!fCM->contains(*nextChar)) {
2907                 break;
2908             }
2909             nPos = fText->moveIndex32(nPos, 1);
2910         }
2911     }
2912 
2913 
2914     // LB 9 Treat X CM* as if it were x.
2915     //       No explicit action required.
2916 
2917     // LB 10  Treat any remaining combining mark as AL
2918     if (fCM->contains(*posChar)) {
2919         *posChar = u'A';
2920     }
2921 
2922     // Push the updated nextPos and nextChar back to our caller.
2923     // This only makes a difference if posChar got bigger by consuming a
2924     // combining sequence.
2925     *nextPos  = nPos;
2926     *nextChar = fText->char32At(nPos);
2927 }
2928 
2929 
2930 
//
//  next
//     Reference line-break finder used by the monkey test.  Starting at startPos it
//     examines successive candidate positions in fText and tests the line break rules
//     below in the order they appear; the first rule that matches decides the position:
//       - `break`    leaves the loop; "pos" is returned as the break position.
//       - `continue` means no break at this position; move on to the next one.
//     Each decision is reported through setAppliedRule(pos, <rule description>).
//
//     Returns the break position, or -1 if deferredStatus holds a failure or
//     startPos is at or beyond the end of the text.
//
next(int32_t startPos)2931 int32_t RBBILineMonkey::next(int32_t startPos) {
2932     UErrorCode status = U_ZERO_ERROR;
2933     int32_t    pos;       //  Index of the char following a potential break position
2934     UChar32    thisChar;  //  Character at above position "pos"
2935 
2936     int32_t    prevPos;   //  Index of the char preceding a potential break position
2937     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2938                           //   and thisChar may not be adjacent because combining
2939                           //   characters between them will be ignored.
2940 
2941     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2942     UChar32    prevCharX2;
2943 
2944     int32_t    nextPos;   //  Index of the next character following pos.
2945                           //     Usually skips over combining marks.
2946     int32_t    nextCPPos; //  Index of the code point following "pos."
2947                           //     May point to a combining mark.
2948     int32_t    tPos;      //  temp value.
2949     UChar32    c;
2950 
2951     if (U_FAILURE(deferredStatus)) {
2952         return -1;
2953     }
2954 
2955     if (startPos >= fText->length()) {
2956         return -1;
2957     }
2958 
2959 
2960     // Initial values for loop.  Loop will run the first time without finding breaks,
2961     //                           while the invalid values shift out and the "this" and
2962     //                           "prev" positions are filled in with good values.
2963     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2964     thisChar = prevChar  = prevCharX2 = 0;
2965     nextPos  = nextCPPos = startPos;
2966 
2967 
2968     // Loop runs once per position in the test text, until a break position
2969     //  is found.
2970     for (;;) {
             // Shift the context window forward by one position:
             //   X2 <- prev <- this <- next.
2971         prevPosX2 = prevPos;
2972         prevCharX2 = prevChar;
2973 
2974         prevPos   = pos;
2975         prevChar  = thisChar;
2976 
2977         pos       = nextPos;
2978         thisChar  = fText->char32At(pos);
2979 
2980         nextCPPos = fText->moveIndex32(pos, 1);
2981         nextPos   = nextCPPos;
2982 
2983 
2984         if (pos >= fText->length()) {
2985             setAppliedRule(pos, "LB2 - Break at end of text.");
2986             break;
2987         }
2988 
2989 
             // LB 9 / LB 10 adjustment (see rule9Adjust), applied first to the text before
             // the candidate break and then to the text after it.
2990         //             We do this one out-of-order because the adjustment does not change anything
2991         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2992         //             be applied.
2993         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
             // The call above may have advanced pos, so recompute what follows it.
2994         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2995         c = fText->char32At(nextPos);
2996         rule9Adjust(pos, &thisChar, &nextPos, &c);
2997 
2998         // If the loop is still warming up - if we haven't shifted the initial
2999         //   -1 positions out of prevPos yet - loop back to advance the
3000         //    position in the input without any further looking for breaks.
3001         if (prevPos == -1) {
3002           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
3003             continue;
3004         }
3005 
3006 
3007         if (fBK->contains(prevChar)) {
3008             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
3009             break;
3010         }
3011 
3012 
3013         if (prevChar == 0x0d && thisChar == 0x0a) {
3014             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3015             continue;
3016         }
3017         if (prevChar == 0x0d ||
3018             prevChar == 0x0a ||
3019             prevChar == 0x85)  {
3020             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3021             break;
3022         }
3023 
3024 
3025         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3026             fBK->contains(thisChar)) {
3027             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
3028             continue;
3029         }
3030 
3031 
3032         if (fSP->contains(thisChar)) {
3033             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3034             continue;
3035         }
3036 
3037         // !!! ??? Is this the right text for the applied rule?
3038         if (fZW->contains(thisChar)) {
3039             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3040             continue;
3041         }
3042 
3043 
3044         //       ZW SP* ÷
3045         //       Scan backwards from prevChar for SP* ZW
3046         tPos = prevPos;
3047         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3048             tPos = fText->moveIndex32(tPos, -1);
3049         }
3050         if (fZW->contains(fText->char32At(tPos))) {
3051             setAppliedRule(pos, "LB 8  Break after zero width space");
3052             break;
3053         }
3054 
3055 
3056         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
3057         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
3058         if (fNumberMatcher->lookingAt(prevPos, status)) {
                 // NOTE(review): this reports a break when status holds a failure even
                 //   though lookingAt() returned true - confirm that is the intent.
3059             if (U_FAILURE(status)) {
3060                 setAppliedRule(pos, "LB 25 Numbers");
3061                 break;
3062             }
3063             // Matched a number.  But could have been just a single digit, which would
3064             //    not represent a "no break here" between prevChar and thisChar
3065             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3066             if (numEndIdx > pos) {
3067                 // Number match includes at least our two chars being checked
3068                 if (numEndIdx > nextPos) {
3069                     // Number match includes additional chars.  Update pos and nextPos
3070                     //   so that next loop iteration will continue at the end of the number,
3071                     //   checking for breaks between last char in number & whatever follows.
3072                     pos = nextPos = numEndIdx;
                         // Back up from the end of the match to the last character
                         //   of the number that is not a CM.
3073                     do {
3074                         pos = fText->moveIndex32(pos, -1);
3075                         thisChar = fText->char32At(pos);
3076                     } while (fCM->contains(thisChar));
3077                 }
3078                 setAppliedRule(pos, "LB 25 Numbers");
3079                 continue;
3080             }
3081         }
3082 
3083 
3084         //       The monkey test's way of ignoring combining characters doesn't work
3085         //       for this rule. ZJ is also a CM. Need to get the actual character
3086         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3087         {
3088             int32_t prevIdx = fText->moveIndex32(pos, -1);
3089             UChar32 prevC = fText->char32At(prevIdx);
3090             if (fZWJ->contains(prevC)) {
3091                 setAppliedRule(pos, "LB 8a ZWJ x");
3092                 continue;
3093             }
3094         }
3095 
3096 
3097         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3098         //
3099 
3100 
3101         //    x  WJ
3102         //    WJ  x
3103         //
3104         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3105             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3106             continue;
3107         }
3108 
3109 
3110         if (fGL->contains(prevChar)) {
3111             setAppliedRule(pos, "LB 12  GL  x");
3112             continue;
3113         }
3114 
3115 
3116           if (!(fSP->contains(prevChar) ||
3117               fBA->contains(prevChar) ||
3118               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3119               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3120               continue;
3121         }
3122 
3123 
3124         if (fCL->contains(thisChar) ||
3125                 fCP->contains(thisChar) ||
3126                 fEX->contains(thisChar) ||
3127                 fSY->contains(thisChar)) {
3128             setAppliedRule(pos, "LB 13  Don't break before closings.");
3129             continue;
3130         }
3131 
3132 
3133         //       Scan backwards, checking for this sequence.
3134         //       The OP char could include combining marks, so we actually check for
3135         //           OP CM* SP*
3136         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3137         //       sequence into a ID char, so before scanning back through spaces,
3138         //       verify that prevChar is indeed a space.  The prevChar variable
3139         //       may differ from fText[prevPos]
3140         tPos = prevPos;
3141         if (fSP->contains(prevChar)) {
3142             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3143                 tPos=fText->moveIndex32(tPos, -1);
3144             }
3145         }
3146         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3147             tPos=fText->moveIndex32(tPos, -1);
3148         }
3149         if (fOP->contains(fText->char32At(tPos))) {
3150             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3151             continue;
3152         }
3153 
3154         // Same as LB 14, scan backward for
3155         // (sot | BK | CR | LF | NL | OP CM*| QU CM* | GL CM* | SP) [\p{Pi}&QU] CM* SP*.
3156         tPos = prevPos;
3157         // SP* (with the aforementioned Twist).
3158         if (fSP->contains(prevChar)) {
3159             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3160                 tPos = fText->moveIndex32(tPos, -1);
3161             }
3162         }
3163         // CM*.
3164         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3165             tPos = fText->moveIndex32(tPos, -1);
3166         }
3167         // [\p{Pi}&QU].
3168         if (fPi->contains(fText->char32At(tPos)) && fQU->contains(fText->char32At(tPos))) {
3169             if (tPos == 0) {
3170                 setAppliedRule(pos, "LB 15a sot [\\p{Pi}&QU] SP* x");
3171                 continue;
3172             } else {
                     // Look at the character just before the initial quotation mark.
3173                 tPos = fText->moveIndex32(tPos, -1);
3174                 if (fBK->contains(fText->char32At(tPos)) || fCR->contains(fText->char32At(tPos)) ||
3175                     fLF->contains(fText->char32At(tPos)) || fNL->contains(fText->char32At(tPos)) ||
3176                     fSP->contains(fText->char32At(tPos)) || fZW->contains(fText->char32At(tPos))) {
3177                     setAppliedRule(pos, "LB 15a (BK | CR | LF | NL | SP | ZW) [\\p{Pi}&QU] SP* x");
3178                     continue;
3179                 }
3180             }
3181             // CM*.
3182             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3183                 tPos = fText->moveIndex32(tPos, -1);
3184             }
3185             if (fOP->contains(fText->char32At(tPos)) || fQU->contains(fText->char32At(tPos)) ||
3186                 fGL->contains(fText->char32At(tPos))) {
3187                 setAppliedRule(pos, "LB 15a (OP | QU | GL) [\\p{Pi}&QU] SP* x");
3188                 continue;
3189             }
3190         }
3191 
3192         if (fPf->contains(thisChar) && fQU->contains(thisChar)) {
3193             UChar32 nextChar = fText->char32At(nextPos);
3194             if (nextPos == fText->length() || fSP->contains(nextChar) || fGL->contains(nextChar) ||
3195                 fWJ->contains(nextChar) || fCL->contains(nextChar) || fQU->contains(nextChar) ||
3196                 fCP->contains(nextChar) || fEX->contains(nextChar) || fIS->contains(nextChar) ||
3197                 fSY->contains(nextChar) || fBK->contains(nextChar) || fCR->contains(nextChar) ||
3198                 fLF->contains(nextChar) || fNL->contains(nextChar) || fZW->contains(nextChar)) {
3199                 setAppliedRule(pos, "LB 15b x [\\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS | SY "
3200                                     "| BK | CR | LF | NL | ZW | eot)");
3201                 continue;
3202             }
3203         }
3204 
3205         if (nextPos < fText->length()) {
3206             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3207             //       from a legit ffff noncharacter. So test length separately.
3208             UChar32 nextChar = fText->char32At(nextPos);
3209             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3210                 setAppliedRule(pos,
3211                                "LB 15c Break before an IS that begins a number and follows a space");
3212                 break;
3213             }
3214         }
3215 
3216         if (fIS->contains(thisChar)) {
3217             setAppliedRule(pos, "LB 15d  Do not break before numeric separators, even after spaces.");
3218             continue;
3219         }
3220 
3221         //    Scan backwards for SP* CM* (CL | CP)
3222         if (fNS->contains(thisChar)) {
                 // NOTE: this declaration shadows the function-level tPos.
3223             int tPos = prevPos;
3224             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3225                 tPos = fText->moveIndex32(tPos, -1);
3226             }
3227             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3228                 tPos = fText->moveIndex32(tPos, -1);
3229             }
3230             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3231                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3232                 continue;
3233             }
3234         }
3235 
3236 
3237         if (fB2->contains(thisChar)) {
3238             //  Scan backwards, checking for the B2 CM* SP* sequence.
3239             tPos = prevPos;
3240             if (fSP->contains(prevChar)) {
3241                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3242                     tPos=fText->moveIndex32(tPos, -1);
3243                 }
3244             }
3245             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3246                 tPos=fText->moveIndex32(tPos, -1);
3247             }
3248             if (fB2->contains(fText->char32At(tPos))) {
3249                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3250                 continue;
3251             }
3252         }
3253 
3254 
3255         if (fSP->contains(prevChar)) {
3256             setAppliedRule(pos, "LB 18    break after space");
3257             break;
3258         }
3259 
3260         //    x   QU
3261         //    QU  x
3262         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3263             setAppliedRule(pos, "LB 19");
3264             continue;
3265         }
3266 
3267         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3268             setAppliedRule(pos, "LB 20  Break around a CB");
3269             break;
3270         }
3271 
3272         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3273         //           Formerly this was a Finnish tailoring.
3274         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3275         //           ^($HY | $HH) $AL;
             //           prevPosX2 starts at -1 and takes the old prevPos on every iteration,
             //           so it is still -1 here only when prevChar is the first character
             //           examined by this call.
3276         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3277                 prevPosX2 == -1) {
3278             setAppliedRule(pos, "LB 20.09");
3279             continue;
3280         }
3281 
3282         if (fBA->contains(thisChar) ||
3283             fHY->contains(thisChar) ||
3284             fNS->contains(thisChar) ||
3285             fBB->contains(prevChar) )   {
3286             setAppliedRule(pos, "LB 21");
3287             continue;
3288         }
3289 
3290         if (fHL->contains(prevCharX2) &&
3291                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3292             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3293             continue;
3294         }
3295 
3296         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3297             setAppliedRule(pos, "LB 21b SY x HL");
3298             continue;
3299         }
3300 
3301         if (fIN->contains(thisChar))   {
3302             setAppliedRule(pos, "LB 22");
3303             continue;
3304         }
3305 
3306 
3307         //          (AL | HL) x NU
3308         //          NU x (AL | HL)
3309         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3310             setAppliedRule(pos, "LB 23");
3311             continue;
3312         }
3313         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3314             setAppliedRule(pos, "LB 23");
3315             continue;
3316         }
3317 
3318         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3319         //      PR x (ID | EB | EM)
3320         //     (ID | EB | EM) x PO
3321         if (fPR->contains(prevChar) &&
3322                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3323             setAppliedRule(pos, "LB 23a");
3324             continue;
3325         }
3326         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3327                 fPO->contains(thisChar)) {
3328             setAppliedRule(pos, "LB 23a");
3329             continue;
3330         }
3331 
3332         //   Do not break between prefix and letters or ideographs.
3333         //         (PR | PO) x (AL | HL)
3334         //         (AL | HL) x (PR | PO)
3335         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3336                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3337             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3338             continue;
3339         }
3340         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3341                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3342             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3343             continue;
3344         }
3345 
3346         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3347 
3348         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3349                                         fJV->contains(thisChar) ||
3350                                         fH2->contains(thisChar) ||
3351                                         fH3->contains(thisChar))) {
3352             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3353             continue;
3354                                         }
3355 
3356         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3357             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3358             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3359             continue;
3360         }
3361 
3362         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3363             fJT->contains(thisChar)) {
3364             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3365             continue;
3366         }
3367 
3368         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3369             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3370             fPO->contains(thisChar)) {
3371             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3372             continue;
3373         }
3374         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3375             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3376             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3377             continue;
3378         }
3379 
3380 
3381         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3382             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3383             continue;
3384         }
3385 
3386         if (fAP->contains(prevChar) &&
3387             (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar))) {
3388             setAppliedRule(pos, "LB 28a.1  AP x (AK | ◌ | AS)");
3389             continue;
3390         }
3391 
3392         if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) &&
3393             (fVF->contains(thisChar) || fVI->contains(thisChar))) {
3394             setAppliedRule(pos, "LB 28a.2  (AK | ◌ | AS) x (VF | VI)");
3395             continue;
3396         }
3397 
3398         if ((fAK->contains(prevCharX2) || prevCharX2 == U'◌' || fAS->contains(prevCharX2)) &&
3399             fVI->contains(prevChar) &&
3400             (fAK->contains(thisChar) || thisChar == U'◌')) {
3401             setAppliedRule(pos, "LB 28a.3  (AK | ◌ | AS) VI x (AK | ◌)");
3402             continue;
3403         }
3404 
3405         if (nextPos < fText->length()) {
3406             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3407             //       from a legit ffff noncharacter. So test length separately.
3408             UChar32 nextChar = fText->char32At(nextPos);
3409             if ((fAK->contains(prevChar) || prevChar == U'◌' || fAS->contains(prevChar)) &&
3410                 (fAK->contains(thisChar) || thisChar == U'◌' || fAS->contains(thisChar)) &&
3411                 fVF->contains(nextChar)) {
3412                 setAppliedRule(pos, "LB 28a.4  (AK | ◌ | AS) x (AK | ◌ | AS) VF");
3413                 continue;
3414             }
3415         }
3416 
3417         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3418             setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3419             continue;
3420         }
3421 
3422         //          (AL | NU) x OP
3423         //          CP x (AL | NU)
3424         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3425             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3426             continue;
3427         }
3428         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3429             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3430             continue;
3431         }
3432 
3433         //             RI  x  RI
             //             Three RIs in a row: the first two are a pair, so break before the third.
3434         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3435             setAppliedRule(pos, "LB30a    RI RI  :  RI");
3436             break;
3437         }
3438         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3439             // Two Regional Indicators have been paired.
3440             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3441             // following RI. This is a hack.
3442             thisChar = -1;
3443             setAppliedRule(pos, "LB30a    RI RI  :  RI");
3444             continue;
3445         }
3446 
3447         // LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
3448         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3449             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3450             continue;
3451         }
3452 
3453         if (fExtPictUnassigned->contains(prevChar) && fEM->contains(thisChar)) {
3454             setAppliedRule(pos, "LB30b    [\\p{Extended_Pictographic}&\\p{Cn}] x EM");
3455             continue;
3456         }
3457 
             // No earlier rule matched.
3458         setAppliedRule(pos, "LB 31    Break everywhere else");
3459         break;
3460     }
3461 
3462     return pos;
3463 }
3464 
3465 
charClasses()3466 UVector  *RBBILineMonkey::charClasses() {
3467     return fSets;
3468 }
3469 
3470 
~RBBILineMonkey()3471 RBBILineMonkey::~RBBILineMonkey() {
3472     delete fSets;
3473 
3474     delete fBK;
3475     delete fCR;
3476     delete fLF;
3477     delete fCM;
3478     delete fNL;
3479     delete fWJ;
3480     delete fZW;
3481     delete fGL;
3482     delete fCB;
3483     delete fSP;
3484     delete fB2;
3485     delete fBA;
3486     delete fBB;
3487     delete fHH;
3488     delete fHY;
3489     delete fH2;
3490     delete fH3;
3491     delete fCL;
3492     delete fCP;
3493     delete fEX;
3494     delete fIN;
3495     delete fJL;
3496     delete fJV;
3497     delete fJT;
3498     delete fNS;
3499     delete fOP;
3500     delete fQU;
3501     delete fIS;
3502     delete fNU;
3503     delete fPO;
3504     delete fPR;
3505     delete fSY;
3506     delete fAI;
3507     delete fAL;
3508     delete fCJ;
3509     delete fHL;
3510     delete fID;
3511     delete fRI;
3512     delete fSG;
3513     delete fXX;
3514     delete fEB;
3515     delete fEM;
3516     delete fZWJ;
3517     delete fOP30;
3518     delete fCP30;
3519     delete fExtPictUnassigned;
3520     delete fAK;
3521     delete fAP;
3522     delete fAS;
3523     delete fVF;
3524     delete fVI;
3525     delete fPi;
3526     delete fPf;
3527 
3528     delete fCharBI;
3529     delete fNumberMatcher;
3530 }
3531 
3532 
3533 //-------------------------------------------------------------------------------------------
3534 //
3535 //   TestMonkey
3536 //
3537 //     params
3538 //       seed=nnnnn        Random number starting seed.
3539 //                         Setting the seed allows errors to be reproduced.
3540 //       loop=nnn          Looping count.  Controls running time.
3541 //                         -1:  run forever.
3542 //                          0 or greater:  run length.
3543 //
3544 //       type = char | word | line | sent | title
3545 //
3546 //       export = (path)   Export test cases to (path)_(type).txt in the UCD
3547 //                         test case format.
3548 //
3549 //  Example:
3550 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3551 //
3552 //-------------------------------------------------------------------------------------------
3553 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3554 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3555     int32_t val = defaultVal;
3556     name.append(" *= *(-?\\d+)");
3557     UErrorCode status = U_ZERO_ERROR;
3558     RegexMatcher m(name, params, 0, status);
3559     if (m.find()) {
3560         // The param exists.  Convert the string to an int.
3561         char valString[100];
3562         int32_t paramLength = m.end(1, status) - m.start(1, status);
3563         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3564             paramLength = (int32_t)(sizeof(valString)-2);
3565         }
3566         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3567         val = strtol(valString, nullptr, 10);
3568 
3569         // Delete this parameter from the params string.
3570         m.reset();
3571         params = m.replaceFirst("", status);
3572     }
3573     U_ASSERT(U_SUCCESS(status));
3574     return val;
3575 }
3576 #endif
3577 
3578 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3579 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3580                                     BreakIterator *bi,
3581                                     int expected[],
3582                                     int expectedcount)
3583 {
3584     int count = 0;
3585     int i = 0;
3586     int forward[50];
3587     bi->setText(ustr);
3588     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3589         forward[count] = i;
3590         if (count < expectedcount && expected[count] != i) {
3591             test->errln("%s:%d break forward test failed: expected %d but got %d",
3592                         __FILE__, __LINE__, expected[count], i);
3593             break;
3594         }
3595         count ++;
3596     }
3597     if (count != expectedcount) {
3598         printStringBreaks(ustr, expected, expectedcount);
3599         test->errln("%s:%d break forward test failed: missed %d match",
3600                     __FILE__, __LINE__, expectedcount - count);
3601         return;
3602     }
3603     // testing boundaries
3604     for (i = 1; i < expectedcount; i ++) {
3605         int j = expected[i - 1];
3606         if (!bi->isBoundary(j)) {
3607             printStringBreaks(ustr, expected, expectedcount);
3608             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3609                     __FILE__, __LINE__, j);
3610             return;
3611         }
3612         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3613             if (bi->isBoundary(j)) {
3614                 printStringBreaks(ustr, expected, expectedcount);
3615                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3616                     __FILE__, __LINE__, j);
3617                 return;
3618             }
3619         }
3620     }
3621 
3622     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3623         count --;
3624         if (forward[count] != i) {
3625             printStringBreaks(ustr, expected, expectedcount);
3626             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3627                         __FILE__, __LINE__, forward[count], i);
3628             break;
3629         }
3630     }
3631     if (count != 0) {
3632         printStringBreaks(ustr, expected, expectedcount);
3633         test->errln("break test previous() failed: missed a match");
3634         return;
3635     }
3636 
3637     // testing preceding
3638     for (i = 0; i < expectedcount - 1; i ++) {
3639         // int j = expected[i] + 1;
3640         int j = ustr.moveIndex32(expected[i], 1);
3641         for (; j <= expected[i + 1]; j ++) {
3642             int32_t expectedPreceding = expected[i];
3643             int32_t actualPreceding = bi->preceding(j);
3644             if (actualPreceding != expectedPreceding) {
3645                 printStringBreaks(ustr, expected, expectedcount);
3646                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3647                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3648                 return;
3649             }
3650         }
3651     }
3652 }
3653 #endif
3654 
TestWordBreaks()3655 void RBBITest::TestWordBreaks()
3656 {
3657 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3658 
3659     Locale        locale("en");
3660     UErrorCode    status = U_ZERO_ERROR;
3661     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3662     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3663     // Replaced any C+J characters in a row with a random sequence of characters
3664     // of the same length to make our C+J segmentation not get in the way.
3665     static const char *strlist[] =
3666     {
3667     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3668     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3669     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3670     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3671     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3672     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3673     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3674     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3675     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3676     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3677     "\\u2027\\U000e0067\\u0a47\\u00b7",
3678     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3679     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3680     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3681     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3682     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3683     "\\u0027\\u11af\\U000e0057\\u0602",
3684     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3685     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3686     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3687     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3688     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3689     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3690     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3691     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3692     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3693     "\\u18f4\\U000e0049\\u20e7\\u2027",
3694     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3695     "\\ua183\\u102d\\u0bec\\u003a",
3696     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3697     "\\u003a\\u0e57\\u0fad\\u002e",
3698     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3699     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3700     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3701     "\\u003a\\u0664\\u00b7\\u1fba",
3702     "\\u003b\\u0027\\u00b7\\u47a3",
3703     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3704     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3705     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3706     };
3707     int loop;
3708     if (U_FAILURE(status)) {
3709         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3710         return;
3711     }
3712     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3713         // printf("looping %d\n", loop);
3714         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3715         // RBBICharMonkey monkey;
3716         RBBIWordMonkey monkey;
3717 
3718         int expected[50];
3719         int expectedcount = 0;
3720 
3721         monkey.setText(ustr);
3722         int i;
3723         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3724             expected[expectedcount ++] = i;
3725         }
3726 
3727         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3728     }
3729     delete bi;
3730 #endif
3731 }
3732 
TestWordBoundary()3733 void RBBITest::TestWordBoundary()
3734 {
3735     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3736     Locale        locale("en");
3737     UErrorCode    status = U_ZERO_ERROR;
3738     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3739     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3740     if (U_FAILURE(status)) {
3741         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3742                 __FILE__, __LINE__, u_errorName(status));
3743         return;
3744     }
3745     char16_t      str[50];
3746     static const char *strlist[] =
3747     {
3748     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3749     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3750     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3751     "\\u2027\\U000e0067\\u0a47\\u00b7",
3752     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3753     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3754     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3755     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3756     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3757     "\\u0027\\u11af\\U000e0057\\u0602",
3758     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3759     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3760     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3761     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3762     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3763     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3764     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3765     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3766     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3767     "\\u58f4\\U000e0049\\u20e7\\u2027",
3768     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3769     "\\ua183\\u102d\\u0bec\\u003a",
3770     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3771     "\\u003a\\u0e57\\u0fad\\u002e",
3772     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3773     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3774     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3775     "\\u003a\\u0664\\u00b7\\u1fba",
3776     "\\u003b\\u0027\\u00b7\\u47a3",
3777     };
3778     int loop;
3779     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3780         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3781         UnicodeString ustr(str);
3782         int forward[50];
3783         int count = 0;
3784 
3785         bi->setText(ustr);
3786         int prev = -1;
3787         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3788             ++count;
3789             if (count >= UPRV_LENGTHOF(forward)) {
3790                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3791                         __FILE__, __LINE__, loop, count, boundary);
3792                 return;
3793             }
3794             forward[count] = boundary;
3795             if (boundary <= prev) {
3796                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3797                         __FILE__, __LINE__, loop, prev, boundary);
3798                 break;
3799             }
3800             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3801                 if (bi->isBoundary(nonBoundary)) {
3802                     printStringBreaks(ustr, forward, count);
3803                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3804                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3805                     return;
3806                 }
3807             }
3808             if (!bi->isBoundary(boundary)) {
3809                 printStringBreaks(ustr, forward, count);
3810                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3811                        __FILE__, __LINE__, boundary);
3812                 return;
3813             }
3814             prev = boundary;
3815         }
3816     }
3817 }
3818 
TestLineBreaks()3819 void RBBITest::TestLineBreaks()
3820 {
3821 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3822     Locale        locale("en");
3823     UErrorCode    status = U_ZERO_ERROR;
3824     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3825     const int32_t  STRSIZE = 50;
3826     char16_t      str[STRSIZE];
3827     static const char *strlist[] =
3828     {
3829      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3830      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3831              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3832      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3833              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3834      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3835      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3836      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3837      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3838      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3839      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3840      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3841      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3842      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3843      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3844      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3845      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3846      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3847      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3848      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3849      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3850      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3851      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3852      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3853      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3854      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3855      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3856      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3857      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3858      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3859      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3860      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3861      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3862      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3863      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3864      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3865      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3866      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3867      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3868          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3869     };
3870     int loop;
3871     TEST_ASSERT_SUCCESS(status);
3872     if (U_FAILURE(status)) {
3873         return;
3874     }
3875     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3876         // printf("looping %d\n", loop);
3877         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3878         if (t >= STRSIZE) {
3879             TEST_ASSERT(false);
3880             continue;
3881         }
3882 
3883 
3884         UnicodeString ustr(str);
3885         RBBILineMonkey monkey;
3886         if (U_FAILURE(monkey.deferredStatus)) {
3887             continue;
3888         }
3889 
3890         const int EXPECTEDSIZE = 50;
3891         int expected[EXPECTEDSIZE];
3892         int expectedcount = 0;
3893 
3894         monkey.setText(ustr);
3895 
3896         int i;
3897         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3898             if (expectedcount >= EXPECTEDSIZE) {
3899                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3900                 return;
3901             }
3902             expected[expectedcount ++] = i;
3903         }
3904 
3905         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3906     }
3907     delete bi;
3908 #endif
3909 }
3910 
TestSentBreaks()3911 void RBBITest::TestSentBreaks()
3912 {
3913 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3914     Locale        locale("en");
3915     UErrorCode    status = U_ZERO_ERROR;
3916     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3917     char16_t      str[200];
3918     static const char *strlist[] =
3919     {
3920      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3921      "This\n",
3922      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3923      "\"Sentence ending with a quote.\" Bye.",
3924      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3925      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3926      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3927      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3928      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3929      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3930      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3931              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3932              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3933              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3934      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3935              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3936              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3937              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3938              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3939              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3940     };
3941     int loop;
3942     if (U_FAILURE(status)) {
3943         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3944         return;
3945     }
3946     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3947         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3948         UnicodeString ustr(str);
3949 
3950         RBBISentMonkey monkey;
3951         if (U_FAILURE(monkey.deferredStatus)) {
3952             continue;
3953         }
3954 
3955         const int EXPECTEDSIZE = 50;
3956         int expected[EXPECTEDSIZE];
3957         int expectedcount = 0;
3958 
3959         monkey.setText(ustr);
3960 
3961         int i;
3962         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3963             if (expectedcount >= EXPECTEDSIZE) {
3964                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3965                 return;
3966             }
3967             expected[expectedcount ++] = i;
3968         }
3969 
3970         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3971     }
3972     delete bi;
3973 #endif
3974 }
3975 
TestMonkey()3976 void RBBITest::TestMonkey() {
3977 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3978 
3979     UErrorCode     status    = U_ZERO_ERROR;
3980     int32_t        loopCount = 500;
3981     int32_t        seed      = 1;
3982     UnicodeString  breakType = "all";
3983     Locale         locale("en");
3984     UBool          useUText  = false;
3985     UBool          scalarsOnly = false;
3986     std::string    exportPath;
3987 
3988     if (quick == false) {
3989         loopCount = 10000;
3990     }
3991 
3992     if (fTestParams) {
3993         UnicodeString p(fTestParams);
3994         loopCount = getIntParam("loop", p, loopCount);
3995         seed      = getIntParam("seed", p, seed);
3996 
3997         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3998         if (m.find()) {
3999             breakType = m.group(1, status);
4000             m.reset();
4001             p = m.replaceFirst("", status);
4002         }
4003 
4004         RegexMatcher u(" *utext", p, 0, status);
4005         if (u.find()) {
4006             useUText = true;
4007             u.reset();
4008             p = u.replaceFirst("", status);
4009         }
4010 
4011         RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status);
4012         if (pathMatcher.find()) {
4013             pathMatcher.group(1, status).toUTF8String(exportPath);
4014             pathMatcher.reset();
4015             p = pathMatcher.replaceFirst("", status);
4016         }
4017 
4018         RegexMatcher s(" *scalars_only", p, 0, status);
4019         if (s.find()) {
4020             scalarsOnly = true;
4021             s.reset();
4022             p = s.replaceFirst("", status);
4023         }
4024 
4025         // m.reset(p);
4026         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4027             // Each option is stripped out of the option string as it is processed.
4028             // All options have been checked.  The option string should have been completely emptied..
4029             char buf[100];
4030             p.extract(buf, sizeof(buf), nullptr, status);
4031             buf[sizeof(buf)-1] = 0;
4032             errln("Unrecognized or extra parameter:  %s\n", buf);
4033             return;
4034         }
4035 
4036     }
4037 
4038     if (breakType == "char" || breakType == "all") {
4039         FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_char.txt").c_str(), "w");
4040         RBBICharMonkey  m;
4041         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4042         if (U_SUCCESS(status)) {
4043             RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly);
4044             if (breakType == "all" && useUText==false) {
4045                 // Also run a quick test with UText when "all" is specified
4046                 RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly);
4047             }
4048         }
4049         else {
4050             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4051         }
4052         delete bi;
4053         if (file != nullptr) {
4054             fclose(file);
4055         }
4056     }
4057 
4058     if (breakType == "word" || breakType == "all") {
4059         logln("Word Break Monkey Test");
4060         FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w");
4061         RBBIWordMonkey  m;
4062         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4063         if (U_SUCCESS(status)) {
4064             RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly);
4065         }
4066         else {
4067             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4068         }
4069         delete bi;
4070         if (file != nullptr) {
4071             fclose(file);
4072         }
4073     }
4074 
4075     if (breakType == "line" || breakType == "all") {
4076         logln("Line Break Monkey Test");
4077         FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w");
4078         RBBILineMonkey  m;
4079         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4080         if (loopCount >= 10) {
4081             loopCount = loopCount / 5;   // Line break runs slower than the others.
4082         }
4083         if (U_SUCCESS(status)) {
4084             RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly);
4085         }
4086         else {
4087             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4088         }
4089         delete bi;
4090         if (file != nullptr) {
4091             fclose(file);
4092         }
4093     }
4094 
4095     if (breakType == "sent" || breakType == "all"  ) {
4096         logln("Sentence Break Monkey Test");
4097         FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w");
4098         RBBISentMonkey  m;
4099         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4100         if (loopCount >= 10) {
4101             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4102         }
4103         if (U_SUCCESS(status)) {
4104             RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly);
4105         }
4106         else {
4107             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4108         }
4109         delete bi;
4110         if (file != nullptr) {
4111             fclose(file);
4112         }
4113     }
4114 
4115 #endif
4116 }
4117 
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi          - the break iterator to use
//       mk          - MonkeyKind, abstraction for obtaining expected results
//       name        - Name of test (char, word, etc.) for use in error messages
//       seed        - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check.
//                     A value of -1 makes the test loop forever.
//       useUText    - If true, the test text is handed to the break iterator through
//                     a UText instead of directly as a UnicodeString.
//       exportFile  - Pointer to a file to which the test cases will be written in
//                     UCD format.  May be null.
//       scalarsOnly - Only test sequences of Unicode scalar values; if this is false,
//                     arbitrary sequences of code points (including unpaired surrogates)
//                     are tested.
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText,FILE * exportFile,UBool scalarsOnly)4132 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4133                          int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) {
4134 
4135 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4136 
4137     const int32_t    TESTSTRINGLEN = 500;
4138     UnicodeString    testText;
4139     int32_t          numCharClasses;
4140     UVector          *chClasses;
4141     int              expectedCount = 0;
4142     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4143     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4144     char             reverseBreaks[TESTSTRINGLEN*2+1];
4145     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4146     char             followingBreaks[TESTSTRINGLEN*2+1];
4147     char             precedingBreaks[TESTSTRINGLEN*2+1];
4148     int              i;
4149     int              loopCount = 0;
4150 
4151 
4152     m_seed = seed;
4153 
4154     numCharClasses = mk.charClasses()->size();
4155     chClasses      = mk.charClasses();
4156 
4157     // Check for errors that occurred during the construction of the MonkeyKind object.
4158     //  Can't report them where they occurred because errln() is a method coming from intlTest,
4159     //  and is not visible outside of RBBITest :-(
4160     if (U_FAILURE(mk.deferredStatus)) {
4161         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4162         return;
4163     }
4164 
4165     // Verify that the character classes all have at least one member.
4166     for (i=0; i<numCharClasses; i++) {
4167         UnicodeSet *s = static_cast<UnicodeSet *>(chClasses->elementAt(i));
4168         if (s == nullptr || s->size() == 0) {
4169             errln("Character Class #%d is null or of zero size.", i);
4170             return;
4171         }
4172     }
4173 
4174     // For minimizing width of class name output.
4175     int classNameSize = mk.maxClassNameSize();
4176 
4177     while (loopCount < numIterations || numIterations == -1) {
4178         if (numIterations == -1 && loopCount % 10 == 0) {
4179             // If test is running in an infinite loop, display a periodic tic so
4180             //   we can tell that it is making progress.
4181             fprintf(stderr, ".");
4182         }
4183         // Save current random number seed, so that we can recreate the random numbers
4184         //   for this loop iteration in event of an error.
4185         seed = m_seed;
4186 
4187         // Populate a test string with data.
4188         testText.truncate(0);
4189         for (i=0; i<TESTSTRINGLEN; i++) {
4190             int32_t  aClassNum = m_rand() % numCharClasses;
4191             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4192             int32_t   charIdx = m_rand() % classSet->size();
4193             UChar32   c = classSet->charAt(charIdx);
4194             if (c < 0) {   // TODO:  deal with sets containing strings.
4195                 errln("%s:%d c < 0", __FILE__, __LINE__);
4196                 break;
4197             }
4198             if (scalarsOnly && U16_IS_SURROGATE(c)) {
4199               continue;
4200             }
4201             // Do not assemble a supplementary character from randomly generated separate surrogates.
4202             //   (It could be a dictionary character)
4203             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4204                 continue;
4205             }
4206 
4207             testText.append(c);
4208         }
4209 
4210         // Calculate the expected results for this test string and reset applied rules.
4211         mk.setText(testText);
4212 
4213         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4214         expectedBreaks[0] = 1;
4215         int32_t breakPos = 0;
4216         expectedCount = 0;
4217         for (;;) {
4218             breakPos = mk.next(breakPos);
4219             if (breakPos == -1) {
4220                 break;
4221             }
4222             if (breakPos > testText.length()) {
4223                 errln("breakPos > testText.length()");
4224             }
4225             expectedBreaks[breakPos] = 1;
4226             expectedCount++;
4227             U_ASSERT(expectedCount<testText.length());
4228 	    (void)expectedCount;  // Used by U_ASSERT().
4229         }
4230 
4231         // Find the break positions using forward iteration
4232         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4233         if (useUText) {
4234             UErrorCode status = U_ZERO_ERROR;
4235             UText *testUText = utext_openReplaceable(nullptr, &testText, &status);
4236             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4237             bi->setText(testUText, status);
4238             TEST_ASSERT_SUCCESS(status);
4239             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4240                                       //  This UText can be closed immediately, so long as the
4241                                       //  testText string continues to exist.
4242         } else {
4243             bi->setText(testText);
4244         }
4245 
4246         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4247             if (i < 0 || i > testText.length()) {
4248                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4249                 break;
4250             }
4251             forwardBreaks[i] = 1;
4252         }
4253 
4254         // Find the break positions using reverse iteration
4255         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4256         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4257             if (i < 0 || i > testText.length()) {
4258                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4259                 break;
4260             }
4261             reverseBreaks[i] = 1;
4262         }
4263 
4264         // Find the break positions using isBoundary() tests.
4265         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4266         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4267         for (i=0; i<=testText.length(); i++) {
4268             isBoundaryBreaks[i] = bi->isBoundary(i);
4269         }
4270 
4271 
4272         // Find the break positions using the following() function.
4273         // printf(".");
4274         memset(followingBreaks, 0, sizeof(followingBreaks));
4275         int32_t   lastBreakPos = 0;
4276         followingBreaks[0] = 1;
4277         for (i=0; i<testText.length(); i++) {
4278             breakPos = bi->following(i);
4279             if (breakPos <= i ||
4280                 breakPos < lastBreakPos ||
4281                 breakPos > testText.length() ||
4282                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4283                 errln("%s break monkey test: "
4284                     "Out of range value returned by BreakIterator::following().\n"
4285                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4286                          name, seed, i, breakPos, lastBreakPos);
4287                 break;
4288             }
4289             followingBreaks[breakPos] = 1;
4290             lastBreakPos = breakPos;
4291         }
4292 
4293         // Find the break positions using the preceding() function.
4294         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4295         lastBreakPos = testText.length();
4296         precedingBreaks[testText.length()] = 1;
4297         for (i=testText.length(); i>0; i--) {
4298             breakPos = bi->preceding(i);
4299             if (breakPos >= i ||
4300                 breakPos > lastBreakPos ||
4301                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4302                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4303                 errln("%s break monkey test: "
4304                     "Out of range value returned by BreakIterator::preceding().\n"
4305                     "index=%d;  prev returned %d; lastBreak=%d" ,
4306                     name,  i, breakPos, lastBreakPos);
4307                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4308                     precedingBreaks[i] = 2;   // Forces an error.
4309                 }
4310             } else {
4311                 if (breakPos >= 0) {
4312                     precedingBreaks[breakPos] = 1;
4313                 }
4314                 lastBreakPos = breakPos;
4315             }
4316         }
4317 
4318         if (exportFile != nullptr) {
4319             for (i = 0; i < testText.length();) {
4320                 fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× ");
4321                 char32_t const c = testText.char32At(i);
4322                 fprintf(exportFile, "%04X ", static_cast<uint32_t>(c));
4323                 i += U16_LENGTH(c);
4324             }
4325             fprintf(exportFile, expectedBreaks[testText.length()] ? "÷  # ��\n" : "×  # ��\n");
4326         }
4327 
4328         // Compare the expected and actual results.
4329         for (i=0; i<=testText.length(); i++) {
4330             const char *errorType = nullptr;
4331             const char* currentBreakData = nullptr;
4332             if  (forwardBreaks[i] != expectedBreaks[i]) {
4333                 errorType = "next()";
4334                 currentBreakData = forwardBreaks;
4335             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4336                 errorType = "previous()";
4337                 currentBreakData = reverseBreaks;
4338            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4339                 errorType = "isBoundary()";
4340                 currentBreakData = isBoundaryBreaks;
4341             } else if (followingBreaks[i] != expectedBreaks[i]) {
4342                 errorType = "following()";
4343                 currentBreakData = followingBreaks;
4344             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4345                 errorType = "preceding()";
4346                 currentBreakData = precedingBreaks;
4347             }
4348 
4349             if (errorType != nullptr) {
4350                 // Format a range of the test text that includes the failure as
4351                 //  a data item that can be included in the rbbi test data file.
4352 
4353                 // Start of the range is the last point where expected and actual results
4354                 //  both agreed that there was a break position.
4355 
4356                 int startContext = i;
4357                 int32_t count = 0;
4358                 for (;;) {
4359                     if (startContext==0) { break; }
4360                     startContext --;
4361                     if (expectedBreaks[startContext] != 0) {
4362                         if (count == 2) break;
4363                         count ++;
4364                     }
4365                 }
4366 
4367                 // End of range is two expected breaks past the start position.
4368                 int endContext = i + 1;
4369                 int ci;
4370                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4371                     for (;;) {
4372                         if (endContext >= testText.length()) {break;}
4373                         if (expectedBreaks[endContext-1] != 0) {
4374                             if (count == 0) break;
4375                             count --;
4376                         }
4377                         endContext ++;
4378                     }
4379                 }
4380 
4381                 // Formatting of each line includes:
4382                 //   character code
4383                 //   reference break: '|' -> a break, '.' -> no break
4384                 //   actual break:    '|' -> a break, '.' -> no break
4385                 //   (name of character clase)
4386                 //   Unicode name of character
4387                 //   '-->' indicates location of the difference.
4388 
4389                 MONKEY_ERROR(
4390                     (expectedBreaks[i] ? "Break expected but not found" :
4391                        "Break found but not expected"),
4392                     name, i, seed);
4393 
4394                 for (ci = startContext;; (ci = testText.moveIndex32(ci, 1))) {
4395                     UChar32  c;
4396                     c = testText.char32At(ci);
4397 
4398                     std::string currentLineFlag = "   ";
4399                     if (ci == i) {
4400                         currentLineFlag = "-->";  // Error position
4401                     }
4402 
4403                     // BMP or SMP character in hex
4404                     char hexCodePoint[12];
4405                     std::string format = "    \\u%04x";
4406                     if (c >= 0x10000) {
4407                         format = "\\U%08x";
4408                     }
4409                     snprintf(hexCodePoint, sizeof(hexCodePoint), format.c_str(), c);
4410 
4411                     // Get the class name and character name for the character.
4412                     char cName[200];
4413                     UErrorCode status = U_ZERO_ERROR;
4414                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4415 
4416                     char buffer[200];
4417                     auto ret = snprintf(buffer, sizeof(buffer),
4418                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4419                              currentLineFlag.c_str(),
4420                              ci,
4421                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4422                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4423                              hexCodePoint,
4424                              classNameSize,
4425                              mk.classNameFromCodepoint(c).c_str(),
4426                              mk.getAppliedRule(ci).c_str(), cName);
4427                     (void)ret;
4428                     U_ASSERT(0 <= ret && ret < UPRV_LENGTHOF(buffer));
4429 
4430                     // Output the error
4431                     if (ci == i) {
4432                         errln(buffer);
4433                     } else {
4434                         infoln(buffer);
4435                     }
4436 
4437                     if (ci >= endContext) { break; }
4438                 }
4439                 break;
4440             }
4441         }
4442 
4443         loopCount++;
4444     }
4445 #endif
4446 }
4447 
4448 
4449 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4450 //             This test checks the initial patch,
4451 //             which is to just keep it from crashing.  Correct word boundaries
4452 //             await a proper fix to the dictionary code.
4453 //
TestBug5532()4454 void RBBITest::TestBug5532()  {
4455    // Text includes a mixture of Thai and Latin.
4456    const unsigned char utf8Data[] = {
4457            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4458            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4459            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4460            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4461            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4462            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4463            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4464            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4465            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4466            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4467            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4468 
4469     UErrorCode status = U_ZERO_ERROR;
4470     UText utext=UTEXT_INITIALIZER;
4471     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4472     TEST_ASSERT_SUCCESS(status);
4473 
4474     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4475     TEST_ASSERT_SUCCESS(status);
4476     if (U_SUCCESS(status)) {
4477         bi->setText(&utext, status);
4478         TEST_ASSERT_SUCCESS(status);
4479 
4480         int32_t breakCount = 0;
4481         int32_t previousBreak = -1;
4482         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4483             // For now, just make sure that the break iterator doesn't hang.
4484             TEST_ASSERT(previousBreak < bi->current());
4485             previousBreak = bi->current();
4486         }
4487         TEST_ASSERT(breakCount > 0);
4488     }
4489     delete bi;
4490     utext_close(&utext);
4491 }
4492 
4493 
TestBug9983()4494 void RBBITest::TestBug9983()  {
4495     UnicodeString text = UnicodeString("\\u002A"  // * Other
4496                                        "\\uFF65"  //   Other
4497                                        "\\u309C"  //   Katakana
4498                                        "\\uFF9F"  //   Extend
4499                                        "\\uFF65"  //   Other
4500                                        "\\u0020"  //   Other
4501                                        "\\u0000").unescape();
4502 
4503     UErrorCode status = U_ZERO_ERROR;
4504     LocalPointer<RuleBasedBreakIterator> brkiter(dynamic_cast<RuleBasedBreakIterator *>(
4505         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4506     TEST_ASSERT_SUCCESS(status);
4507     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(dynamic_cast<RuleBasedBreakIterator *>(
4508         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4509     TEST_ASSERT_SUCCESS(status);
4510     if (U_FAILURE(status)) {
4511         return;
4512     }
4513     int32_t offset, rstatus, iterationCount;
4514 
4515     brkiter->setText(text);
4516     brkiter->last();
4517     iterationCount = 0;
4518     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4519         iterationCount++;
4520         rstatus = brkiter->getRuleStatus();
4521         (void)rstatus;     // Suppress set but not used warning.
4522         if (iterationCount >= 10) {
4523            break;
4524         }
4525     }
4526     TEST_ASSERT(iterationCount == 6);
4527 
4528     brkiterPOSIX->setText(text);
4529     brkiterPOSIX->last();
4530     iterationCount = 0;
4531     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4532         iterationCount++;
4533         rstatus = brkiterPOSIX->getRuleStatus();
4534         (void)rstatus;     // Suppress set but not used warning.
4535         if (iterationCount >= 10) {
4536            break;
4537         }
4538     }
4539     TEST_ASSERT(iterationCount == 6);
4540 }
4541 
4542 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4543 //
TestBug7547()4544 void RBBITest::TestBug7547() {
4545     UnicodeString rules;
4546     UErrorCode status = U_ZERO_ERROR;
4547     UParseError parseError;
4548     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4549     if (status != U_BRK_RULE_SYNTAX) {
4550         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4551     }
4552     if (parseError.line != 1 || parseError.offset != 0) {
4553         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4554     }
4555 }
4556 
4557 
TestBug12797()4558 void RBBITest::TestBug12797() {
4559     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4560     UErrorCode status = U_ZERO_ERROR;
4561     UParseError parseError;
4562     RuleBasedBreakIterator bi(rules, parseError, status);
4563     if (U_FAILURE(status)) {
4564         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4565         return;
4566     }
4567     UnicodeString text = "abc";
4568     bi.setText(text);
4569     bi.first();
4570     int32_t boundary = bi.next();
4571     if (boundary != 3) {
4572         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4573     }
4574 }
4575 
TestBug12918()4576 void RBBITest::TestBug12918() {
4577     // This test triggers an assertion failure in dictbe.cpp
4578     const char16_t *crasherString = u"\u3325\u4a16";
4579     UErrorCode status = U_ZERO_ERROR;
4580     UBreakIterator* iter = ubrk_open(UBRK_WORD, nullptr, crasherString, -1, &status);
4581     if (U_FAILURE(status)) {
4582         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4583         return;
4584     }
4585     ubrk_first(iter);
4586     int32_t pos = 0;
4587     int32_t lastPos = -1;
4588     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4589         if (pos <= lastPos) {
4590             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4591             break;
4592         }
4593     }
4594     ubrk_close(iter);
4595 }
4596 
TestBug12932()4597 void RBBITest::TestBug12932() {
4598     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4599     UnicodeString ruleStr(
4600             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4601             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4602             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4603             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4604             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4605             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4606 
4607     UErrorCode status = U_ZERO_ERROR;
4608     UParseError parseError;
4609     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4610     if (status != U_BRK_RULE_SYNTAX) {
4611         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4612                 __FILE__, __LINE__, u_errorName(status));
4613     }
4614 }
4615 
4616 
4617 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4618 //             remain undevided by ICU char, word and line break.
TestEmoji()4619 void RBBITest::TestEmoji() {
4620 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4621     UErrorCode  status = U_ZERO_ERROR;
4622 
4623     CharString testFileName;
4624     testFileName.append(IntlTest::getSourceTestData(status), status);
4625     testFileName.appendPathPart("emoji-test.txt", status);
4626     if (U_FAILURE(status)) {
4627         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4628         return;
4629     }
4630     logln("Opening data file %s\n", testFileName.data());
4631 
4632     int    len;
4633     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4634     if (U_FAILURE(status) || testFile == nullptr) {
4635         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4636         return;
4637     }
4638     UnicodeString testFileAsString(testFile, len);
4639     delete [] testFile;
4640 
4641     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4642     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4643     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4644     int32_t lineNumber = 0;
4645 
4646     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4647     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4648     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4649     if (U_FAILURE(status)) {
4650         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4651         return;
4652     }
4653 
4654     while (lineMatcher.find()) {
4655         ++lineNumber;
4656         UnicodeString line = lineMatcher.group(status);
4657         hexMatcher.reset(line);
4658         UnicodeString testString;   // accumulates the emoji sequence.
4659         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4660             UnicodeString hex = hexMatcher.group(1, status);
4661             if (hex.length() > 8) {
4662                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4663                 break;
4664             }
4665             CharString hex8;
4666             hex8.appendInvariantChars(hex, status);
4667             UChar32 c = (UChar32)strtol(hex8.data(), nullptr, 16);
4668             if (c<=0x10ffff) {
4669                 testString.append(c);
4670             } else {
4671                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4672                         __FILE__, __LINE__, lineNumber, hex8.data());
4673                 break;
4674             }
4675         }
4676 
4677         if (testString.length() > 1) {
4678             charBreaks->setText(testString);
4679             charBreaks->first();
4680             int32_t firstBreak = charBreaks->next();
4681             if (testString.length() != firstBreak) {
4682                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4683                         __FILE__, __LINE__, lineNumber, firstBreak);
4684             }
4685             wordBreaks->setText(testString);
4686             wordBreaks->first();
4687             firstBreak = wordBreaks->next();
4688             if (testString.length() != firstBreak) {
4689                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4690                         __FILE__, __LINE__, lineNumber, firstBreak);
4691             }
4692             lineBreaks->setText(testString);
4693             lineBreaks->first();
4694             firstBreak = lineBreaks->next();
4695             if (testString.length() != firstBreak) {
4696                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4697                         __FILE__, __LINE__, lineNumber, firstBreak);
4698             }
4699         }
4700     }
4701 #endif
4702 }
4703 
4704 
4705 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4706 
TestBug12519()4707 void RBBITest::TestBug12519() {
4708     UErrorCode status = U_ZERO_ERROR;
4709     LocalPointer<RuleBasedBreakIterator> biEn(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4710     LocalPointer<RuleBasedBreakIterator> biFr(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getFrance(), status)));
4711     if (!assertSuccess(WHERE, status)) {
4712         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4713         return;
4714     }
4715     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4716 
4717     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4718     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4719 
4720     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4721     assertTrue(WHERE, *biEn == *cloneEn);
4722     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4723 
4724     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4725     assertTrue(WHERE, *biFr == *cloneFr);
4726     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4727 
4728     LocalPointer<RuleBasedBreakIterator>biDe(dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getGerman(), status)));
4729     UnicodeString text("Hallo Welt");
4730     biDe->setText(text);
4731     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4732     *biDe = *biFr;
4733     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4734 }
4735 
TestBug12677()4736 void RBBITest::TestBug12677() {
4737     // Check that stripping of comments from rules for getRules() is not confused by
4738     // the presence of '#' characters in the rules that do not introduce comments.
4739     UnicodeString rules(u"!!forward; \n"
4740                          "$x = [ab#];  # a set with a # literal. \n"
4741                          " # .;        # a comment that looks sort of like a rule.   \n"
4742                          " '#' '?';    # a rule with a quoted #   \n"
4743                        );
4744 
4745     UErrorCode status = U_ZERO_ERROR;
4746     UParseError pe;
4747     RuleBasedBreakIterator bi(rules, pe, status);
4748     assertSuccess(WHERE, status);
4749     UnicodeString rtRules = bi.getRules();
4750     assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"),  rtRules);
4751 }
4752 
4753 
TestTableRedundancies()4754 void RBBITest::TestTableRedundancies() {
4755     UErrorCode status = U_ZERO_ERROR;
4756 
4757     LocalPointer<RuleBasedBreakIterator> bi (
4758         dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4759     assertSuccess(WHERE, status);
4760     if (U_FAILURE(status)) return;
4761 
4762     RBBIDataWrapper *dw = bi->fData;
4763     const RBBIStateTable *fwtbl = dw->fForwardTable;
4764     UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS;
4765     int32_t numCharClasses = dw->fHeader->fCatCount;
4766     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4767 
4768     // Check for duplicate columns (character categories)
4769 
4770     std::vector<UnicodeString> columns;
4771     for (int32_t column = 0; column < numCharClasses; column++) {
4772         UnicodeString s;
4773         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4774             RBBIStateTableRow  *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>(fwtbl->fTableData + (fwtbl->fRowLen * r)));
4775             s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]);
4776         }
4777         columns.push_back(s);
4778     }
4779     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4780     for (int c1=1; c1<numCharClasses; c1++) {
4781         int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
4782         for (int c2 = c1+1; c2 < limit; c2++) {
4783             if (columns.at(c1) == columns.at(c2)) {
4784                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4785                 goto out;
4786             }
4787         }
4788     }
4789   out:
4790 
4791     // Check for duplicate states
4792     std::vector<UnicodeString> rows;
4793     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4794         UnicodeString s;
4795         RBBIStateTableRow  *row = reinterpret_cast<RBBIStateTableRow *>(const_cast<char*>((fwtbl->fTableData + (fwtbl->fRowLen * r))));
4796         if (in8Bits) {
4797             s.append(row->r8.fAccepting);
4798             s.append(row->r8.fLookAhead);
4799             s.append(row->r8.fTagsIdx);
4800             for (int32_t column = 0; column < numCharClasses; column++) {
4801                 s.append(row->r8.fNextState[column]);
4802             }
4803         } else {
4804             s.append(row->r16.fAccepting);
4805             s.append(row->r16.fLookAhead);
4806             s.append(row->r16.fTagsIdx);
4807             for (int32_t column = 0; column < numCharClasses; column++) {
4808                 s.append(row->r16.fNextState[column]);
4809             }
4810         }
4811         rows.push_back(s);
4812     }
4813     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4814         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4815             if (rows.at(r1) == rows.at(r2)) {
4816                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4817                 return;
4818             }
4819         }
4820     }
4821 }
4822 
4823 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4824 //            even after next() has returned DONE.
4825 
TestBug13447()4826 void RBBITest::TestBug13447() {
4827     UErrorCode status = U_ZERO_ERROR;
4828     LocalPointer<RuleBasedBreakIterator> bi(
4829         dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4830     assertSuccess(WHERE, status);
4831     if (U_FAILURE(status)) return;
4832     UnicodeString data(u"1234");
4833     bi->setText(data);
4834     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4835     assertEquals(WHERE, 4, bi->next());
4836     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4837     assertEquals(WHERE, UBRK_DONE, bi->next());
4838     assertEquals(WHERE, 4, bi->current());
4839     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4840 }
4841 
4842 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4843 //  for filling the break iterator cache when starting from random positions
4844 //  in the text.
4845 //
4846 //  It's a monkey test, working on random data, with the expected data obtained
4847 //  from forward iteration (no safe rules involved), comparing with results
4848 //  when indexing into the interior of the string (safe rules needed).
4849 
TestReverse()4850 void RBBITest::TestReverse() {
4851     UErrorCode status = U_ZERO_ERROR;
4852 
4853     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4854             BreakIterator::createCharacterInstance(Locale::getEnglish(), status))));
4855     assertSuccess(WHERE, status, true);
4856     status = U_ZERO_ERROR;
4857     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4858             BreakIterator::createWordInstance(Locale::getEnglish(), status))));
4859     assertSuccess(WHERE, status, true);
4860     status = U_ZERO_ERROR;
4861     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4862             BreakIterator::createLineInstance(Locale::getEnglish(), status))));
4863     assertSuccess(WHERE, status, true);
4864     status = U_ZERO_ERROR;
4865     TestReverse(std::unique_ptr<RuleBasedBreakIterator>(dynamic_cast<RuleBasedBreakIterator*>(
4866             BreakIterator::createSentenceInstance(Locale::getEnglish(), status))));
4867     assertSuccess(WHERE, status, true);
4868 }
4869 
TestReverse(std::unique_ptr<RuleBasedBreakIterator> bi)4870 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4871     if (!bi) {
4872         return;
4873     }
4874 
4875     // From the mapping trie in the break iterator's internal data, create a
4876     // vector of UnicodeStrings, one for each character category, containing
4877     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4878     // to avoid an execess of unassigned code points.
4879 
4880     RBBIDataWrapper *data = bi->fData;
4881     int32_t categoryCount = data->fHeader->fCatCount;
4882     UCPTrie *trie = data->fTrie;
4883     bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8;
4884     uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000;
4885 
4886     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4887     for (int cp=0; cp<0x1fff0; ++cp) {
4888         int cat = ucptrie_get(trie, cp);
4889         cat &= ~dictBit;    // And off the dictionary bit from the category.
4890         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4891         if (cat < 0 || cat >= categoryCount) return;
4892         strings[cat].append(cp);
4893     }
4894 
4895     icu_rand randomGen;
4896     const int testStringLength = 10000;
4897     UnicodeString testString;
4898 
4899     for (int i=0; i<testStringLength; ++i) {
4900         int charClass = randomGen() % categoryCount;
4901         if (strings[charClass].length() > 0) {
4902             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4903             testString.append(cp);
4904         }
4905     }
4906 
4907     typedef std::pair<UBool, int32_t> Result;
4908     std::vector<Result> expectedResults;
4909     bi->setText(testString);
4910     for (int i=0; i<testString.length(); ++i) {
4911         bool isboundary = bi->isBoundary(i);
4912         int  ruleStatus = bi->getRuleStatus();
4913         expectedResults.emplace_back(isboundary, ruleStatus);
4914     }
4915 
4916     for (int i=testString.length()-1; i>=0; --i) {
4917         bi->setText(testString);   // clears the internal break cache
4918         Result expected = expectedResults[i];
4919         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4920         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4921     }
4922 }
4923 
4924 
4925 // Ticket 13692 - finding word boundaries in very large numbers or words could
4926 //                be very time consuming. When the problem was present, this void test
4927 //                would run more than fifteen minutes, which is to say, the failure was noticeale.
4928 
TestBug13692()4929 void RBBITest::TestBug13692() {
4930     UErrorCode status = U_ZERO_ERROR;
4931     LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4932             BreakIterator::createWordInstance(Locale::getEnglish(), status)), status);
4933     if (!assertSuccess(WHERE, status, true)) {
4934         return;
4935     }
4936     constexpr int32_t LENGTH = 1000000;
4937     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4938     for (int i=0; i<20; i+=2) {
4939         longNumber.setCharAt(i, u' ');
4940     }
4941     bi->setText(longNumber);
4942     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4943     assertSuccess(WHERE, status);
4944 }
4945 
4946 
TestProperties()4947 void RBBITest::TestProperties() {
4948     UErrorCode errorCode = U_ZERO_ERROR;
4949     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4950     if (!prependSet.isEmpty()) {
4951         errln(
4952             "[:GCB=Prepend:] is not empty any more. "
4953             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4954             "change this test to the opposite condition.");
4955     }
4956 }
4957 
4958 
4959 //
4960 //  TestDebug    -  A place-holder test for debugging purposes.
4961 //                  For putting in fragments of other tests that can be invoked
4962 //                  for tracing  without a lot of unwanted extra stuff happening.
4963 //
TestDebug()4964 void RBBITest::TestDebug() {
4965     UErrorCode status = U_ZERO_ERROR;
4966     LocalPointer<RuleBasedBreakIterator> bi (dynamic_cast<RuleBasedBreakIterator*>(
4967             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)), status);
4968     if (!assertSuccess(WHERE, status, true)) {
4969         return;
4970     }
4971     const UnicodeString &rules = bi->getRules();
4972     UParseError pe;
4973     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4974     assertSuccess(WHERE, status);
4975 }
4976 
4977 
4978 //
4979 //  TestDebugRules   A stub test for use in debugging rule compilation problems.
4980 //                   Can be freely altered as needed or convenient.
4981 //                   Leave disabled - #ifdef'ed out - when not activley debugging. The rule source
4982 //                   data files may not be available in all environments.
4983 //                   Any permanent test cases should be moved to rbbitst.txt
4984 //                   (see Bug 20303 in that file, for example), or to another test function in this file.
4985 //
TestDebugRules()4986 void RBBITest::TestDebugRules() {
4987 #if 0
4988     const char16_t *rules = u""
4989         "!!quoted_literals_only; \n"
4990         "!!chain; \n"
4991         "!!lookAheadHardBreak; \n"
4992         " \n"
4993         // "[a] / ; \n"
4994         "[a] [b] / [c] [d]; \n"
4995         "[a] [b] / [c] [d] {100}; \n"
4996         "[x] [a] [b] / [c] [d] {100}; \n"
4997         "[a] [b] [c] / [d] {100}; \n"
4998         //" [c] [d] / [e] [f]; \n"
4999         //"[a] [b] / [c]; \n"
5000         ;
5001 
5002     UErrorCode status = U_ZERO_ERROR;
5003     CharString path(pathToDataDirectory(), status);
5004     path.appendPathPart("brkitr", status);
5005     path.appendPathPart("rules", status);
5006     path.appendPathPart("line.txt", status);
5007     int    len;
5008     std::unique_ptr<char16_t []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
5009     if (!assertSuccess(WHERE, status)) {
5010         return;
5011     }
5012 
5013     UParseError pe;
5014     // rules = testFile.get();
5015     RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
5016 
5017     if (!assertSuccess(WHERE, status)) {
5018         delete bi;
5019         return;
5020     }
5021     // bi->dumpTables();
5022 
5023     delete bi;
5024 #endif
5025 }
5026 
testTrieStateTable(int32_t numChar,bool expectedTrieWidthIn8Bits,bool expectedStateRowIn8Bits)5027 void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) {
5028     UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16;
5029     int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0;
5030     // Text are duplicate characters from U+4E00 to U+4FFF
5031     UnicodeString text;
5032     for (char16_t c = 0x4e00; c < 0x5000; c++) {
5033         text.append(c).append(c);
5034     }
5035     // Generate rule which will caused length+4 character classes and
5036     // length+3 states
5037     UnicodeString rules(u"!!quoted_literals_only;");
5038     for (char16_t c = 0x4e00; c < 0x4e00 + numChar; c++) {
5039         rules.append(u'\'').append(c).append(c).append(u"';");
5040     }
5041     rules.append(u".;");
5042     UErrorCode status = U_ZERO_ERROR;
5043     UParseError parseError;
5044     RuleBasedBreakIterator bi(rules, parseError, status);
5045 
5046     assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount);
5047     assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates);
5048     assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie));
5049     assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS);
5050     assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS);
5051 
5052     bi.setText(text);
5053 
5054     int32_t pos;
5055     int32_t i = 0;
5056     while ((pos = bi.next()) > 0) {
5057         // The first numChar should not break between the pair
5058         if (i++ < numChar) {
5059             assertEquals(WHERE, i * 2, pos);
5060         } else {
5061             // After the first numChar next(), break on each character.
5062             assertEquals(WHERE, i + numChar, pos);
5063         }
5064     }
5065     while ((pos = bi.previous()) > 0) {
5066         // The first numChar should not break between the pair
5067         if (--i < numChar) {
5068             assertEquals(WHERE, i * 2, pos);
5069         } else {
5070             // After the first numChar next(), break on each character.
5071             assertEquals(WHERE, i + numChar, pos);
5072         }
5073     }
5074 }
5075 
Test8BitsTrieWith8BitStateTable()5076 void RBBITest::Test8BitsTrieWith8BitStateTable() {
5077     testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
5078 }
5079 
Test16BitsTrieWith8BitStateTable()5080 void RBBITest::Test16BitsTrieWith8BitStateTable() {
5081     testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
5082 }
5083 
Test16BitsTrieWith16BitStateTable()5084 void RBBITest::Test16BitsTrieWith16BitStateTable() {
5085     testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
5086 }
5087 
Test8BitsTrieWith16BitStateTable()5088 void RBBITest::Test8BitsTrieWith16BitStateTable() {
5089     // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to
5090     // create state table in 16 bits.
5091 
5092     // Generate 510 'a' as text
5093     UnicodeString text;
5094     for (int32_t i = 0; i < 510; i++) {
5095         text.append(u'a');
5096     }
5097 
5098     UnicodeString rules(u"!!quoted_literals_only;'");
5099     // 254 'a' in the rule will cause 256 states
5100     for (int32_t i = 0; i < 254; i++) {
5101         rules.append(u'a');
5102     }
5103     rules.append(u"';.;");
5104 
5105     UErrorCode status = U_ZERO_ERROR;
5106     UParseError parseError;
5107     LocalPointer<RuleBasedBreakIterator> bi(new RuleBasedBreakIterator(rules, parseError, status));
5108 
5109     assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates);
5110     assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie));
5111     assertEquals(WHERE,
5112                  false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS));
5113     bi->setText(text);
5114 
5115     // break positions:
5116     // 254, 508, 509, ... 510
5117     assertEquals("next()", 254, bi->next());
5118     int32_t i = 0;
5119     int32_t pos;
5120     while ((pos = bi->next()) > 0) {
5121         assertEquals(WHERE, 508 + i , pos);
5122         i++;
5123     }
5124     i = 0;
5125     while ((pos = bi->previous()) > 0) {
5126         i++;
5127         if (pos >= 508) {
5128             assertEquals(WHERE, 510 - i , pos);
5129         } else {
5130             assertEquals(WHERE, 254 , pos);
5131         }
5132     }
5133 }
5134 
5135 // Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and
5136 // that there are no problems with rules at the size that transitions between the two.
5137 //
5138 // A rule that matches a literal string, like 'abcdefghij', will require one state and
5139 // one character class per character in the string. So we can make a rule to tickle the
5140 // boundaries by using literal strings of various lengths.
5141 //
5142 // For both the number of states and the number of character classes, the eight bit format
5143 // only has 7 bits available, allowing for 128 values. For both, a few values are reserved,
// leaving 120 something available. This test runs the string over the range of 120 - 260,
5145 // which allows some margin for changes to the number of values reserved by the rule builder
5146 // without breaking the test.
5147 
TestTable_8_16_Bits()5148 void RBBITest::TestTable_8_16_Bits() {
5149 
5150     // testStr serves as both the source of the rule string (truncated to the desired length)
5151     // and as test data to check matching behavior. A break rule consisting of the first 120
5152     // characters of testStr will match the first 120 chars of the full-length testStr.
5153     UnicodeString testStr;
5154     for (char16_t c=0x3000; c<0x3200; ++c) {
5155         testStr.append(c);
5156     }
5157 
5158     const int32_t startLength = 120;   // The shortest rule string to test.
5159     const int32_t endLength = 260;     // The longest rule string to test
5160     const int32_t increment = this->quick ? endLength - startLength : 1;
5161 
5162     for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) {
5163         UParseError parseError;
5164         UErrorCode status = U_ZERO_ERROR;
5165 
5166         UnicodeString ruleString{u"!!quoted_literals_only; '#';"};
5167         ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen));
5168         RuleBasedBreakIterator bi(ruleString, parseError, status);
5169         if (!assertSuccess(WHERE, status)) {
5170             errln(ruleString);
5171             break;
5172         }
5173         // bi.dumpTables();
5174 
5175         // Verify that the break iterator is functioning - that the first boundary found
5176         // in testStr is at the length of the rule string.
5177         bi.setText(testStr);
5178         assertEquals(WHERE, ruleLen, bi.next());
5179 
5180         // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache
5181         // of previously detected boundaries, thus forcing the engine to run the safe reverse rules.
5182         bi.setText(testStr);
5183         int32_t result = bi.preceding(ruleLen);
5184         assertEquals(WHERE, 0, result);
5185 
5186         // Verify that the range of rule lengths being tested cover the translations
5187         // from 8 to 16 bit data.
5188         bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS;
5189         bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8;
5190 
5191         if (ruleLen == startLength) {
5192             assertEquals(WHERE, true, has8BitRowData);
5193             assertEquals(WHERE, true, has8BitsTrie);
5194         }
5195         if (ruleLen == endLength) {
5196             assertEquals(WHERE, false, has8BitRowData);
5197             assertEquals(WHERE, false, has8BitsTrie);
5198         }
5199     }
5200 }
5201 
5202 /* Test handling of a large number of look-ahead rules.
5203  * The number of rules in the test exceeds the implementation limits prior to the
5204  * improvements introduced with #13590.
5205  *
5206  * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ...
5207  * The text being matched is sequential, "ABCDEFGHI..."
5208  *
5209  * The upshot is that the look-ahead rules all match on their preceding context,
5210  * and consequently must save a potential result, but then fail to match on their
5211  * trailing context, so that they don't actually cause a boundary.
5212  *
5213  * Additionally, add a ".*" rule, so there are no boundaries unless a
5214  * look-ahead hard-break rule forces one.
5215  */
TestBug13590()5216 void RBBITest::TestBug13590() {
5217     UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"};
5218 
5219     const int NUM_LOOKAHEAD_RULES = 50;
5220     const char16_t STARTING_CHAR = u'\u5000';
5221     char16_t firstChar;
5222     for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) {
5223         firstChar = STARTING_CHAR + ruleNum*2;
5224         rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'')
5225              .append(u' ') .append(u'/') .append(u' ')
5226              .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'')
5227              .append(u';') .append(u'\n');
5228     }
5229 
5230     // Change the last rule added from the form "UV / WY" to "UV / WX".
5231     // Changes the rule so that it will match - all 4 chars are in ascending sequence.
5232     rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3));
5233 
5234     UErrorCode status = U_ZERO_ERROR;
5235     UParseError parseError;
5236     RuleBasedBreakIterator bi(rules, parseError, status);
5237     if (!assertSuccess(WHERE, status)) {
5238         errln(rules);
5239         return;
5240     }
5241     // bi.dumpTables();
5242 
5243     UnicodeString testString;
5244     for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) {
5245         testString.append(c);
5246     }
5247     bi.setText(testString);
5248 
5249     int breaksFound = 0;
5250     while (bi.next() != UBRK_DONE) {
5251         ++breaksFound;
5252     }
5253 
5254     // Two matches are expected, one from the last rule that was explicitly modified,
5255     // and one at the end of the text.
5256     assertEquals(WHERE, 2, breaksFound);
5257 }
5258 
5259 
5260 #if U_ENABLE_TRACING
// Events recorded by the utrace callbacks defined below. SetupTestTrace() clears all four.
static std::vector<std::string> gData;     // string argument of each recorded utrace_data call
static std::vector<int32_t> gEntryFn;      // fnNumber of each recorded utrace_entry call
static std::vector<int32_t> gExitFn;       // fnNumber of each recorded utrace_exit call
static std::vector<int32_t> gDataFn;       // fnNumber of each recorded utrace_data call
5265 
traceData(const void *,int32_t fnNumber,int32_t,const char *,va_list args)5266 static void U_CALLCONV traceData(
5267         const void*,
5268         int32_t fnNumber,
5269         int32_t,
5270         const char *,
5271         va_list args) {
5272     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5273         const char* data = va_arg(args, const char*);
5274         gDataFn.push_back(fnNumber);
5275         gData.push_back(data);
5276     }
5277 }
5278 
traceEntry(const void *,int32_t fnNumber)5279 static void traceEntry(const void *, int32_t fnNumber) {
5280     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5281         gEntryFn.push_back(fnNumber);
5282     }
5283 }
5284 
traceExit(const void *,int32_t fnNumber,const char *,va_list)5285 static void traceExit(const void *, int32_t fnNumber, const char *, va_list) {
5286     if (UTRACE_UBRK_START <= fnNumber && fnNumber <= UTRACE_UBRK_LIMIT) {
5287         gExitFn.push_back(fnNumber);
5288     }
5289 }
5290 
5291 
assertTestTraceResult(int32_t fnNumber,const char * expectedData)5292 void RBBITest::assertTestTraceResult(int32_t fnNumber, const char* expectedData) {
5293     assertEquals("utrace_entry should be called ", 1, gEntryFn.size());
5294     assertEquals("utrace_entry should be called with ", fnNumber, gEntryFn[0]);
5295     assertEquals("utrace_exit should be called ", 1, gExitFn.size());
5296     assertEquals("utrace_exit should be called with ", fnNumber, gExitFn[0]);
5297 
5298     if (expectedData == nullptr) {
5299       assertEquals("utrace_data should not be called ", 0, gDataFn.size());
5300       assertEquals("utrace_data should not be called ", 0, gData.size());
5301     } else {
5302       assertEquals("utrace_data should be called ", 1, gDataFn.size());
5303       assertEquals("utrace_data should be called with ", fnNumber, gDataFn[0]);
5304       assertEquals("utrace_data should be called ", 1, gData.size());
5305       assertEquals("utrace_data should pass in ", expectedData, gData[0].c_str());
5306     }
5307 }
5308 
SetupTestTrace()5309 void SetupTestTrace() {
5310     gEntryFn.clear();
5311     gExitFn.clear();
5312     gDataFn.clear();
5313     gData.clear();
5314 
5315     const void* context = nullptr;
5316     utrace_setFunctions(context, traceEntry, traceExit, traceData);
5317     utrace_setLevel(UTRACE_INFO);
5318 }
5319 
TestTraceCreateCharacter()5320 void RBBITest::TestTraceCreateCharacter() {
5321     SetupTestTrace();
5322     IcuTestErrorCode status(*this, "TestTraceCreateCharacter");
5323     LocalPointer<BreakIterator> brkitr(
5324         BreakIterator::createCharacterInstance("zh-CN", status));
5325     status.errIfFailureAndReset();
5326     assertTestTraceResult(UTRACE_UBRK_CREATE_CHARACTER, nullptr);
5327 }
5328 
TestTraceCreateTitle()5329 void RBBITest::TestTraceCreateTitle() {
5330     SetupTestTrace();
5331     IcuTestErrorCode status(*this, "TestTraceCreateTitle");
5332     LocalPointer<BreakIterator> brkitr(
5333         BreakIterator::createTitleInstance("zh-CN", status));
5334     status.errIfFailureAndReset();
5335     assertTestTraceResult(UTRACE_UBRK_CREATE_TITLE, nullptr);
5336 }
5337 
TestTraceCreateSentence()5338 void RBBITest::TestTraceCreateSentence() {
5339     SetupTestTrace();
5340     IcuTestErrorCode status(*this, "TestTraceCreateSentence");
5341     LocalPointer<BreakIterator> brkitr(
5342         BreakIterator::createSentenceInstance("zh-CN", status));
5343     status.errIfFailureAndReset();
5344     assertTestTraceResult(UTRACE_UBRK_CREATE_SENTENCE, nullptr);
5345 }
5346 
TestTraceCreateWord()5347 void RBBITest::TestTraceCreateWord() {
5348     SetupTestTrace();
5349     IcuTestErrorCode status(*this, "TestTraceCreateWord");
5350     LocalPointer<BreakIterator> brkitr(
5351         BreakIterator::createWordInstance("zh-CN", status));
5352     status.errIfFailureAndReset();
5353     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5354 }
5355 
TestTraceCreateLine()5356 void RBBITest::TestTraceCreateLine() {
5357     SetupTestTrace();
5358     IcuTestErrorCode status(*this, "TestTraceCreateLine");
5359     LocalPointer<BreakIterator> brkitr(
5360         BreakIterator::createLineInstance("zh-CN", status));
5361     status.errIfFailureAndReset();
5362     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line");
5363 }
5364 
TestTraceCreateLineStrict()5365 void RBBITest::TestTraceCreateLineStrict() {
5366     SetupTestTrace();
5367     IcuTestErrorCode status(*this, "TestTraceCreateLineStrict");
5368     LocalPointer<BreakIterator> brkitr(
5369         BreakIterator::createLineInstance("zh-CN-u-lb-strict", status));
5370     status.errIfFailureAndReset();
5371     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict");
5372 }
5373 
TestTraceCreateLineNormal()5374 void RBBITest::TestTraceCreateLineNormal() {
5375     SetupTestTrace();
5376     IcuTestErrorCode status(*this, "TestTraceCreateLineNormal");
5377     LocalPointer<BreakIterator> brkitr(
5378         BreakIterator::createLineInstance("zh-CN-u-lb-normal", status));
5379     status.errIfFailureAndReset();
5380     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal");
5381 }
5382 
TestTraceCreateLineLoose()5383 void RBBITest::TestTraceCreateLineLoose() {
5384     SetupTestTrace();
5385     IcuTestErrorCode status(*this, "TestTraceCreateLineLoose");
5386     LocalPointer<BreakIterator> brkitr(
5387         BreakIterator::createLineInstance("zh-CN-u-lb-loose", status));
5388     status.errIfFailureAndReset();
5389     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose");
5390 }
5391 
TestTraceCreateLineLoosePhrase()5392 void RBBITest::TestTraceCreateLineLoosePhrase() {
5393     SetupTestTrace();
5394     IcuTestErrorCode status(*this, "TestTraceCreateLineLoosePhrase");
5395     LocalPointer<BreakIterator> brkitr(
5396         BreakIterator::createLineInstance("ja-u-lb-loose-lw-phrase", status));
5397     status.errIfFailureAndReset();
5398     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_loose_phrase");
5399 }
5400 
TestTraceCreateLineNormalPhrase()5401 void RBBITest::TestTraceCreateLineNormalPhrase() {
5402     SetupTestTrace();
5403     IcuTestErrorCode status(*this, "TestTraceCreateLineNormalPhrase");
5404     LocalPointer<BreakIterator> brkitr(
5405         BreakIterator::createLineInstance("ja-u-lb-normal-lw-phrase", status));
5406     status.errIfFailureAndReset();
5407     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_normal_phrase");
5408 }
5409 
TestTraceCreateLineStrictPhrase()5410 void RBBITest::TestTraceCreateLineStrictPhrase() {
5411     SetupTestTrace();
5412     IcuTestErrorCode status(*this, "TestTraceCreateLineStrictPhrase");
5413     LocalPointer<BreakIterator> brkitr(
5414         BreakIterator::createLineInstance("ja-u-lb-strict-lw-phrase", status));
5415     status.errIfFailureAndReset();
5416     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_strict_phrase");
5417 }
5418 
TestTraceCreateLinePhrase()5419 void RBBITest::TestTraceCreateLinePhrase() {
5420     SetupTestTrace();
5421     IcuTestErrorCode status(*this, "TestTraceCreateLinePhrase");
5422     LocalPointer<BreakIterator> brkitr(
5423         BreakIterator::createLineInstance("ja-u-lw-phrase", status));
5424     status.errIfFailureAndReset();
5425     assertTestTraceResult(UTRACE_UBRK_CREATE_LINE, "line_phrase");
5426 }
5427 
TestTraceCreateBreakEngine()5428 void RBBITest::TestTraceCreateBreakEngine() {
5429     rbbi_cleanup();
5430     SetupTestTrace();
5431     IcuTestErrorCode status(*this, "TestTraceCreateBreakEngine");
5432     LocalPointer<BreakIterator> brkitr(
5433         BreakIterator::createWordInstance("zh-CN", status));
5434     status.errIfFailureAndReset();
5435     assertTestTraceResult(UTRACE_UBRK_CREATE_WORD, nullptr);
5436 
5437     // To word break the following text, BreakIterator will create 5 dictionary
5438     // break engine internally.
5439     UnicodeString text(
5440         u"test "
5441         u"測試 " // Hani
5442         u"សាកល្បង " // Khmr
5443         u"ທົດສອບ " // Laoo
5444         u"စမ်းသပ်မှု " // Mymr
5445         u"ทดสอบ " // Thai
5446         u"test "
5447     );
5448     brkitr->setText(text);
5449 
5450     // Loop through all the text.
5451     while (brkitr->next() > 0) ;
5452 
5453     assertEquals("utrace_entry should be called ", 6, gEntryFn.size());
5454     assertEquals("utrace_exit should be called ", 6, gExitFn.size());
5455     assertEquals("utrace_data should be called ", 5, gDataFn.size());
5456 
5457     for (std::vector<int>::size_type i = 0; i < gDataFn.size(); i++) {
5458         assertEquals("utrace_entry should be called ",
5459                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gEntryFn[i+1]);
5460         assertEquals("utrace_exit should be called ",
5461                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gExitFn[i+1]);
5462         assertEquals("utrace_data should be called ",
5463                      UTRACE_UBRK_CREATE_BREAK_ENGINE, gDataFn[i]);
5464     }
5465 
5466     assertEquals("utrace_data should pass ", "Hani", gData[0].c_str());
5467     assertEquals("utrace_data should pass ", "Khmr", gData[1].c_str());
5468     assertEquals("utrace_data should pass ", "Laoo", gData[2].c_str());
5469     assertEquals("utrace_data should pass ", "Mymr", gData[3].c_str());
5470     assertEquals("utrace_data should pass ", "Thai", gData[4].c_str());
5471 
5472 }
5473 #endif
5474 
TestUnpairedSurrogate()5475 void RBBITest::TestUnpairedSurrogate() {
5476     UnicodeString rules(u"ab;");
5477 
5478     UErrorCode status = U_ZERO_ERROR;
5479     UParseError pe;
5480     RuleBasedBreakIterator bi1(rules, pe, status);
5481     assertSuccess(WHERE, status);
5482     UnicodeString rtRules = bi1.getRules();
5483     // make sure the simple one work first.
5484     assertEquals(WHERE, rules,  rtRules);
5485 
5486 
5487     rules = UnicodeString(u"a\\ud800b;").unescape();
5488     pe.line = 0;
5489     pe.offset = 0;
5490     RuleBasedBreakIterator bi2(rules, pe, status);
5491     assertEquals(WHERE "unpaired lead surrogate", U_ILLEGAL_CHAR_FOUND , status);
5492     if (pe.line != 1 || pe.offset != 1) {
5493         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5494     }
5495 
5496     status = U_ZERO_ERROR;
5497     rules = UnicodeString(u"a\\ude00b;").unescape();
5498     pe.line = 0;
5499     pe.offset = 0;
5500     RuleBasedBreakIterator bi3(rules, pe, status);
5501     assertEquals(WHERE "unpaired tail surrogate", U_ILLEGAL_CHAR_FOUND , status);
5502     if (pe.line != 1 || pe.offset != 1) {
5503         errln("pe (line, offset) expected (1, 1), got (%d, %d)", pe.line, pe.offset);
5504     }
5505 
5506     // make sure the surrogate one work too.
5507     status = U_ZERO_ERROR;
5508     rules = UnicodeString(u"a��b;");
5509     RuleBasedBreakIterator bi4(rules, pe, status);
5510     rtRules = bi4.getRules();
5511     assertEquals(WHERE, rules, rtRules);
5512 }
5513 
5514 // Read file generated by
5515 // https://github.com/unicode-org/lstm_word_segmentation/blob/master/segment_text.py
5516 // as test cases and compare the Output.
5517 // Format of the file
5518 //   Model:\t[Model Name (such as 'Thai_graphclust_model4_heavy')]
5519 //   Embedding:\t[Embedding type (such as 'grapheme_clusters_tf')]
5520 //   Input:\t[source text]
5521 //   Output:\t[expected output separated by | ]
5522 //   Input: ...
5523 //   Output: ...
5524 
runLSTMTestFromFile(const char * filename,UScriptCode script)5525 void RBBITest::runLSTMTestFromFile(const char* filename, UScriptCode script) {
5526     // The expectation in this test depends on LSTM, skip the test if the
5527     // configuration is not build with LSTM data.
5528     if (skipLSTMTest()) {
5529         return;
5530     }
5531     UErrorCode   status = U_ZERO_ERROR;
5532     LocalPointer<BreakIterator> iterator(BreakIterator::createWordInstance(Locale(), status));
5533     if (U_FAILURE(status)) {
5534         errln("%s:%d Error %s Cannot create Word BreakIterator", __FILE__, __LINE__, u_errorName(status));
5535         return;
5536     }
5537     //  Open and read the test data file.
5538     const char *testDataDirectory = IntlTest::getSourceTestData(status);
5539     CharString testFileName(testDataDirectory, -1, status);
5540     testFileName.append(filename, -1, status);
5541 
5542     int len;
5543     char16_t *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
5544     if (U_FAILURE(status)) {
5545         errln("%s:%d Error %s opening test file %s", __FILE__, __LINE__, u_errorName(status), filename);
5546         return;
5547     }
5548 
5549     //  Put the test data into a UnicodeString
5550     UnicodeString testString(false, testFile, len);
5551 
5552     int32_t start = 0;
5553 
5554     UnicodeString line;
5555     int32_t end;
5556     std::string actual_sep_str;
5557     int32_t caseNum = 0;
5558     // Iterate through all the lines in the test file.
5559     do {
5560         int32_t cr = testString.indexOf(u'\r', start);
5561         int32_t lf = testString.indexOf(u'\n', start);
5562         end = cr >= 0 ? (lf >= 0 ? std::min(cr, lf) : cr) : lf;
5563         line = testString.tempSubString(start, end < 0 ? INT32_MAX : end - start);
5564         if (line.length() > 0) {
5565             // Separate each line to key and value by TAB.
5566             int32_t tab = line.indexOf(u'\t');
5567             UnicodeString key = line.tempSubString(0, tab);
5568             const UnicodeString value = line.tempSubString(tab+1);
5569 
5570             if (key == "Model:") {
5571                 // Verify the expectation in the test file match the LSTM model
5572                 // we are using now.
5573                 const LSTMData* data = CreateLSTMDataForScript(script, status);
5574                 if (U_FAILURE(status)) {
5575                     dataerrln("%s:%d Error %s Cannot create LSTM data for script %s",
5576                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script));
5577                     return;
5578                 }
5579                 UnicodeString name(LSTMDataName(data));
5580                 DeleteLSTMData(data);
5581                 if (value != name) {
5582                     std::string utf8Name, utf8Value;
5583                     dataerrln("%s:%d Error %s The LSTM data for script %s is %s instead of %s",
5584                               __FILE__, __LINE__, u_errorName(status), uscript_getName(script),
5585                               name.toUTF8String<std::string>(utf8Name).c_str(),
5586                               value.toUTF8String<std::string>(utf8Value).c_str());
5587                     return;
5588                 }
5589             } else if (key == "Input:") {
5590                 UnicodeString input("prefix ");
5591                 input += value + " suffix";
5592                 std::stringstream ss;
5593 
5594                 // Construct the UText which is expected by the the engine as
5595                 // input from the UnicodeString.
5596                 UText ut = UTEXT_INITIALIZER;
5597                 utext_openConstUnicodeString(&ut, &input, &status);
5598                 if (U_FAILURE(status)) {
5599                     dataerrln("Could not utext_openConstUnicodeString for " + value + UnicodeString(u_errorName(status)));
5600                     return;
5601                 }
5602 
5603                 iterator->setText(&ut, status);
5604                 if (U_FAILURE(status)) {
5605                     errln("%s:%d Error %s Could not setText to BreakIterator", __FILE__, __LINE__, u_errorName(status));
5606                     return;
5607                 }
5608 
5609                 int32_t bp;
5610                 for (bp = iterator->first(); bp != BreakIterator::DONE; bp = iterator->next()) {
5611                     ss << bp;
5612                     if (bp != input.length()) {
5613                         ss << ", ";
5614                     }
5615                 }
5616 
5617                 utext_close(&ut);
5618                 // Turn the break points into a string for easy comparison
5619                 // output.
5620                 actual_sep_str = "{" + ss.str() + "}";
5621             } else if (key == "Output:" && !actual_sep_str.empty()) {
5622                 UnicodeString input("prefix| |");
5623                 input += value + "| |suffix";
5624                 std::string d;
5625                 int32_t sep;
5626                 int32_t start = 0;
5627                 int32_t curr = 0;
5628                 std::stringstream ss;
5629                 // Include 0 as the break point.
5630                 ss << "0, ";
5631                 while ((sep = input.indexOf(u'|', start)) >= 0) {
5632                     int32_t len = sep - start;
5633                     if (len > 0) {
5634                         if (curr > 0) {
5635                             ss << ", ";
5636                         }
5637                         curr += len;
5638                         ss << curr;
5639                     }
5640                     start = sep + 1;
5641                 }
5642                 // Include end of the string as break point.
5643                 ss << ", " << curr + input.length() - start;
5644                 // Turn the break points into a string for easy comparison
5645                 // output.
5646                 std::string expected = "{" + ss.str() + "}";
5647                 std::string utf8;
5648 
5649                 assertEquals((input + " Test Case#" + caseNum).toUTF8String<std::string>(utf8).c_str(),
5650                              expected.c_str(), actual_sep_str.c_str());
5651                 actual_sep_str.clear();
5652             }
5653         }
5654         start = std::max(cr, lf) + 1;
5655     } while (end >= 0);
5656 
5657     delete [] testFile;
5658 }
5659 
TestLSTMThai()5660 void RBBITest::TestLSTMThai() {
5661     runLSTMTestFromFile("Thai_graphclust_model4_heavy_Test.txt", USCRIPT_THAI);
5662 }
5663 
TestLSTMBurmese()5664 void RBBITest::TestLSTMBurmese() {
5665     runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
5666 }
5667 
5668 
5669 // Test preceding(index) and following(index), with semi-random indexes.
5670 // The random indexes are produced in clusters that are relatively closely spaced,
5671 // to increase the occurrences of hits to the internal break cache.
5672 
TestRandomAccess()5673 void RBBITest::TestRandomAccess() {
5674     static constexpr int32_t CACHE_SIZE = 128;
5675 
5676     UnicodeString testData;
5677     for (int i=0; i<CACHE_SIZE*2; ++i) {
5678         testData.append(u"aaaa\n");
5679     }
5680 
5681     UErrorCode status = U_ZERO_ERROR;
5682     LocalPointer<RuleBasedBreakIterator> bi(
5683           dynamic_cast<RuleBasedBreakIterator*>(BreakIterator::createLineInstance(Locale::getEnglish(), status)),
5684             status);
5685     if (!assertSuccess(WHERE, status)) { return; };
5686 
5687     bi->setText(testData);
5688 
5689     auto expectedPreceding = [](int from) {
5690         if (from == 0) {return UBRK_DONE;}
5691         if (from % 5 == 0) {return from - 5;}
5692         return from - (from % 5);
5693     };
5694 
5695     auto expectedFollow = [testData](int from) {
5696         if (from >= testData.length()) {return UBRK_DONE;}
5697         if (from % 5 == 0) {return from + 5;}
5698         return from + (5 - (from % 5));
5699     };
5700 
5701     auto randomStringIndex = [testData]() {
5702         static icu_rand randomGenerator;  // produces random uint32_t values.
5703         static int lastNum;
5704         static int clusterCount;
5705         static constexpr int CLUSTER_SIZE = 100;
5706         static constexpr int CLUSTER_LENGTH = 10;
5707 
5708         if (clusterCount < CLUSTER_LENGTH) {
5709             ++clusterCount;
5710             lastNum += (randomGenerator() % CLUSTER_SIZE);
5711             lastNum -= CLUSTER_SIZE / 2;
5712             lastNum = std::max(0, lastNum);
5713             // Deliberately test indexes > testData.length.
5714             lastNum = std::min(testData.length() + 5, lastNum);
5715         } else {
5716             clusterCount = 0;
5717             lastNum = randomGenerator() % testData.length();
5718         }
5719         return lastNum;
5720     };
5721 
5722     for (int i=0; i<5000; ++i) {
5723         int idx = randomStringIndex();
5724         assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
5725         idx = randomStringIndex();
5726         assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
5727     }
5728 }
5729 
// A fake Tai Le break engine which handles the Unicode Tai Le (Tale) block
5731 // https://unicode.org/charts/PDF/U1950.pdf
5732 // U+1950 - U+197F and always break after Tone letters (U+1970-U+1974)
5733 class FakeTaiLeBreakEngine : public ExternalBreakEngine {
5734  public:
FakeTaiLeBreakEngine()5735   FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) {
5736   }
~FakeTaiLeBreakEngine()5737   virtual ~FakeTaiLeBreakEngine() {
5738   }
isFor(UChar32 c,const char *) const5739   virtual bool isFor(UChar32 c, const char* /* locale */) const override {
5740       // We implmement this for any locale, not return false for some langauge
5741       // here.
5742       return handles(c);
5743   }
handles(UChar32 c) const5744   virtual bool handles(UChar32 c) const override {
5745       return block.contains(c);
5746   }
fillBreaks(UText * text,int32_t start,int32_t end,int32_t * foundBreaks,int32_t foundBreaksCapacity,UErrorCode & status) const5747   virtual int32_t fillBreaks(UText* text,  int32_t start, int32_t end,
5748                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
5749                              UErrorCode& status) const override {
5750        if (U_FAILURE(status)) return 0;
5751        int32_t i = 0;
5752        // Save the state of the utext
5753        int64_t savedIndex = utext_getNativeIndex(text);
5754        if (savedIndex != start) {
5755            utext_setNativeIndex(text, start);
5756        }
5757        int32_t current;
5758        while((current = (int32_t)utext_getNativeIndex(text)) < end) {
5759          UChar32 c = utext_current32(text);
5760          // Break after tone marks as a fake break point.
5761          if (tones.contains(c)) {
5762              if (i >= foundBreaksCapacity) {
5763                  status = U_BUFFER_OVERFLOW_ERROR;
5764                  utext_setNativeIndex(text, savedIndex);
5765                  return i;
5766              }
5767              foundBreaks[i++] = current;
5768          }
5769          UTEXT_NEXT32(text);
5770        }
5771        // Restore the utext
5772        if (savedIndex != current) {
5773            utext_setNativeIndex(text, savedIndex);
5774        }
5775        return i;
5776   }
5777 
5778  private:
5779   UnicodeSet block;
5780   UnicodeSet tones;
5781 };
5782 
5783 // A Fake Yue Break Engine which handle CJK Unified Ideographs
5784 // block (U+4E00-U+9FFF) when locale start with 'yue' and break
5785 // after every character.
5786 class FakeYueBreakEngine : public ExternalBreakEngine {
5787  public:
FakeYueBreakEngine()5788   FakeYueBreakEngine() : block(0x4e00, 0x9FFF) {
5789   }
~FakeYueBreakEngine()5790   virtual ~FakeYueBreakEngine() {
5791   }
isFor(UChar32 c,const char * locale) const5792   virtual bool isFor(UChar32 c, const char* locale) const override {
5793       // We implmement this for any locale starts with "yue" such as
5794       // "yue", "yue-CN", "yue-Hant-CN", etc.
5795       return handles(c) && uprv_strncmp("yue", locale, 3) == 0;
5796   }
handles(UChar32 c) const5797   virtual bool handles(UChar32 c) const override {
5798       return block.contains(c);
5799   }
fillBreaks(UText * text,int32_t start,int32_t end,int32_t * foundBreaks,int32_t foundBreaksCapacity,UErrorCode & status) const5800   virtual int32_t fillBreaks(UText* text,  int32_t start, int32_t end,
5801                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
5802                              UErrorCode& status) const override {
5803        (void)text;
5804        if (U_FAILURE(status)) return 0;
5805        int32_t i = 0;
5806        int32_t current = start;
5807        while (current++ < end) {
5808            // A fake word segmentation by breaking every two Unicode.
5809            if ((current - start) % 2 == 0) {
5810                if (i >= foundBreaksCapacity) {
5811                    status = U_BUFFER_OVERFLOW_ERROR;
5812                    return i;
5813                }
5814                foundBreaks[i++] = current;
5815            }
5816        }
5817        return i;
5818   }
5819 
5820  private:
5821   UnicodeSet block;
5822 };
5823 
TestExternalBreakEngineWithFakeYue()5824 void RBBITest::TestExternalBreakEngineWithFakeYue() {
5825     UErrorCode status = U_ZERO_ERROR;
5826     UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn");
5827 
5828     std::vector<int32_t> actual1;
5829     {
5830         LocalPointer<BreakIterator> bi1(
5831             BreakIterator::createWordInstance(Locale::getRoot(), status),
5832             status);
5833         bi1->setText(text);
5834         assertTrue(WHERE "BreakIterator::createWordInstance( root )",
5835                    U_SUCCESS(status));
5836 
5837         do {
5838             actual1.push_back(bi1->current());
5839         } while(bi1->next() != BreakIterator::DONE);
5840     }
5841 
5842     std::vector<int32_t> expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15,
5843       16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}});
5844     assertTrue("root break Yue as Chinese", expected1 == actual1);
5845 
5846     status = U_ZERO_ERROR;
5847     RuleBasedBreakIterator::registerExternalBreakEngine(
5848         new FakeYueBreakEngine(), status);
5849     assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine",
5850                U_SUCCESS(status));
5851 
5852     std::vector<int32_t> actual2;
5853     {
5854         status = U_ZERO_ERROR;
5855         LocalPointer<BreakIterator> bi2(
5856             BreakIterator::createWordInstance(Locale("yue"), status), status);
5857         assertTrue(WHERE "BreakIterator::createWordInstance( yue )",
5858                    U_SUCCESS(status));
5859         bi2->setText(text);
5860         do {
5861             actual2.push_back(bi2->current());
5862         } while(bi2->next() != BreakIterator::DONE);
5863     }
5864     std::vector<int32_t> expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20,
5865       22, 23, 24, 26, 27, 30}});
5866     assertTrue(WHERE "break Yue by Fake external breaker",
5867                expected2 == actual2);
5868 }
5869 
TestExternalBreakEngineWithFakeTaiLe()5870 void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
5871     UErrorCode status = U_ZERO_ERROR;
5872     UnicodeString text(
5873         u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ"
5874         u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn");
5875 
5876     std::vector<int32_t> actual1;
5877     {
5878         LocalPointer<BreakIterator> bi1(
5879             BreakIterator::createLineInstance(Locale::getRoot(), status),
5880             status);
5881         bi1->setText(text);
5882         assertTrue(WHERE "BreakIterator::createLineInstance( root )",
5883                    U_SUCCESS(status));
5884 
5885         do {
5886             actual1.push_back(bi1->current());
5887         } while(bi1->next() != BreakIterator::DONE);
5888     }
5889 
5890     std::vector<int32_t> expected1({{
5891       0, 2, 5, 86, 89, 92 }});
5892     assertTrue(WHERE "root break Tai Le", expected1 == actual1);
5893 
5894     RuleBasedBreakIterator::registerExternalBreakEngine(
5895         new FakeTaiLeBreakEngine(), status);
5896     assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine",
5897                U_SUCCESS(status));
5898 
5899     std::vector<int32_t> actual2;
5900     {
5901         status = U_ZERO_ERROR;
5902         LocalPointer<BreakIterator> bi2(
5903             BreakIterator::createLineInstance(Locale("tdd"), status), status);
5904         assertTrue(WHERE "BreakIterator::createLineInstance( tdd )",
5905                    U_SUCCESS(status));
5906         bi2->setText(text);
5907         do {
5908             actual2.push_back(bi2->current());
5909         } while(bi2->next() != BreakIterator::DONE);
5910     }
5911     std::vector<int32_t> expected2({{
5912          0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
5913          70, 73, 76, 80, 86, 89, 92}});
5914     assertTrue("break Tai Le by Fake external breaker",
5915                expected2 == actual2);
5916 }
5917 
5918 // Test a single unpaired unpaired char (either surrogate low or high) in
5919 // an Unicode set will not cause infinity loop.
TestBug22585()5920 void RBBITest::TestBug22585() {
5921     UnicodeString rule = u"$a=[";
5922     rule.append(0xdecb) // an unpaired surrogate high
5923         .append("];");
5924     UParseError pe {};
5925     UErrorCode ec {U_ZERO_ERROR};
5926     RuleBasedBreakIterator bi(rule, pe, ec);
5927 
5928     rule = u"$a=[";
5929     rule.append(0xd94e) // an unpaired surrogate low
5930         .append("];");
5931     ec = U_ZERO_ERROR;
5932     RuleBasedBreakIterator bi2(rule, pe, ec);
5933 }
5934 
5935 // Test a long string with a ; in the end will not cause stack overflow.
TestBug22602()5936 void RBBITest::TestBug22602() {
5937     UnicodeString rule(25000, (UChar32)'A', 25000-1);
5938     rule.append(u";");
5939     UParseError pe {};
5940     UErrorCode ec {U_ZERO_ERROR};
5941     RuleBasedBreakIterator bi(rule, pe, ec);
5942 }
5943 
TestBug22636()5944 void RBBITest::TestBug22636() {
5945     UParseError pe {};
5946     UErrorCode ec {U_ZERO_ERROR};
5947     RuleBasedBreakIterator bi(u"A{77777777777777};", pe, ec);
5948     assertEquals(WHERE, ec, U_BRK_RULE_SYNTAX);
5949     ec = U_ZERO_ERROR;
5950     RuleBasedBreakIterator bi2(u"A{2147483648};", pe, ec);
5951     assertEquals(WHERE, ec, U_BRK_RULE_SYNTAX);
5952     ec = U_ZERO_ERROR;
5953     RuleBasedBreakIterator bi3(u"A{2147483647};", pe, ec);
5954     assertEquals(WHERE, ec, U_ZERO_ERROR);
5955 }
5956 
TestBug22584()5957 void RBBITest::TestBug22584() {
5958     // Creating a break iterator from a rule consisting of a very long
5959     // literal input string caused a stack overflow when deleting the
5960     // parse tree for the input during the rule building process.
5961 
5962     // Failure of this test showed as a crash during the break iterator construction.
5963 
5964     UnicodeString ruleStr(100000, (UChar32)0, 100000);
5965     UParseError pe {};
5966     UErrorCode ec {U_ZERO_ERROR};
5967 
5968     RuleBasedBreakIterator bi(ruleStr, pe, ec);
5969     ec = U_ZERO_ERROR;
5970     ruleStr = u"a/b;c";
5971     RuleBasedBreakIterator bi2(ruleStr, pe, ec);
5972 }
5973 
TestBug22579()5974 void RBBITest::TestBug22579() {
5975     // Test not causing null deref in cloneTree
5976     UnicodeString ruleStr = u"[{ab}];";
5977     UParseError pe {};
5978     UErrorCode ec {U_ZERO_ERROR};
5979 
5980     RuleBasedBreakIterator bi(ruleStr, pe, ec);
5981 }
TestBug22581()5982 void RBBITest::TestBug22581() {
5983     // Test duplicate variable setting will not leak the rule compilation
5984     UnicodeString ruleStr = u"$foo=[abc]; $foo=[xyz]; $foo;";
5985     UParseError pe {};
5986     UErrorCode ec {U_ZERO_ERROR};
5987 
5988     RuleBasedBreakIterator bi(ruleStr, pe, ec);
5989 }
5990 #endif // #if !UCONFIG_NO_BREAK_ITERATION
5991