xref: /aosp_15_r20/external/cronet/third_party/icu/source/i18n/collationruleparser.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15 
16 #include "unicode/utypes.h"
17 
18 #if !UCONFIG_NO_COLLATION
19 
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "bytesinkutil.h"
28 #include "charstr.h"
29 #include "cmemory.h"
30 #include "collation.h"
31 #include "collationdata.h"
32 #include "collationruleparser.h"
33 #include "collationsettings.h"
34 #include "collationtailoring.h"
35 #include "cstring.h"
36 #include "patternprops.h"
37 #include "uassert.h"
38 #include "ulocimp.h"
39 #include "uvectr32.h"
40 
41 U_NAMESPACE_BEGIN
42 
43 namespace {
44 
45 static const char16_t BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
46 const int32_t BEFORE_LENGTH = 7;
47 
48 }  // namespace
49 
~Sink()50 CollationRuleParser::Sink::~Sink() {}
51 
52 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)53 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
54 
55 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)56 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
57 
~Importer()58 CollationRuleParser::Importer::~Importer() {}
59 
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)60 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
61         : nfd(*Normalizer2::getNFDInstance(errorCode)),
62           nfc(*Normalizer2::getNFCInstance(errorCode)),
63           rules(nullptr), baseData(base), settings(nullptr),
64           parseError(nullptr), errorReason(nullptr),
65           sink(nullptr), importer(nullptr),
66           ruleIndex(0) {
67 }
68 
~CollationRuleParser()69 CollationRuleParser::~CollationRuleParser() {
70 }
71 
72 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)73 CollationRuleParser::parse(const UnicodeString &ruleString,
74                            CollationSettings &outSettings,
75                            UParseError *outParseError,
76                            UErrorCode &errorCode) {
77     if(U_FAILURE(errorCode)) { return; }
78     settings = &outSettings;
79     parseError = outParseError;
80     if(parseError != nullptr) {
81         parseError->line = 0;
82         parseError->offset = -1;
83         parseError->preContext[0] = 0;
84         parseError->postContext[0] = 0;
85     }
86     errorReason = nullptr;
87     parse(ruleString, errorCode);
88 }
89 
90 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)91 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
92     if(U_FAILURE(errorCode)) { return; }
93     rules = &ruleString;
94     ruleIndex = 0;
95 
96     while(ruleIndex < rules->length()) {
97         char16_t c = rules->charAt(ruleIndex);
98         if(PatternProps::isWhiteSpace(c)) {
99             ++ruleIndex;
100             continue;
101         }
102         switch(c) {
103         case 0x26:  // '&'
104             parseRuleChain(errorCode);
105             break;
106         case 0x5b:  // '['
107             parseSetting(errorCode);
108             break;
109         case 0x23:  // '#' starts a comment, until the end of the line
110             ruleIndex = skipComment(ruleIndex + 1);
111             break;
112         case 0x40:  // '@' is equivalent to [backwards 2]
113             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
114                               UCOL_ON, 0, errorCode);
115             ++ruleIndex;
116             break;
117         case 0x21:  // '!' used to turn on Thai/Lao character reversal
118             // Accept but ignore. The root collator has contractions
119             // that are equivalent to the character reversal, where appropriate.
120             ++ruleIndex;
121             break;
122         default:
123             setParseError("expected a reset or setting or comment", errorCode);
124             break;
125         }
126         if(U_FAILURE(errorCode)) { return; }
127     }
128 }
129 
130 void
parseRuleChain(UErrorCode & errorCode)131 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
132     int32_t resetStrength = parseResetAndPosition(errorCode);
133     UBool isFirstRelation = true;
134     for(;;) {
135         int32_t result = parseRelationOperator(errorCode);
136         if(U_FAILURE(errorCode)) { return; }
137         if(result < 0) {
138             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
139                 // '#' starts a comment, until the end of the line
140                 ruleIndex = skipComment(ruleIndex + 1);
141                 continue;
142             }
143             if(isFirstRelation) {
144                 setParseError("reset not followed by a relation", errorCode);
145             }
146             return;
147         }
148         int32_t strength = result & STRENGTH_MASK;
149         if(resetStrength < UCOL_IDENTICAL) {
150             // reset-before rule chain
151             if(isFirstRelation) {
152                 if(strength != resetStrength) {
153                     setParseError("reset-before strength differs from its first relation", errorCode);
154                     return;
155                 }
156             } else {
157                 if(strength < resetStrength) {
158                     setParseError("reset-before strength followed by a stronger relation", errorCode);
159                     return;
160                 }
161             }
162         }
163         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
164         if((result & STARRED_FLAG) == 0) {
165             parseRelationStrings(strength, i, errorCode);
166         } else {
167             parseStarredCharacters(strength, i, errorCode);
168         }
169         if(U_FAILURE(errorCode)) { return; }
170         isFirstRelation = false;
171     }
172 }
173 
174 int32_t
parseResetAndPosition(UErrorCode & errorCode)175 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
176     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
177     int32_t i = skipWhiteSpace(ruleIndex + 1);
178     int32_t j;
179     char16_t c;
180     int32_t resetStrength;
181     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
182             (j = i + BEFORE_LENGTH) < rules->length() &&
183             PatternProps::isWhiteSpace(rules->charAt(j)) &&
184             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
185             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
186             rules->charAt(j + 1) == 0x5d) {
187         // &[before n] with n=1 or 2 or 3
188         resetStrength = UCOL_PRIMARY + (c - 0x31);
189         i = skipWhiteSpace(j + 2);
190     } else {
191         resetStrength = UCOL_IDENTICAL;
192     }
193     if(i >= rules->length()) {
194         setParseError("reset without position", errorCode);
195         return UCOL_DEFAULT;
196     }
197     UnicodeString str;
198     if(rules->charAt(i) == 0x5b) {  // '['
199         i = parseSpecialPosition(i, str, errorCode);
200     } else {
201         i = parseTailoringString(i, str, errorCode);
202     }
203     sink->addReset(resetStrength, str, errorReason, errorCode);
204     if(U_FAILURE(errorCode)) { setErrorContext(); }
205     ruleIndex = i;
206     return resetStrength;
207 }
208 
209 int32_t
parseRelationOperator(UErrorCode & errorCode)210 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
211     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
212     ruleIndex = skipWhiteSpace(ruleIndex);
213     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
214     int32_t strength;
215     int32_t i = ruleIndex;
216     char16_t c = rules->charAt(i++);
217     switch(c) {
218     case 0x3c:  // '<'
219         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
220             ++i;
221             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
222                 ++i;
223                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
224                     ++i;
225                     strength = UCOL_QUATERNARY;
226                 } else {
227                     strength = UCOL_TERTIARY;
228                 }
229             } else {
230                 strength = UCOL_SECONDARY;
231             }
232         } else {
233             strength = UCOL_PRIMARY;
234         }
235         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
236             ++i;
237             strength |= STARRED_FLAG;
238         }
239         break;
240     case 0x3b:  // ';' same as <<
241         strength = UCOL_SECONDARY;
242         break;
243     case 0x2c:  // ',' same as <<<
244         strength = UCOL_TERTIARY;
245         break;
246     case 0x3d:  // '='
247         strength = UCOL_IDENTICAL;
248         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
249             ++i;
250             strength |= STARRED_FLAG;
251         }
252         break;
253     default:
254         return UCOL_DEFAULT;
255     }
256     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
257 }
258 
259 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)260 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
261     // Parse
262     //     prefix | str / extension
263     // where prefix and extension are optional.
264     UnicodeString prefix, str, extension;
265     i = parseTailoringString(i, str, errorCode);
266     if(U_FAILURE(errorCode)) { return; }
267     char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
268     if(next == 0x7c) {  // '|' separates the context prefix from the string.
269         prefix = str;
270         i = parseTailoringString(i + 1, str, errorCode);
271         if(U_FAILURE(errorCode)) { return; }
272         next = (i < rules->length()) ? rules->charAt(i) : 0;
273     }
274     if(next == 0x2f) {  // '/' separates the string from the extension.
275         i = parseTailoringString(i + 1, extension, errorCode);
276     }
277     if(!prefix.isEmpty()) {
278         UChar32 prefix0 = prefix.char32At(0);
279         UChar32 c = str.char32At(0);
280         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
281             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
282                           errorCode);
283             return;
284         }
285     }
286     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
287     if(U_FAILURE(errorCode)) { setErrorContext(); }
288     ruleIndex = i;
289 }
290 
291 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)292 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
293     UnicodeString empty, raw;
294     i = parseString(skipWhiteSpace(i), raw, errorCode);
295     if(U_FAILURE(errorCode)) { return; }
296     if(raw.isEmpty()) {
297         setParseError("missing starred-relation string", errorCode);
298         return;
299     }
300     UChar32 prev = -1;
301     int32_t j = 0;
302     for(;;) {
303         while(j < raw.length()) {
304             UChar32 c = raw.char32At(j);
305             if(!nfd.isInert(c)) {
306                 setParseError("starred-relation string is not all NFD-inert", errorCode);
307                 return;
308             }
309             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
310             if(U_FAILURE(errorCode)) {
311                 setErrorContext();
312                 return;
313             }
314             j += U16_LENGTH(c);
315             prev = c;
316         }
317         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
318             break;
319         }
320         if(prev < 0) {
321             setParseError("range without start in starred-relation string", errorCode);
322             return;
323         }
324         i = parseString(i + 1, raw, errorCode);
325         if(U_FAILURE(errorCode)) { return; }
326         if(raw.isEmpty()) {
327             setParseError("range without end in starred-relation string", errorCode);
328             return;
329         }
330         UChar32 c = raw.char32At(0);
331         if(c < prev) {
332             setParseError("range start greater than end in starred-relation string", errorCode);
333             return;
334         }
335         // range prev-c
336         UnicodeString s;
337         while(++prev <= c) {
338             if(!nfd.isInert(prev)) {
339                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
340                 return;
341             }
342             if(U_IS_SURROGATE(prev)) {
343                 setParseError("starred-relation string range contains a surrogate", errorCode);
344                 return;
345             }
346             if(0xfffd <= prev && prev <= 0xffff) {
347                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
348                 return;
349             }
350             s.setTo(prev);
351             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
352             if(U_FAILURE(errorCode)) {
353                 setErrorContext();
354                 return;
355             }
356         }
357         prev = -1;
358         j = U16_LENGTH(c);
359     }
360     ruleIndex = skipWhiteSpace(i);
361 }
362 
363 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)364 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
365     i = parseString(skipWhiteSpace(i), raw, errorCode);
366     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
367         setParseError("missing relation string", errorCode);
368     }
369     return skipWhiteSpace(i);
370 }
371 
372 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)373 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
374     if(U_FAILURE(errorCode)) { return i; }
375     raw.remove();
376     while(i < rules->length()) {
377         UChar32 c = rules->charAt(i++);
378         if(isSyntaxChar(c)) {
379             if(c == 0x27) {  // apostrophe
380                 if(i < rules->length() && rules->charAt(i) == 0x27) {
381                     // Double apostrophe, encodes a single one.
382                     raw.append((char16_t)0x27);
383                     ++i;
384                     continue;
385                 }
386                 // Quote literal text until the next single apostrophe.
387                 for(;;) {
388                     if(i == rules->length()) {
389                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
390                         return i;
391                     }
392                     c = rules->charAt(i++);
393                     if(c == 0x27) {
394                         if(i < rules->length() && rules->charAt(i) == 0x27) {
395                             // Double apostrophe inside quoted literal text,
396                             // still encodes a single apostrophe.
397                             ++i;
398                         } else {
399                             break;
400                         }
401                     }
402                     raw.append((char16_t)c);
403                 }
404             } else if(c == 0x5c) {  // backslash
405                 if(i == rules->length()) {
406                     setParseError("backslash escape at the end of the rule string", errorCode);
407                     return i;
408                 }
409                 c = rules->char32At(i);
410                 raw.append(c);
411                 i += U16_LENGTH(c);
412             } else {
413                 // Any other syntax character terminates a string.
414                 --i;
415                 break;
416             }
417         } else if(PatternProps::isWhiteSpace(c)) {
418             // Unquoted white space terminates a string.
419             --i;
420             break;
421         } else {
422             raw.append((char16_t)c);
423         }
424     }
425     for(int32_t j = 0; j < raw.length();) {
426         UChar32 c = raw.char32At(j);
427         if(U_IS_SURROGATE(c)) {
428             setParseError("string contains an unpaired surrogate", errorCode);
429             return i;
430         }
431         if(0xfffd <= c && c <= 0xffff) {
432             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
433             return i;
434         }
435         j += U16_LENGTH(c);
436     }
437     return i;
438 }
439 
namespace {

// Names of the special reset positions, as written between '[' and ']'.
// The array index of a name is added to POS_BASE when the position is
// encoded, so the order of the entries matters.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
460 
461 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)462 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
463     if(U_FAILURE(errorCode)) { return 0; }
464     UnicodeString raw;
465     int32_t j = readWords(i + 1, raw);
466     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
467         ++j;
468         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
469             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
470                 str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + pos));
471                 return j;
472             }
473         }
474         if(raw == UNICODE_STRING_SIMPLE("top")) {
475             str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_REGULAR));
476             return j;
477         }
478         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
479             str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_VARIABLE));
480             return j;
481         }
482     }
483     setParseError("not a valid special reset position", errorCode);
484     return i;
485 }
486 
487 void
parseSetting(UErrorCode & errorCode)488 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
489     if(U_FAILURE(errorCode)) { return; }
490     UnicodeString raw;
491     int32_t i = ruleIndex + 1;
492     int32_t j = readWords(i, raw);
493     if(j <= i || raw.isEmpty()) {
494         setParseError("expected a setting/option at '['", errorCode);
495     }
496     if(rules->charAt(j) == 0x5d) {  // words end with ]
497         ++j;
498         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
499                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
500             parseReordering(raw, errorCode);
501             ruleIndex = j;
502             return;
503         }
504         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
505             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
506                               UCOL_ON, 0, errorCode);
507             ruleIndex = j;
508             return;
509         }
510         UnicodeString v;
511         int32_t valueIndex = raw.lastIndexOf((char16_t)0x20);
512         if(valueIndex >= 0) {
513             v.setTo(raw, valueIndex + 1);
514             raw.truncate(valueIndex);
515         }
516         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
517             int32_t value = UCOL_DEFAULT;
518             char16_t c = v.charAt(0);
519             if(0x31 <= c && c <= 0x34) {  // 1..4
520                 value = UCOL_PRIMARY + (c - 0x31);
521             } else if(c == 0x49) {  // 'I'
522                 value = UCOL_IDENTICAL;
523             }
524             if(value != UCOL_DEFAULT) {
525                 settings->setStrength(value, 0, errorCode);
526                 ruleIndex = j;
527                 return;
528             }
529         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
530             UColAttributeValue value = UCOL_DEFAULT;
531             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
532                 value = UCOL_NON_IGNORABLE;
533             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
534                 value = UCOL_SHIFTED;
535             }
536             if(value != UCOL_DEFAULT) {
537                 settings->setAlternateHandling(value, 0, errorCode);
538                 ruleIndex = j;
539                 return;
540             }
541         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
542             int32_t value = UCOL_DEFAULT;
543             if(v == UNICODE_STRING_SIMPLE("space")) {
544                 value = CollationSettings::MAX_VAR_SPACE;
545             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
546                 value = CollationSettings::MAX_VAR_PUNCT;
547             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
548                 value = CollationSettings::MAX_VAR_SYMBOL;
549             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
550                 value = CollationSettings::MAX_VAR_CURRENCY;
551             }
552             if(value != UCOL_DEFAULT) {
553                 settings->setMaxVariable(value, 0, errorCode);
554                 settings->variableTop = baseData->getLastPrimaryForGroup(
555                     UCOL_REORDER_CODE_FIRST + value);
556                 U_ASSERT(settings->variableTop != 0);
557                 ruleIndex = j;
558                 return;
559             }
560         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
561             UColAttributeValue value = UCOL_DEFAULT;
562             if(v == UNICODE_STRING_SIMPLE("off")) {
563                 value = UCOL_OFF;
564             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
565                 value = UCOL_LOWER_FIRST;
566             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
567                 value = UCOL_UPPER_FIRST;
568             }
569             if(value != UCOL_DEFAULT) {
570                 settings->setCaseFirst(value, 0, errorCode);
571                 ruleIndex = j;
572                 return;
573             }
574         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
575             UColAttributeValue value = getOnOffValue(v);
576             if(value != UCOL_DEFAULT) {
577                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
578                 ruleIndex = j;
579                 return;
580             }
581         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
582             UColAttributeValue value = getOnOffValue(v);
583             if(value != UCOL_DEFAULT) {
584                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
585                 ruleIndex = j;
586                 return;
587             }
588         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
589             UColAttributeValue value = getOnOffValue(v);
590             if(value != UCOL_DEFAULT) {
591                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
592                 ruleIndex = j;
593                 return;
594             }
595         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
596             UColAttributeValue value = getOnOffValue(v);
597             if(value != UCOL_DEFAULT) {
598                 if(value == UCOL_ON) {
599                     setParseError("[hiraganaQ on] is not supported", errorCode);
600                 }
601                 ruleIndex = j;
602                 return;
603             }
604         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
605             CharString lang;
606             lang.appendInvariantChars(v, errorCode);
607             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
608             // BCP 47 language tag -> ICU locale ID
609             CharString localeID;
610             int32_t parsedLength;
611             {
612                 CharStringByteSink sink(&localeID);
613                 ulocimp_forLanguageTag(lang.data(), -1, sink, &parsedLength, &errorCode);
614             }
615             if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
616                 errorCode = U_ZERO_ERROR;
617                 setParseError("expected language tag in [import langTag]", errorCode);
618                 return;
619             }
620             // localeID minus all keywords
621             char baseID[ULOC_FULLNAME_CAPACITY];
622             int32_t length = uloc_getBaseName(localeID.data(), baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
623             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
624                 errorCode = U_ZERO_ERROR;
625                 setParseError("expected language tag in [import langTag]", errorCode);
626                 return;
627             }
628             if(length == 0) {
629                 uprv_strcpy(baseID, "root");
630             } else if(*baseID == '_') {
631                 uprv_memmove(baseID + 3, baseID, length + 1);
632                 uprv_memcpy(baseID, "und", 3);
633             }
634             // @collation=type, or length=0 if not specified
635             CharString collationType;
636             {
637                 CharStringByteSink sink(&collationType);
638                 ulocimp_getKeywordValue(localeID.data(), "collation", sink, &errorCode);
639             }
640             if(U_FAILURE(errorCode)) {
641                 errorCode = U_ZERO_ERROR;
642                 setParseError("expected language tag in [import langTag]", errorCode);
643                 return;
644             }
645             if(importer == nullptr) {
646                 setParseError("[import langTag] is not supported", errorCode);
647             } else {
648                 UnicodeString importedRules;
649                 importer->getRules(baseID,
650                                    !collationType.isEmpty() ? collationType.data() : "standard",
651                                    importedRules, errorReason, errorCode);
652                 if(U_FAILURE(errorCode)) {
653                     if(errorReason == nullptr) {
654                         errorReason = "[import langTag] failed";
655                     }
656                     setErrorContext();
657                     return;
658                 }
659                 const UnicodeString *outerRules = rules;
660                 int32_t outerRuleIndex = ruleIndex;
661                 parse(importedRules, errorCode);
662                 if(U_FAILURE(errorCode)) {
663                     if(parseError != nullptr) {
664                         parseError->offset = outerRuleIndex;
665                     }
666                 }
667                 rules = outerRules;
668                 ruleIndex = j;
669             }
670             return;
671         }
672     } else if(rules->charAt(j) == 0x5b) {  // words end with [
673         UnicodeSet set;
674         j = parseUnicodeSet(j, set, errorCode);
675         if(U_FAILURE(errorCode)) { return; }
676         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
677             sink->optimize(set, errorReason, errorCode);
678             if(U_FAILURE(errorCode)) { setErrorContext(); }
679             ruleIndex = j;
680             return;
681         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
682             sink->suppressContractions(set, errorReason, errorCode);
683             if(U_FAILURE(errorCode)) { setErrorContext(); }
684             ruleIndex = j;
685             return;
686         }
687     }
688     setParseError("not a valid setting/option", errorCode);
689 }
690 
691 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)692 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
693     if(U_FAILURE(errorCode)) { return; }
694     int32_t i = 7;  // after "reorder"
695     if(i == raw.length()) {
696         // empty [reorder] with no codes
697         settings->resetReordering();
698         return;
699     }
700     // Parse the codes in [reorder aa bb cc].
701     UVector32 reorderCodes(errorCode);
702     if(U_FAILURE(errorCode)) { return; }
703     CharString word;
704     while(i < raw.length()) {
705         ++i;  // skip the word-separating space
706         int32_t limit = raw.indexOf((char16_t)0x20, i);
707         if(limit < 0) { limit = raw.length(); }
708         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
709         if(U_FAILURE(errorCode)) { return; }
710         int32_t code = getReorderCode(word.data());
711         if(code < 0) {
712             setParseError("unknown script or reorder code", errorCode);
713             return;
714         }
715         reorderCodes.addElement(code, errorCode);
716         if(U_FAILURE(errorCode)) { return; }
717         i = limit;
718     }
719     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
720 }
721 
// Names of the special (non-script) reorder groups. The array index of a name
// is added to UCOL_REORDER_CODE_FIRST to form its reorder code.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
725 
726 int32_t
getReorderCode(const char * word)727 CollationRuleParser::getReorderCode(const char *word) {
728     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
729         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
730             return UCOL_REORDER_CODE_FIRST + i;
731         }
732     }
733     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
734     if(script >= 0) {
735         return script;
736     }
737     if(uprv_stricmp(word, "others") == 0) {
738         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
739     }
740     return -1;
741 }
742 
743 UColAttributeValue
getOnOffValue(const UnicodeString & s)744 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
745     if(s == UNICODE_STRING_SIMPLE("on")) {
746         return UCOL_ON;
747     } else if(s == UNICODE_STRING_SIMPLE("off")) {
748         return UCOL_OFF;
749     } else {
750         return UCOL_DEFAULT;
751     }
752 }
753 
754 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)755 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
756     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
757     int32_t level = 0;
758     int32_t j = i;
759     for(;;) {
760         if(j == rules->length()) {
761             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
762             return j;
763         }
764         char16_t c = rules->charAt(j++);
765         if(c == 0x5b) {  // '['
766             ++level;
767         } else if(c == 0x5d) {  // ']'
768             if(--level == 0) { break; }
769         }
770     }
771     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
772     if(U_FAILURE(errorCode)) {
773         errorCode = U_ZERO_ERROR;
774         setParseError("not a valid UnicodeSet pattern", errorCode);
775         return j;
776     }
777     j = skipWhiteSpace(j);
778     if(j == rules->length() || rules->charAt(j) != 0x5d) {
779         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
780         return j;
781     }
782     return ++j;
783 }
784 
785 int32_t
readWords(int32_t i,UnicodeString & raw) const786 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
787     static const char16_t sp = 0x20;
788     raw.remove();
789     i = skipWhiteSpace(i);
790     for(;;) {
791         if(i >= rules->length()) { return 0; }
792         char16_t c = rules->charAt(i);
793         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
794             if(raw.isEmpty()) { return i; }
795             if(raw.endsWith(&sp, 1)) {  // remove trailing space
796                 raw.truncate(raw.length() - 1);
797             }
798             return i;
799         }
800         if(PatternProps::isWhiteSpace(c)) {
801             raw.append(sp);
802             i = skipWhiteSpace(i + 1);
803         } else {
804             raw.append(c);
805             ++i;
806         }
807     }
808 }
809 
810 int32_t
skipComment(int32_t i) const811 CollationRuleParser::skipComment(int32_t i) const {
812     // skip to past the newline
813     while(i < rules->length()) {
814         char16_t c = rules->charAt(i++);
815         // LF or FF or CR or NEL or LS or PS
816         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
817             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
818             // NLF (new line function) = CR or LF or CR+LF or NEL.
819             // No need to collect all of CR+LF because a following LF will be ignored anyway.
820             break;
821         }
822     }
823     return i;
824 }
825 
826 void
setParseError(const char * reason,UErrorCode & errorCode)827 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
828     if(U_FAILURE(errorCode)) { return; }
829     // Error code consistent with the old parser (from ca. 2001),
830     // rather than U_PARSE_ERROR;
831     errorCode = U_INVALID_FORMAT_ERROR;
832     errorReason = reason;
833     if(parseError != nullptr) { setErrorContext(); }
834 }
835 
836 void
setErrorContext()837 CollationRuleParser::setErrorContext() {
838     if(parseError == nullptr) { return; }
839 
840     // Note: This relies on the calling code maintaining the ruleIndex
841     // at a position that is useful for debugging.
842     // For example, at the beginning of a reset or relation etc.
843     parseError->offset = ruleIndex;
844     parseError->line = 0;  // We are not counting line numbers.
845 
846     // before ruleIndex
847     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
848     if(start < 0) {
849         start = 0;
850     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
851         ++start;
852     }
853     int32_t length = ruleIndex - start;
854     rules->extract(start, length, parseError->preContext);
855     parseError->preContext[length] = 0;
856 
857     // starting from ruleIndex
858     length = rules->length() - ruleIndex;
859     if(length >= U_PARSE_CONTEXT_LEN) {
860         length = U_PARSE_CONTEXT_LEN - 1;
861         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
862             --length;
863         }
864     }
865     rules->extract(ruleIndex, length, parseError->postContext);
866     parseError->postContext[length] = 0;
867 }
868 
869 UBool
isSyntaxChar(UChar32 c)870 CollationRuleParser::isSyntaxChar(UChar32 c) {
871     return 0x21 <= c && c <= 0x7e &&
872             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
873             (0x5b <= c && c <= 0x60) || (0x7b <= c));
874 }
875 
876 int32_t
skipWhiteSpace(int32_t i) const877 CollationRuleParser::skipWhiteSpace(int32_t i) const {
878     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
879         ++i;
880     }
881     return i;
882 }
883 
884 U_NAMESPACE_END
885 
886 #endif  // !UCONFIG_NO_COLLATION
887