1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "bytesinkutil.h"
28 #include "charstr.h"
29 #include "cmemory.h"
30 #include "collation.h"
31 #include "collationdata.h"
32 #include "collationruleparser.h"
33 #include "collationsettings.h"
34 #include "collationtailoring.h"
35 #include "cstring.h"
36 #include "patternprops.h"
37 #include "uassert.h"
38 #include "ulocimp.h"
39 #include "uvectr32.h"
40
41 U_NAMESPACE_BEGIN
42
namespace {

// UTF-16 code units of the reset-before keyword, including its opening
// bracket: "[before". The array is NUL-terminated.
const char16_t BEFORE[] = u"[before";
// Length of BEFORE in code units, not counting the terminating NUL.
const int32_t BEFORE_LENGTH = 7;

}  // namespace
49
~Sink()50 CollationRuleParser::Sink::~Sink() {}
51
52 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)53 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
54
55 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)56 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
57
~Importer()58 CollationRuleParser::Importer::~Importer() {}
59
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)60 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
61 : nfd(*Normalizer2::getNFDInstance(errorCode)),
62 nfc(*Normalizer2::getNFCInstance(errorCode)),
63 rules(nullptr), baseData(base), settings(nullptr),
64 parseError(nullptr), errorReason(nullptr),
65 sink(nullptr), importer(nullptr),
66 ruleIndex(0) {
67 }
68
~CollationRuleParser()69 CollationRuleParser::~CollationRuleParser() {
70 }
71
72 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)73 CollationRuleParser::parse(const UnicodeString &ruleString,
74 CollationSettings &outSettings,
75 UParseError *outParseError,
76 UErrorCode &errorCode) {
77 if(U_FAILURE(errorCode)) { return; }
78 settings = &outSettings;
79 parseError = outParseError;
80 if(parseError != nullptr) {
81 parseError->line = 0;
82 parseError->offset = -1;
83 parseError->preContext[0] = 0;
84 parseError->postContext[0] = 0;
85 }
86 errorReason = nullptr;
87 parse(ruleString, errorCode);
88 }
89
90 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)91 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
92 if(U_FAILURE(errorCode)) { return; }
93 rules = &ruleString;
94 ruleIndex = 0;
95
96 while(ruleIndex < rules->length()) {
97 char16_t c = rules->charAt(ruleIndex);
98 if(PatternProps::isWhiteSpace(c)) {
99 ++ruleIndex;
100 continue;
101 }
102 switch(c) {
103 case 0x26: // '&'
104 parseRuleChain(errorCode);
105 break;
106 case 0x5b: // '['
107 parseSetting(errorCode);
108 break;
109 case 0x23: // '#' starts a comment, until the end of the line
110 ruleIndex = skipComment(ruleIndex + 1);
111 break;
112 case 0x40: // '@' is equivalent to [backwards 2]
113 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
114 UCOL_ON, 0, errorCode);
115 ++ruleIndex;
116 break;
117 case 0x21: // '!' used to turn on Thai/Lao character reversal
118 // Accept but ignore. The root collator has contractions
119 // that are equivalent to the character reversal, where appropriate.
120 ++ruleIndex;
121 break;
122 default:
123 setParseError("expected a reset or setting or comment", errorCode);
124 break;
125 }
126 if(U_FAILURE(errorCode)) { return; }
127 }
128 }
129
130 void
parseRuleChain(UErrorCode & errorCode)131 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
132 int32_t resetStrength = parseResetAndPosition(errorCode);
133 UBool isFirstRelation = true;
134 for(;;) {
135 int32_t result = parseRelationOperator(errorCode);
136 if(U_FAILURE(errorCode)) { return; }
137 if(result < 0) {
138 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
139 // '#' starts a comment, until the end of the line
140 ruleIndex = skipComment(ruleIndex + 1);
141 continue;
142 }
143 if(isFirstRelation) {
144 setParseError("reset not followed by a relation", errorCode);
145 }
146 return;
147 }
148 int32_t strength = result & STRENGTH_MASK;
149 if(resetStrength < UCOL_IDENTICAL) {
150 // reset-before rule chain
151 if(isFirstRelation) {
152 if(strength != resetStrength) {
153 setParseError("reset-before strength differs from its first relation", errorCode);
154 return;
155 }
156 } else {
157 if(strength < resetStrength) {
158 setParseError("reset-before strength followed by a stronger relation", errorCode);
159 return;
160 }
161 }
162 }
163 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
164 if((result & STARRED_FLAG) == 0) {
165 parseRelationStrings(strength, i, errorCode);
166 } else {
167 parseStarredCharacters(strength, i, errorCode);
168 }
169 if(U_FAILURE(errorCode)) { return; }
170 isFirstRelation = false;
171 }
172 }
173
174 int32_t
parseResetAndPosition(UErrorCode & errorCode)175 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
176 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
177 int32_t i = skipWhiteSpace(ruleIndex + 1);
178 int32_t j;
179 char16_t c;
180 int32_t resetStrength;
181 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
182 (j = i + BEFORE_LENGTH) < rules->length() &&
183 PatternProps::isWhiteSpace(rules->charAt(j)) &&
184 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
185 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
186 rules->charAt(j + 1) == 0x5d) {
187 // &[before n] with n=1 or 2 or 3
188 resetStrength = UCOL_PRIMARY + (c - 0x31);
189 i = skipWhiteSpace(j + 2);
190 } else {
191 resetStrength = UCOL_IDENTICAL;
192 }
193 if(i >= rules->length()) {
194 setParseError("reset without position", errorCode);
195 return UCOL_DEFAULT;
196 }
197 UnicodeString str;
198 if(rules->charAt(i) == 0x5b) { // '['
199 i = parseSpecialPosition(i, str, errorCode);
200 } else {
201 i = parseTailoringString(i, str, errorCode);
202 }
203 sink->addReset(resetStrength, str, errorReason, errorCode);
204 if(U_FAILURE(errorCode)) { setErrorContext(); }
205 ruleIndex = i;
206 return resetStrength;
207 }
208
209 int32_t
parseRelationOperator(UErrorCode & errorCode)210 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
211 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
212 ruleIndex = skipWhiteSpace(ruleIndex);
213 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
214 int32_t strength;
215 int32_t i = ruleIndex;
216 char16_t c = rules->charAt(i++);
217 switch(c) {
218 case 0x3c: // '<'
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
220 ++i;
221 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
222 ++i;
223 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
224 ++i;
225 strength = UCOL_QUATERNARY;
226 } else {
227 strength = UCOL_TERTIARY;
228 }
229 } else {
230 strength = UCOL_SECONDARY;
231 }
232 } else {
233 strength = UCOL_PRIMARY;
234 }
235 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
236 ++i;
237 strength |= STARRED_FLAG;
238 }
239 break;
240 case 0x3b: // ';' same as <<
241 strength = UCOL_SECONDARY;
242 break;
243 case 0x2c: // ',' same as <<<
244 strength = UCOL_TERTIARY;
245 break;
246 case 0x3d: // '='
247 strength = UCOL_IDENTICAL;
248 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
249 ++i;
250 strength |= STARRED_FLAG;
251 }
252 break;
253 default:
254 return UCOL_DEFAULT;
255 }
256 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
257 }
258
259 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)260 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
261 // Parse
262 // prefix | str / extension
263 // where prefix and extension are optional.
264 UnicodeString prefix, str, extension;
265 i = parseTailoringString(i, str, errorCode);
266 if(U_FAILURE(errorCode)) { return; }
267 char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
268 if(next == 0x7c) { // '|' separates the context prefix from the string.
269 prefix = str;
270 i = parseTailoringString(i + 1, str, errorCode);
271 if(U_FAILURE(errorCode)) { return; }
272 next = (i < rules->length()) ? rules->charAt(i) : 0;
273 }
274 if(next == 0x2f) { // '/' separates the string from the extension.
275 i = parseTailoringString(i + 1, extension, errorCode);
276 }
277 if(!prefix.isEmpty()) {
278 UChar32 prefix0 = prefix.char32At(0);
279 UChar32 c = str.char32At(0);
280 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
281 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
282 errorCode);
283 return;
284 }
285 }
286 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
287 if(U_FAILURE(errorCode)) { setErrorContext(); }
288 ruleIndex = i;
289 }
290
291 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)292 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
293 UnicodeString empty, raw;
294 i = parseString(skipWhiteSpace(i), raw, errorCode);
295 if(U_FAILURE(errorCode)) { return; }
296 if(raw.isEmpty()) {
297 setParseError("missing starred-relation string", errorCode);
298 return;
299 }
300 UChar32 prev = -1;
301 int32_t j = 0;
302 for(;;) {
303 while(j < raw.length()) {
304 UChar32 c = raw.char32At(j);
305 if(!nfd.isInert(c)) {
306 setParseError("starred-relation string is not all NFD-inert", errorCode);
307 return;
308 }
309 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
310 if(U_FAILURE(errorCode)) {
311 setErrorContext();
312 return;
313 }
314 j += U16_LENGTH(c);
315 prev = c;
316 }
317 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
318 break;
319 }
320 if(prev < 0) {
321 setParseError("range without start in starred-relation string", errorCode);
322 return;
323 }
324 i = parseString(i + 1, raw, errorCode);
325 if(U_FAILURE(errorCode)) { return; }
326 if(raw.isEmpty()) {
327 setParseError("range without end in starred-relation string", errorCode);
328 return;
329 }
330 UChar32 c = raw.char32At(0);
331 if(c < prev) {
332 setParseError("range start greater than end in starred-relation string", errorCode);
333 return;
334 }
335 // range prev-c
336 UnicodeString s;
337 while(++prev <= c) {
338 if(!nfd.isInert(prev)) {
339 setParseError("starred-relation string range is not all NFD-inert", errorCode);
340 return;
341 }
342 if(U_IS_SURROGATE(prev)) {
343 setParseError("starred-relation string range contains a surrogate", errorCode);
344 return;
345 }
346 if(0xfffd <= prev && prev <= 0xffff) {
347 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
348 return;
349 }
350 s.setTo(prev);
351 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
352 if(U_FAILURE(errorCode)) {
353 setErrorContext();
354 return;
355 }
356 }
357 prev = -1;
358 j = U16_LENGTH(c);
359 }
360 ruleIndex = skipWhiteSpace(i);
361 }
362
363 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)364 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
365 i = parseString(skipWhiteSpace(i), raw, errorCode);
366 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
367 setParseError("missing relation string", errorCode);
368 }
369 return skipWhiteSpace(i);
370 }
371
372 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)373 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
374 if(U_FAILURE(errorCode)) { return i; }
375 raw.remove();
376 while(i < rules->length()) {
377 UChar32 c = rules->charAt(i++);
378 if(isSyntaxChar(c)) {
379 if(c == 0x27) { // apostrophe
380 if(i < rules->length() && rules->charAt(i) == 0x27) {
381 // Double apostrophe, encodes a single one.
382 raw.append((char16_t)0x27);
383 ++i;
384 continue;
385 }
386 // Quote literal text until the next single apostrophe.
387 for(;;) {
388 if(i == rules->length()) {
389 setParseError("quoted literal text missing terminating apostrophe", errorCode);
390 return i;
391 }
392 c = rules->charAt(i++);
393 if(c == 0x27) {
394 if(i < rules->length() && rules->charAt(i) == 0x27) {
395 // Double apostrophe inside quoted literal text,
396 // still encodes a single apostrophe.
397 ++i;
398 } else {
399 break;
400 }
401 }
402 raw.append((char16_t)c);
403 }
404 } else if(c == 0x5c) { // backslash
405 if(i == rules->length()) {
406 setParseError("backslash escape at the end of the rule string", errorCode);
407 return i;
408 }
409 c = rules->char32At(i);
410 raw.append(c);
411 i += U16_LENGTH(c);
412 } else {
413 // Any other syntax character terminates a string.
414 --i;
415 break;
416 }
417 } else if(PatternProps::isWhiteSpace(c)) {
418 // Unquoted white space terminates a string.
419 --i;
420 break;
421 } else {
422 raw.append((char16_t)c);
423 }
424 }
425 for(int32_t j = 0; j < raw.length();) {
426 UChar32 c = raw.char32At(j);
427 if(U_IS_SURROGATE(c)) {
428 setParseError("string contains an unpaired surrogate", errorCode);
429 return i;
430 }
431 if(0xfffd <= c && c <= 0xffff) {
432 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
433 return i;
434 }
435 j += U16_LENGTH(c);
436 }
437 return i;
438 }
439
namespace {

// Names of the special reset positions, as written between brackets in rules.
// The order matters: the array index is added to POS_BASE by
// parseSpecialPosition() to encode the position.
const char *const positions[] = {
    "first tertiary ignorable",   // 0
    "last tertiary ignorable",    // 1
    "first secondary ignorable",  // 2
    "last secondary ignorable",   // 3
    "first primary ignorable",    // 4
    "last primary ignorable",     // 5
    "first variable",             // 6
    "last variable",              // 7
    "first regular",              // 8
    "last regular",               // 9
    "first implicit",             // 10
    "last implicit",              // 11
    "first trailing",             // 12
    "last trailing"               // 13
};

}  // namespace
460
461 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)462 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
463 if(U_FAILURE(errorCode)) { return 0; }
464 UnicodeString raw;
465 int32_t j = readWords(i + 1, raw);
466 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
467 ++j;
468 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
469 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
470 str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + pos));
471 return j;
472 }
473 }
474 if(raw == UNICODE_STRING_SIMPLE("top")) {
475 str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_REGULAR));
476 return j;
477 }
478 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
479 str.setTo((char16_t)POS_LEAD).append((char16_t)(POS_BASE + LAST_VARIABLE));
480 return j;
481 }
482 }
483 setParseError("not a valid special reset position", errorCode);
484 return i;
485 }
486
487 void
parseSetting(UErrorCode & errorCode)488 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
489 if(U_FAILURE(errorCode)) { return; }
490 UnicodeString raw;
491 int32_t i = ruleIndex + 1;
492 int32_t j = readWords(i, raw);
493 if(j <= i || raw.isEmpty()) {
494 setParseError("expected a setting/option at '['", errorCode);
495 }
496 if(rules->charAt(j) == 0x5d) { // words end with ]
497 ++j;
498 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
499 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
500 parseReordering(raw, errorCode);
501 ruleIndex = j;
502 return;
503 }
504 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
505 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
506 UCOL_ON, 0, errorCode);
507 ruleIndex = j;
508 return;
509 }
510 UnicodeString v;
511 int32_t valueIndex = raw.lastIndexOf((char16_t)0x20);
512 if(valueIndex >= 0) {
513 v.setTo(raw, valueIndex + 1);
514 raw.truncate(valueIndex);
515 }
516 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
517 int32_t value = UCOL_DEFAULT;
518 char16_t c = v.charAt(0);
519 if(0x31 <= c && c <= 0x34) { // 1..4
520 value = UCOL_PRIMARY + (c - 0x31);
521 } else if(c == 0x49) { // 'I'
522 value = UCOL_IDENTICAL;
523 }
524 if(value != UCOL_DEFAULT) {
525 settings->setStrength(value, 0, errorCode);
526 ruleIndex = j;
527 return;
528 }
529 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
530 UColAttributeValue value = UCOL_DEFAULT;
531 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
532 value = UCOL_NON_IGNORABLE;
533 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
534 value = UCOL_SHIFTED;
535 }
536 if(value != UCOL_DEFAULT) {
537 settings->setAlternateHandling(value, 0, errorCode);
538 ruleIndex = j;
539 return;
540 }
541 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
542 int32_t value = UCOL_DEFAULT;
543 if(v == UNICODE_STRING_SIMPLE("space")) {
544 value = CollationSettings::MAX_VAR_SPACE;
545 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
546 value = CollationSettings::MAX_VAR_PUNCT;
547 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
548 value = CollationSettings::MAX_VAR_SYMBOL;
549 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
550 value = CollationSettings::MAX_VAR_CURRENCY;
551 }
552 if(value != UCOL_DEFAULT) {
553 settings->setMaxVariable(value, 0, errorCode);
554 settings->variableTop = baseData->getLastPrimaryForGroup(
555 UCOL_REORDER_CODE_FIRST + value);
556 U_ASSERT(settings->variableTop != 0);
557 ruleIndex = j;
558 return;
559 }
560 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
561 UColAttributeValue value = UCOL_DEFAULT;
562 if(v == UNICODE_STRING_SIMPLE("off")) {
563 value = UCOL_OFF;
564 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
565 value = UCOL_LOWER_FIRST;
566 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
567 value = UCOL_UPPER_FIRST;
568 }
569 if(value != UCOL_DEFAULT) {
570 settings->setCaseFirst(value, 0, errorCode);
571 ruleIndex = j;
572 return;
573 }
574 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
575 UColAttributeValue value = getOnOffValue(v);
576 if(value != UCOL_DEFAULT) {
577 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
578 ruleIndex = j;
579 return;
580 }
581 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
582 UColAttributeValue value = getOnOffValue(v);
583 if(value != UCOL_DEFAULT) {
584 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
585 ruleIndex = j;
586 return;
587 }
588 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
589 UColAttributeValue value = getOnOffValue(v);
590 if(value != UCOL_DEFAULT) {
591 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
592 ruleIndex = j;
593 return;
594 }
595 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
596 UColAttributeValue value = getOnOffValue(v);
597 if(value != UCOL_DEFAULT) {
598 if(value == UCOL_ON) {
599 setParseError("[hiraganaQ on] is not supported", errorCode);
600 }
601 ruleIndex = j;
602 return;
603 }
604 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
605 CharString lang;
606 lang.appendInvariantChars(v, errorCode);
607 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
608 // BCP 47 language tag -> ICU locale ID
609 CharString localeID;
610 int32_t parsedLength;
611 {
612 CharStringByteSink sink(&localeID);
613 ulocimp_forLanguageTag(lang.data(), -1, sink, &parsedLength, &errorCode);
614 }
615 if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
616 errorCode = U_ZERO_ERROR;
617 setParseError("expected language tag in [import langTag]", errorCode);
618 return;
619 }
620 // localeID minus all keywords
621 char baseID[ULOC_FULLNAME_CAPACITY];
622 int32_t length = uloc_getBaseName(localeID.data(), baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
623 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
624 errorCode = U_ZERO_ERROR;
625 setParseError("expected language tag in [import langTag]", errorCode);
626 return;
627 }
628 if(length == 0) {
629 uprv_strcpy(baseID, "root");
630 } else if(*baseID == '_') {
631 uprv_memmove(baseID + 3, baseID, length + 1);
632 uprv_memcpy(baseID, "und", 3);
633 }
634 // @collation=type, or length=0 if not specified
635 CharString collationType;
636 {
637 CharStringByteSink sink(&collationType);
638 ulocimp_getKeywordValue(localeID.data(), "collation", sink, &errorCode);
639 }
640 if(U_FAILURE(errorCode)) {
641 errorCode = U_ZERO_ERROR;
642 setParseError("expected language tag in [import langTag]", errorCode);
643 return;
644 }
645 if(importer == nullptr) {
646 setParseError("[import langTag] is not supported", errorCode);
647 } else {
648 UnicodeString importedRules;
649 importer->getRules(baseID,
650 !collationType.isEmpty() ? collationType.data() : "standard",
651 importedRules, errorReason, errorCode);
652 if(U_FAILURE(errorCode)) {
653 if(errorReason == nullptr) {
654 errorReason = "[import langTag] failed";
655 }
656 setErrorContext();
657 return;
658 }
659 const UnicodeString *outerRules = rules;
660 int32_t outerRuleIndex = ruleIndex;
661 parse(importedRules, errorCode);
662 if(U_FAILURE(errorCode)) {
663 if(parseError != nullptr) {
664 parseError->offset = outerRuleIndex;
665 }
666 }
667 rules = outerRules;
668 ruleIndex = j;
669 }
670 return;
671 }
672 } else if(rules->charAt(j) == 0x5b) { // words end with [
673 UnicodeSet set;
674 j = parseUnicodeSet(j, set, errorCode);
675 if(U_FAILURE(errorCode)) { return; }
676 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
677 sink->optimize(set, errorReason, errorCode);
678 if(U_FAILURE(errorCode)) { setErrorContext(); }
679 ruleIndex = j;
680 return;
681 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
682 sink->suppressContractions(set, errorReason, errorCode);
683 if(U_FAILURE(errorCode)) { setErrorContext(); }
684 ruleIndex = j;
685 return;
686 }
687 }
688 setParseError("not a valid setting/option", errorCode);
689 }
690
691 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)692 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
693 if(U_FAILURE(errorCode)) { return; }
694 int32_t i = 7; // after "reorder"
695 if(i == raw.length()) {
696 // empty [reorder] with no codes
697 settings->resetReordering();
698 return;
699 }
700 // Parse the codes in [reorder aa bb cc].
701 UVector32 reorderCodes(errorCode);
702 if(U_FAILURE(errorCode)) { return; }
703 CharString word;
704 while(i < raw.length()) {
705 ++i; // skip the word-separating space
706 int32_t limit = raw.indexOf((char16_t)0x20, i);
707 if(limit < 0) { limit = raw.length(); }
708 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
709 if(U_FAILURE(errorCode)) { return; }
710 int32_t code = getReorderCode(word.data());
711 if(code < 0) {
712 setParseError("unknown script or reorder code", errorCode);
713 return;
714 }
715 reorderCodes.addElement(code, errorCode);
716 if(U_FAILURE(errorCode)) { return; }
717 i = limit;
718 }
719 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
720 }
721
// Names of the special reorder groups. getReorderCode() maps the name at
// index i to UCOL_REORDER_CODE_FIRST + i, so the order is significant.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
725
726 int32_t
getReorderCode(const char * word)727 CollationRuleParser::getReorderCode(const char *word) {
728 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
729 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
730 return UCOL_REORDER_CODE_FIRST + i;
731 }
732 }
733 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
734 if(script >= 0) {
735 return script;
736 }
737 if(uprv_stricmp(word, "others") == 0) {
738 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
739 }
740 return -1;
741 }
742
743 UColAttributeValue
getOnOffValue(const UnicodeString & s)744 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
745 if(s == UNICODE_STRING_SIMPLE("on")) {
746 return UCOL_ON;
747 } else if(s == UNICODE_STRING_SIMPLE("off")) {
748 return UCOL_OFF;
749 } else {
750 return UCOL_DEFAULT;
751 }
752 }
753
754 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)755 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
756 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
757 int32_t level = 0;
758 int32_t j = i;
759 for(;;) {
760 if(j == rules->length()) {
761 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
762 return j;
763 }
764 char16_t c = rules->charAt(j++);
765 if(c == 0x5b) { // '['
766 ++level;
767 } else if(c == 0x5d) { // ']'
768 if(--level == 0) { break; }
769 }
770 }
771 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
772 if(U_FAILURE(errorCode)) {
773 errorCode = U_ZERO_ERROR;
774 setParseError("not a valid UnicodeSet pattern", errorCode);
775 return j;
776 }
777 j = skipWhiteSpace(j);
778 if(j == rules->length() || rules->charAt(j) != 0x5d) {
779 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
780 return j;
781 }
782 return ++j;
783 }
784
785 int32_t
readWords(int32_t i,UnicodeString & raw) const786 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
787 static const char16_t sp = 0x20;
788 raw.remove();
789 i = skipWhiteSpace(i);
790 for(;;) {
791 if(i >= rules->length()) { return 0; }
792 char16_t c = rules->charAt(i);
793 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
794 if(raw.isEmpty()) { return i; }
795 if(raw.endsWith(&sp, 1)) { // remove trailing space
796 raw.truncate(raw.length() - 1);
797 }
798 return i;
799 }
800 if(PatternProps::isWhiteSpace(c)) {
801 raw.append(sp);
802 i = skipWhiteSpace(i + 1);
803 } else {
804 raw.append(c);
805 ++i;
806 }
807 }
808 }
809
810 int32_t
skipComment(int32_t i) const811 CollationRuleParser::skipComment(int32_t i) const {
812 // skip to past the newline
813 while(i < rules->length()) {
814 char16_t c = rules->charAt(i++);
815 // LF or FF or CR or NEL or LS or PS
816 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
817 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
818 // NLF (new line function) = CR or LF or CR+LF or NEL.
819 // No need to collect all of CR+LF because a following LF will be ignored anyway.
820 break;
821 }
822 }
823 return i;
824 }
825
826 void
setParseError(const char * reason,UErrorCode & errorCode)827 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
828 if(U_FAILURE(errorCode)) { return; }
829 // Error code consistent with the old parser (from ca. 2001),
830 // rather than U_PARSE_ERROR;
831 errorCode = U_INVALID_FORMAT_ERROR;
832 errorReason = reason;
833 if(parseError != nullptr) { setErrorContext(); }
834 }
835
836 void
setErrorContext()837 CollationRuleParser::setErrorContext() {
838 if(parseError == nullptr) { return; }
839
840 // Note: This relies on the calling code maintaining the ruleIndex
841 // at a position that is useful for debugging.
842 // For example, at the beginning of a reset or relation etc.
843 parseError->offset = ruleIndex;
844 parseError->line = 0; // We are not counting line numbers.
845
846 // before ruleIndex
847 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
848 if(start < 0) {
849 start = 0;
850 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
851 ++start;
852 }
853 int32_t length = ruleIndex - start;
854 rules->extract(start, length, parseError->preContext);
855 parseError->preContext[length] = 0;
856
857 // starting from ruleIndex
858 length = rules->length() - ruleIndex;
859 if(length >= U_PARSE_CONTEXT_LEN) {
860 length = U_PARSE_CONTEXT_LEN - 1;
861 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
862 --length;
863 }
864 }
865 rules->extract(ruleIndex, length, parseError->postContext);
866 parseError->postContext[length] = 0;
867 }
868
869 UBool
isSyntaxChar(UChar32 c)870 CollationRuleParser::isSyntaxChar(UChar32 c) {
871 return 0x21 <= c && c <= 0x7e &&
872 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
873 (0x5b <= c && c <= 0x60) || (0x7b <= c));
874 }
875
876 int32_t
skipWhiteSpace(int32_t i) const877 CollationRuleParser::skipWhiteSpace(int32_t i) const {
878 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
879 ++i;
880 }
881 return i;
882 }
883
884 U_NAMESPACE_END
885
886 #endif // !UCONFIG_NO_COLLATION
887