xref: /aosp_15_r20/external/cronet/third_party/icu/source/i18n/rulebasedcollator.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * rulebasedcollator.cpp
9 *
10 * (replaced the former tblcoll.cpp)
11 *
12 * created on: 2012feb14 with new and old collation code
13 * created by: Markus W. Scherer
14 */
15 
16 #include "unicode/utypes.h"
17 
18 #if !UCONFIG_NO_COLLATION
19 
20 #include "unicode/coll.h"
21 #include "unicode/coleitr.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/locid.h"
24 #include "unicode/sortkey.h"
25 #include "unicode/tblcoll.h"
26 #include "unicode/ucol.h"
27 #include "unicode/uiter.h"
28 #include "unicode/uloc.h"
29 #include "unicode/uniset.h"
30 #include "unicode/unistr.h"
31 #include "unicode/usetiter.h"
32 #include "unicode/utf8.h"
33 #include "unicode/uversion.h"
34 #include "bocsu.h"
35 #include "bytesinkutil.h"
36 #include "charstr.h"
37 #include "cmemory.h"
38 #include "collation.h"
39 #include "collationcompare.h"
40 #include "collationdata.h"
41 #include "collationdatareader.h"
42 #include "collationfastlatin.h"
43 #include "collationiterator.h"
44 #include "collationkeys.h"
45 #include "collationroot.h"
46 #include "collationsets.h"
47 #include "collationsettings.h"
48 #include "collationtailoring.h"
49 #include "cstring.h"
50 #include "uassert.h"
51 #include "ucol_imp.h"
52 #include "uhash.h"
53 #include "uitercollationiterator.h"
54 #include "ulocimp.h"
55 #include "ustr_imp.h"
56 #include "utf16collationiterator.h"
57 #include "utf8collationiterator.h"
58 #include "uvectr64.h"
59 
60 U_NAMESPACE_BEGIN
61 
62 namespace {
63 
64 class FixedSortKeyByteSink : public SortKeyByteSink {
65 public:
FixedSortKeyByteSink(char * dest,int32_t destCapacity)66     FixedSortKeyByteSink(char *dest, int32_t destCapacity)
67             : SortKeyByteSink(dest, destCapacity) {}
68     virtual ~FixedSortKeyByteSink();
69 
70 private:
71     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
72     virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
73 };
74 
~FixedSortKeyByteSink()75 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
76 
77 void
AppendBeyondCapacity(const char * bytes,int32_t,int32_t length)78 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
79     // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
80     // Fill the buffer completely.
81     int32_t available = capacity_ - length;
82     if (available > 0) {
83         uprv_memcpy(buffer_ + length, bytes, available);
84     }
85 }
86 
87 UBool
Resize(int32_t,int32_t)88 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
89     return false;
90 }
91 
92 }  // namespace
93 
94 // Not in an anonymous namespace, so that it can be a friend of CollationKey.
95 class CollationKeyByteSink : public SortKeyByteSink {
96 public:
CollationKeyByteSink(CollationKey & key)97     CollationKeyByteSink(CollationKey &key)
98             : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
99               key_(key) {}
100     virtual ~CollationKeyByteSink();
101 
102 private:
103     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
104     virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
105 
106     CollationKey &key_;
107 };
108 
~CollationKeyByteSink()109 CollationKeyByteSink::~CollationKeyByteSink() {}
110 
111 void
AppendBeyondCapacity(const char * bytes,int32_t n,int32_t length)112 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
113     // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
114     if (Resize(n, length)) {
115         uprv_memcpy(buffer_ + length, bytes, n);
116     }
117 }
118 
119 UBool
Resize(int32_t appendCapacity,int32_t length)120 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
121     if (buffer_ == nullptr) {
122         return false;  // allocation failed before already
123     }
124     int32_t newCapacity = 2 * capacity_;
125     int32_t altCapacity = length + 2 * appendCapacity;
126     if (newCapacity < altCapacity) {
127         newCapacity = altCapacity;
128     }
129     if (newCapacity < 200) {
130         newCapacity = 200;
131     }
132     uint8_t *newBuffer = key_.reallocate(newCapacity, length);
133     if (newBuffer == nullptr) {
134         SetNotOk();
135         return false;
136     }
137     buffer_ = reinterpret_cast<char *>(newBuffer);
138     capacity_ = newCapacity;
139     return true;
140 }
141 
// Copy constructor: points at the same data, settings, tailoring and cache entry
// as the other collator, and adds a reference to the settings and the cache entry.
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
        : Collator(other),
          data(other.data),
          settings(other.settings),
          tailoring(other.tailoring),
          cacheEntry(other.cacheEntry),
          validLocale(other.validLocale),
          explicitlySetAttributes(other.explicitlySetAttributes),
          actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid)
{
    // Both objects are shared with other from now on.
    settings->addRef();
    cacheEntry->addRef();
}
154 
// Builds a collator from binary collation data.
// bin/length: the binary image; must be non-null and non-empty.
// base: must be non-null and must use the root tailoring, otherwise
//       errorCode is set to U_UNSUPPORTED_ERROR.
// On any failure the members keep their empty initial values and
// errorCode reports the reason.
RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
                                     const RuleBasedCollator *base, UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const UBool badArguments = bin == nullptr || length == 0 || base == nullptr;
    if(badArguments) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const CollationTailoring *rootTailoring = CollationRoot::getRoot(errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(base->tailoring != rootTailoring) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    // The new tailoring starts from the base settings; the reader fills in the rest.
    LocalPointer<CollationTailoring> newTailoring(
            new CollationTailoring(base->tailoring->settings));
    if(newTailoring.isNull() || newTailoring->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    CollationDataReader::read(base->tailoring, bin, length, *newTailoring, errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    newTailoring->actualLocale.setToBogus();
    adoptTailoring(newTailoring.orphan(), errorCode);
}
185 
RuleBasedCollator(const CollationCacheEntry * entry)186 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
187         : data(entry->tailoring->data),
188           settings(entry->tailoring->settings),
189           tailoring(entry->tailoring),
190           cacheEntry(entry),
191           validLocale(entry->validLocale),
192           explicitlySetAttributes(0),
193           actualLocaleIsSameAsValid(false) {
194     settings->addRef();
195     cacheEntry->addRef();
196 }
197 
~RuleBasedCollator()198 RuleBasedCollator::~RuleBasedCollator() {
199     SharedObject::clearPtr(settings);
200     SharedObject::clearPtr(cacheEntry);
201 }
202 
203 void
adoptTailoring(CollationTailoring * t,UErrorCode & errorCode)204 RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
205     if(U_FAILURE(errorCode)) {
206         t->deleteIfZeroRefCount();
207         return;
208     }
209     U_ASSERT(settings == nullptr && data == nullptr && tailoring == nullptr && cacheEntry == nullptr);
210     cacheEntry = new CollationCacheEntry(t->actualLocale, t);
211     if(cacheEntry == nullptr) {
212         errorCode = U_MEMORY_ALLOCATION_ERROR;
213         t->deleteIfZeroRefCount();
214         return;
215     }
216     data = t->data;
217     settings = t->settings;
218     settings->addRef();
219     tailoring = t;
220     cacheEntry->addRef();
221     validLocale = t->actualLocale;
222     actualLocaleIsSameAsValid = false;
223 }
224 
225 RuleBasedCollator *
clone() const226 RuleBasedCollator::clone() const {
227     return new RuleBasedCollator(*this);
228 }
229 
// Copy assignment: takes over the other collator's settings, tailoring,
// cache entry, data, valid locale and attribute bookkeeping.
// Self-assignment is a no-op.
RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
    if(this != &other) {
        SharedObject::copyPtr(other.settings, settings);
        tailoring = other.tailoring;
        SharedObject::copyPtr(other.cacheEntry, cacheEntry);
        data = tailoring->data;
        validLocale = other.validLocale;
        explicitlySetAttributes = other.explicitlySetAttributes;
        actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
    }
    return *this;
}
241 
// ICU RTTI boilerplate for RuleBasedCollator.
// NOTE(review): presumably expands to the class-ID accessor definitions; confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
243 
244 bool
245 RuleBasedCollator::operator==(const Collator& other) const {
246     if(this == &other) { return true; }
247     if(!Collator::operator==(other)) { return false; }
248     const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
249     if(*settings != *o.settings) { return false; }
250     if(data == o.data) { return true; }
251     UBool thisIsRoot = data->base == nullptr;
252     UBool otherIsRoot = o.data->base == nullptr;
253     U_ASSERT(!thisIsRoot || !otherIsRoot);  // otherwise their data pointers should be ==
254     if(thisIsRoot != otherIsRoot) { return false; }
255     if((thisIsRoot || !tailoring->rules.isEmpty()) &&
256             (otherIsRoot || !o.tailoring->rules.isEmpty())) {
257         // Shortcut: If both collators have valid rule strings, then compare those.
258         if(tailoring->rules == o.tailoring->rules) { return true; }
259     }
260     // Different rule strings can result in the same or equivalent tailoring.
261     // The rule strings are optional in ICU resource bundles, although included by default.
262     // cloneBinary() drops the rule string.
263     UErrorCode errorCode = U_ZERO_ERROR;
264     LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
265     LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
266     if(U_FAILURE(errorCode)) { return false; }
267     if(*thisTailored != *otherTailored) { return false; }
268     // For completeness, we should compare all of the mappings;
269     // or we should create a list of strings, sort it with one collator,
270     // and check if both collators compare adjacent strings the same
271     // (order & strength, down to quaternary); or similar.
272     // Testing equality of collators seems unusual.
273     return true;
274 }
275 
276 int32_t
hashCode() const277 RuleBasedCollator::hashCode() const {
278     int32_t h = settings->hashCode();
279     if(data->base == nullptr) { return h; }  // root collator
280     // Do not rely on the rule string, see comments in operator==().
281     UErrorCode errorCode = U_ZERO_ERROR;
282     LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
283     if(U_FAILURE(errorCode)) { return 0; }
284     UnicodeSetIterator iter(*set);
285     while(iter.next() && !iter.isString()) {
286         h ^= data->getCE32(iter.getCodepoint());
287     }
288     return h;
289 }
290 
291 void
setLocales(const Locale & requested,const Locale & valid,const Locale & actual)292 RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
293                               const Locale &actual) {
294     if(actual == tailoring->actualLocale) {
295         actualLocaleIsSameAsValid = false;
296     } else {
297         U_ASSERT(actual == valid);
298         actualLocaleIsSameAsValid = true;
299     }
300     // Do not modify tailoring.actualLocale:
301     // We cannot be sure that that would be thread-safe.
302     validLocale = valid;
303     (void)requested;  // Ignore, see also ticket #10477.
304 }
305 
306 Locale
getLocale(ULocDataLocaleType type,UErrorCode & errorCode) const307 RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
308     if(U_FAILURE(errorCode)) {
309         return Locale::getRoot();
310     }
311     switch(type) {
312     case ULOC_ACTUAL_LOCALE:
313         return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
314     case ULOC_VALID_LOCALE:
315         return validLocale;
316     case ULOC_REQUESTED_LOCALE:
317     default:
318         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
319         return Locale::getRoot();
320     }
321 }
322 
323 const char *
internalGetLocaleID(ULocDataLocaleType type,UErrorCode & errorCode) const324 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
325     if(U_FAILURE(errorCode)) {
326         return nullptr;
327     }
328     const Locale *result;
329     switch(type) {
330     case ULOC_ACTUAL_LOCALE:
331         result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
332         break;
333     case ULOC_VALID_LOCALE:
334         result = &validLocale;
335         break;
336     case ULOC_REQUESTED_LOCALE:
337     default:
338         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
339         return nullptr;
340     }
341     if(result->isBogus()) { return nullptr; }
342     const char *id = result->getName();
343     return id[0] == 0 ? "root" : id;
344 }
345 
346 const UnicodeString&
getRules() const347 RuleBasedCollator::getRules() const {
348     return tailoring->rules;
349 }
350 
351 void
getRules(UColRuleOption delta,UnicodeString & buffer) const352 RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
353     if(delta == UCOL_TAILORING_ONLY) {
354         buffer = tailoring->rules;
355         return;
356     }
357     // UCOL_FULL_RULES
358     buffer.remove();
359     CollationLoader::appendRootRules(buffer);
360     buffer.append(tailoring->rules).getTerminatedBuffer();
361 }
362 
363 void
getVersion(UVersionInfo version) const364 RuleBasedCollator::getVersion(UVersionInfo version) const {
365     uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
366     version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
367 }
368 
369 UnicodeSet *
getTailoredSet(UErrorCode & errorCode) const370 RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
371     if(U_FAILURE(errorCode)) { return nullptr; }
372     UnicodeSet *tailored = new UnicodeSet();
373     if(tailored == nullptr) {
374         errorCode = U_MEMORY_ALLOCATION_ERROR;
375         return nullptr;
376     }
377     if(data->base != nullptr) {
378         TailoredSet(tailored).forData(data, errorCode);
379         if(U_FAILURE(errorCode)) {
380             delete tailored;
381             return nullptr;
382         }
383     }
384     return tailored;
385 }
386 
387 void
internalGetContractionsAndExpansions(UnicodeSet * contractions,UnicodeSet * expansions,UBool addPrefixes,UErrorCode & errorCode) const388 RuleBasedCollator::internalGetContractionsAndExpansions(
389         UnicodeSet *contractions, UnicodeSet *expansions,
390         UBool addPrefixes, UErrorCode &errorCode) const {
391     if(U_FAILURE(errorCode)) { return; }
392     if(contractions != nullptr) {
393         contractions->clear();
394     }
395     if(expansions != nullptr) {
396         expansions->clear();
397     }
398     ContractionsAndExpansions(contractions, expansions, nullptr, addPrefixes).forData(data, errorCode);
399 }
400 
401 void
internalAddContractions(UChar32 c,UnicodeSet & set,UErrorCode & errorCode) const402 RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
403     if(U_FAILURE(errorCode)) { return; }
404     ContractionsAndExpansions(&set, nullptr, nullptr, false).forCodePoint(data, c, errorCode);
405 }
406 
407 const CollationSettings &
getDefaultSettings() const408 RuleBasedCollator::getDefaultSettings() const {
409     return *tailoring->settings;
410 }
411 
412 UColAttributeValue
getAttribute(UColAttribute attr,UErrorCode & errorCode) const413 RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
414     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
415     int32_t option;
416     switch(attr) {
417     case UCOL_FRENCH_COLLATION:
418         option = CollationSettings::BACKWARD_SECONDARY;
419         break;
420     case UCOL_ALTERNATE_HANDLING:
421         return settings->getAlternateHandling();
422     case UCOL_CASE_FIRST:
423         return settings->getCaseFirst();
424     case UCOL_CASE_LEVEL:
425         option = CollationSettings::CASE_LEVEL;
426         break;
427     case UCOL_NORMALIZATION_MODE:
428         option = CollationSettings::CHECK_FCD;
429         break;
430     case UCOL_STRENGTH:
431         return (UColAttributeValue)settings->getStrength();
432     case UCOL_HIRAGANA_QUATERNARY_MODE:
433         // Deprecated attribute, unsettable.
434         return UCOL_OFF;
435     case UCOL_NUMERIC_COLLATION:
436         option = CollationSettings::NUMERIC;
437         break;
438     default:
439         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
440         return UCOL_DEFAULT;
441     }
442     return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
443 }
444 
445 void
setAttribute(UColAttribute attr,UColAttributeValue value,UErrorCode & errorCode)446 RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
447                                 UErrorCode &errorCode) {
448     UColAttributeValue oldValue = getAttribute(attr, errorCode);
449     if(U_FAILURE(errorCode)) { return; }
450     if(value == oldValue) {
451         setAttributeExplicitly(attr);
452         return;
453     }
454     const CollationSettings &defaultSettings = getDefaultSettings();
455     if(settings == &defaultSettings) {
456         if(value == UCOL_DEFAULT) {
457             setAttributeDefault(attr);
458             return;
459         }
460     }
461     CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
462     if(ownedSettings == nullptr) {
463         errorCode = U_MEMORY_ALLOCATION_ERROR;
464         return;
465     }
466 
467     switch(attr) {
468     case UCOL_FRENCH_COLLATION:
469         ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
470                                defaultSettings.options, errorCode);
471         break;
472     case UCOL_ALTERNATE_HANDLING:
473         ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
474         break;
475     case UCOL_CASE_FIRST:
476         ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
477         break;
478     case UCOL_CASE_LEVEL:
479         ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
480                                defaultSettings.options, errorCode);
481         break;
482     case UCOL_NORMALIZATION_MODE:
483         ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
484                                defaultSettings.options, errorCode);
485         break;
486     case UCOL_STRENGTH:
487         ownedSettings->setStrength(value, defaultSettings.options, errorCode);
488         break;
489     case UCOL_HIRAGANA_QUATERNARY_MODE:
490         // Deprecated attribute. Check for valid values but do not change anything.
491         if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
492             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
493         }
494         break;
495     case UCOL_NUMERIC_COLLATION:
496         ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
497         break;
498     default:
499         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
500         break;
501     }
502     if(U_FAILURE(errorCode)) { return; }
503     setFastLatinOptions(*ownedSettings);
504     if(value == UCOL_DEFAULT) {
505         setAttributeDefault(attr);
506     } else {
507         setAttributeExplicitly(attr);
508     }
509 }
510 
511 Collator &
setMaxVariable(UColReorderCode group,UErrorCode & errorCode)512 RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
513     if(U_FAILURE(errorCode)) { return *this; }
514     // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
515     int32_t value;
516     if(group == UCOL_REORDER_CODE_DEFAULT) {
517         value = UCOL_DEFAULT;
518     } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
519         value = group - UCOL_REORDER_CODE_FIRST;
520     } else {
521         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
522         return *this;
523     }
524     CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
525     if(value == oldValue) {
526         setAttributeExplicitly(ATTR_VARIABLE_TOP);
527         return *this;
528     }
529     const CollationSettings &defaultSettings = getDefaultSettings();
530     if(settings == &defaultSettings) {
531         if(value == UCOL_DEFAULT) {
532             setAttributeDefault(ATTR_VARIABLE_TOP);
533             return *this;
534         }
535     }
536     CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
537     if(ownedSettings == nullptr) {
538         errorCode = U_MEMORY_ALLOCATION_ERROR;
539         return *this;
540     }
541 
542     if(group == UCOL_REORDER_CODE_DEFAULT) {
543         group = (UColReorderCode)(
544             UCOL_REORDER_CODE_FIRST + int32_t{defaultSettings.getMaxVariable()});
545     }
546     uint32_t varTop = data->getLastPrimaryForGroup(group);
547     U_ASSERT(varTop != 0);
548     ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
549     if(U_FAILURE(errorCode)) { return *this; }
550     ownedSettings->variableTop = varTop;
551     setFastLatinOptions(*ownedSettings);
552     if(value == UCOL_DEFAULT) {
553         setAttributeDefault(ATTR_VARIABLE_TOP);
554     } else {
555         setAttributeExplicitly(ATTR_VARIABLE_TOP);
556     }
557     return *this;
558 }
559 
560 UColReorderCode
getMaxVariable() const561 RuleBasedCollator::getMaxVariable() const {
562     return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
563 }
564 
565 uint32_t
getVariableTop(UErrorCode &) const566 RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
567     return settings->variableTop;
568 }
569 
570 uint32_t
setVariableTop(const char16_t * varTop,int32_t len,UErrorCode & errorCode)571 RuleBasedCollator::setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &errorCode) {
572     if(U_FAILURE(errorCode)) { return 0; }
573     if(varTop == nullptr && len !=0) {
574         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
575         return 0;
576     }
577     if(len < 0) { len = u_strlen(varTop); }
578     if(len == 0) {
579         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
580         return 0;
581     }
582     UBool numeric = settings->isNumeric();
583     int64_t ce1, ce2;
584     if(settings->dontCheckFCD()) {
585         UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
586         ce1 = ci.nextCE(errorCode);
587         ce2 = ci.nextCE(errorCode);
588     } else {
589         FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
590         ce1 = ci.nextCE(errorCode);
591         ce2 = ci.nextCE(errorCode);
592     }
593     if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
594         errorCode = U_CE_NOT_FOUND_ERROR;
595         return 0;
596     }
597     setVariableTop((uint32_t)(ce1 >> 32), errorCode);
598     return settings->variableTop;
599 }
600 
601 uint32_t
setVariableTop(const UnicodeString & varTop,UErrorCode & errorCode)602 RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
603     return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
604 }
605 
606 void
setVariableTop(uint32_t varTop,UErrorCode & errorCode)607 RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
608     if(U_FAILURE(errorCode)) { return; }
609     if(varTop != settings->variableTop) {
610         // Pin the variable top to the end of the reordering group which contains it.
611         // Only a few special groups are supported.
612         int32_t group = data->getGroupForPrimary(varTop);
613         if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
614             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
615             return;
616         }
617         uint32_t v = data->getLastPrimaryForGroup(group);
618         U_ASSERT(v != 0 && v >= varTop);
619         varTop = v;
620         if(varTop != settings->variableTop) {
621             CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
622             if(ownedSettings == nullptr) {
623                 errorCode = U_MEMORY_ALLOCATION_ERROR;
624                 return;
625             }
626             ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
627                                           getDefaultSettings().options, errorCode);
628             if(U_FAILURE(errorCode)) { return; }
629             ownedSettings->variableTop = varTop;
630             setFastLatinOptions(*ownedSettings);
631         }
632     }
633     if(varTop == getDefaultSettings().variableTop) {
634         setAttributeDefault(ATTR_VARIABLE_TOP);
635     } else {
636         setAttributeExplicitly(ATTR_VARIABLE_TOP);
637     }
638 }
639 
640 int32_t
getReorderCodes(int32_t * dest,int32_t capacity,UErrorCode & errorCode) const641 RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
642                                    UErrorCode &errorCode) const {
643     if(U_FAILURE(errorCode)) { return 0; }
644     if(capacity < 0 || (dest == nullptr && capacity > 0)) {
645         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
646         return 0;
647     }
648     int32_t length = settings->reorderCodesLength;
649     if(length == 0) { return 0; }
650     if(length > capacity) {
651         errorCode = U_BUFFER_OVERFLOW_ERROR;
652         return length;
653     }
654     uprv_memcpy(dest, settings->reorderCodes, length * 4);
655     return length;
656 }
657 
658 void
setReorderCodes(const int32_t * reorderCodes,int32_t length,UErrorCode & errorCode)659 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
660                                    UErrorCode &errorCode) {
661     if(U_FAILURE(errorCode)) { return; }
662     if(length < 0 || (reorderCodes == nullptr && length > 0)) {
663         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
664         return;
665     }
666     if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
667         length = 0;
668     }
669     if(length == settings->reorderCodesLength &&
670             uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
671         return;
672     }
673     const CollationSettings &defaultSettings = getDefaultSettings();
674     if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
675         if(settings != &defaultSettings) {
676             CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
677             if(ownedSettings == nullptr) {
678                 errorCode = U_MEMORY_ALLOCATION_ERROR;
679                 return;
680             }
681             ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
682             setFastLatinOptions(*ownedSettings);
683         }
684         return;
685     }
686     CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
687     if(ownedSettings == nullptr) {
688         errorCode = U_MEMORY_ALLOCATION_ERROR;
689         return;
690     }
691     ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
692     setFastLatinOptions(*ownedSettings);
693 }
694 
695 void
setFastLatinOptions(CollationSettings & ownedSettings) const696 RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
697     ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
698             data, ownedSettings,
699             ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
700 }
701 
702 UCollationResult
compare(const UnicodeString & left,const UnicodeString & right,UErrorCode & errorCode) const703 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
704                            UErrorCode &errorCode) const {
705     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
706     return doCompare(left.getBuffer(), left.length(),
707                      right.getBuffer(), right.length(), errorCode);
708 }
709 
710 UCollationResult
compare(const UnicodeString & left,const UnicodeString & right,int32_t length,UErrorCode & errorCode) const711 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
712                            int32_t length, UErrorCode &errorCode) const {
713     if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
714     if(length < 0) {
715         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
716         return UCOL_EQUAL;
717     }
718     int32_t leftLength = left.length();
719     int32_t rightLength = right.length();
720     if(leftLength > length) { leftLength = length; }
721     if(rightLength > length) { rightLength = length; }
722     return doCompare(left.getBuffer(), leftLength,
723                      right.getBuffer(), rightLength, errorCode);
724 }
725 
726 UCollationResult
compare(const char16_t * left,int32_t leftLength,const char16_t * right,int32_t rightLength,UErrorCode & errorCode) const727 RuleBasedCollator::compare(const char16_t *left, int32_t leftLength,
728                            const char16_t *right, int32_t rightLength,
729                            UErrorCode &errorCode) const {
730     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
731     if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
732         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
733         return UCOL_EQUAL;
734     }
735     // Make sure both or neither strings have a known length.
736     // We do not optimize for mixed length/termination.
737     if(leftLength >= 0) {
738         if(rightLength < 0) { rightLength = u_strlen(right); }
739     } else {
740         if(rightLength >= 0) { leftLength = u_strlen(left); }
741     }
742     return doCompare(left, leftLength, right, rightLength, errorCode);
743 }
744 
745 UCollationResult
compareUTF8(const StringPiece & left,const StringPiece & right,UErrorCode & errorCode) const746 RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
747                                UErrorCode &errorCode) const {
748     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
749     const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
750     const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
751     if((leftBytes == nullptr && !left.empty()) || (rightBytes == nullptr && !right.empty())) {
752         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
753         return UCOL_EQUAL;
754     }
755     return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
756 }
757 
758 UCollationResult
internalCompareUTF8(const char * left,int32_t leftLength,const char * right,int32_t rightLength,UErrorCode & errorCode) const759 RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
760                                        const char *right, int32_t rightLength,
761                                        UErrorCode &errorCode) const {
762     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
763     if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
764         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
765         return UCOL_EQUAL;
766     }
767     // Make sure both or neither strings have a known length.
768     // We do not optimize for mixed length/termination.
769     if(leftLength >= 0) {
770         if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
771     } else {
772         if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
773     }
774     return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
775                      reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
776 }
777 
778 namespace {
779 
780 /**
781  * Abstract iterator for identical-level string comparisons.
782  * Returns FCD code points and handles temporary switching to NFD.
783  */
784 class NFDIterator : public UObject {
785 public:
NFDIterator()786     NFDIterator() : index(-1), length(0) {}
~NFDIterator()787     virtual ~NFDIterator() {}
788     /**
789      * Returns the next code point from the internal normalization buffer,
790      * or else the next text code point.
791      * Returns -1 at the end of the text.
792      */
nextCodePoint()793     UChar32 nextCodePoint() {
794         if(index >= 0) {
795             if(index == length) {
796                 index = -1;
797             } else {
798                 UChar32 c;
799                 U16_NEXT_UNSAFE(decomp, index, c);
800                 return c;
801             }
802         }
803         return nextRawCodePoint();
804     }
805     /**
806      * @param nfcImpl
807      * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
808      * @return the first code point in c's decomposition,
809      *         or c itself if it was decomposed already or if it does not decompose
810      */
nextDecomposedCodePoint(const Normalizer2Impl & nfcImpl,UChar32 c)811     UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
812         if(index >= 0) { return c; }
813         decomp = nfcImpl.getDecomposition(c, buffer, length);
814         if(decomp == nullptr) { return c; }
815         index = 0;
816         U16_NEXT_UNSAFE(decomp, index, c);
817         return c;
818     }
819 protected:
820     /**
821      * Returns the next text code point in FCD order.
822      * Returns -1 at the end of the text.
823      */
824     virtual UChar32 nextRawCodePoint() = 0;
825 private:
826     const char16_t *decomp;
827     char16_t buffer[4];
828     int32_t index;
829     int32_t length;
830 };
831 
832 class UTF16NFDIterator : public NFDIterator {
833 public:
UTF16NFDIterator(const char16_t * text,const char16_t * textLimit)834     UTF16NFDIterator(const char16_t *text, const char16_t *textLimit) : s(text), limit(textLimit) {}
835 protected:
nextRawCodePoint()836     virtual UChar32 nextRawCodePoint() override {
837         if(s == limit) { return U_SENTINEL; }
838         UChar32 c = *s++;
839         if(limit == nullptr && c == 0) {
840             s = nullptr;
841             return U_SENTINEL;
842         }
843         char16_t trail;
844         if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
845             ++s;
846             c = U16_GET_SUPPLEMENTARY(c, trail);
847         }
848         return c;
849     }
850 
851     const char16_t *s;
852     const char16_t *limit;
853 };
854 
855 class FCDUTF16NFDIterator : public UTF16NFDIterator {
856 public:
FCDUTF16NFDIterator(const Normalizer2Impl & nfcImpl,const char16_t * text,const char16_t * textLimit)857     FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const char16_t *text, const char16_t *textLimit)
858             : UTF16NFDIterator(nullptr, nullptr) {
859         UErrorCode errorCode = U_ZERO_ERROR;
860         const char16_t *spanLimit = nfcImpl.makeFCD(text, textLimit, nullptr, errorCode);
861         if(U_FAILURE(errorCode)) { return; }
862         if(spanLimit == textLimit || (textLimit == nullptr && *spanLimit == 0)) {
863             s = text;
864             limit = spanLimit;
865         } else {
866             str.setTo(text, (int32_t)(spanLimit - text));
867             {
868                 ReorderingBuffer r_buffer(nfcImpl, str);
869                 if(r_buffer.init(str.length(), errorCode)) {
870                     nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
871                 }
872             }
873             if(U_SUCCESS(errorCode)) {
874                 s = str.getBuffer();
875                 limit = s + str.length();
876             }
877         }
878     }
879 private:
880     UnicodeString str;
881 };
882 
883 class UTF8NFDIterator : public NFDIterator {
884 public:
UTF8NFDIterator(const uint8_t * text,int32_t textLength)885     UTF8NFDIterator(const uint8_t *text, int32_t textLength)
886         : s(text), pos(0), length(textLength) {}
887 protected:
nextRawCodePoint()888     virtual UChar32 nextRawCodePoint() override {
889         if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
890         UChar32 c;
891         U8_NEXT_OR_FFFD(s, pos, length, c);
892         return c;
893     }
894 
895     const uint8_t *s;
896     int32_t pos;
897     int32_t length;
898 };
899 
900 class FCDUTF8NFDIterator : public NFDIterator {
901 public:
FCDUTF8NFDIterator(const CollationData * data,const uint8_t * text,int32_t textLength)902     FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
903             : u8ci(data, false, text, 0, textLength) {}
904 protected:
nextRawCodePoint()905     virtual UChar32 nextRawCodePoint() override {
906         UErrorCode errorCode = U_ZERO_ERROR;
907         return u8ci.nextCodePoint(errorCode);
908     }
909 private:
910     FCDUTF8CollationIterator u8ci;
911 };
912 
913 class UIterNFDIterator : public NFDIterator {
914 public:
UIterNFDIterator(UCharIterator & it)915     UIterNFDIterator(UCharIterator &it) : iter(it) {}
916 protected:
nextRawCodePoint()917     virtual UChar32 nextRawCodePoint() override {
918         return uiter_next32(&iter);
919     }
920 private:
921     UCharIterator &iter;
922 };
923 
924 class FCDUIterNFDIterator : public NFDIterator {
925 public:
FCDUIterNFDIterator(const CollationData * data,UCharIterator & it,int32_t startIndex)926     FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
927             : uici(data, false, it, startIndex) {}
928 protected:
nextRawCodePoint()929     virtual UChar32 nextRawCodePoint() override {
930         UErrorCode errorCode = U_ZERO_ERROR;
931         return uici.nextCodePoint(errorCode);
932     }
933 private:
934     FCDUIterCollationIterator uici;
935 };
936 
compareNFDIter(const Normalizer2Impl & nfcImpl,NFDIterator & left,NFDIterator & right)937 UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
938                                 NFDIterator &left, NFDIterator &right) {
939     for(;;) {
940         // Fetch the next FCD code point from each string.
941         UChar32 leftCp = left.nextCodePoint();
942         UChar32 rightCp = right.nextCodePoint();
943         if(leftCp == rightCp) {
944             if(leftCp < 0) { break; }
945             continue;
946         }
947         // If they are different, then decompose each and compare again.
948         if(leftCp < 0) {
949             leftCp = -2;  // end of string
950         } else if(leftCp == 0xfffe) {
951             leftCp = -1;  // U+FFFE: merge separator
952         } else {
953             leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
954         }
955         if(rightCp < 0) {
956             rightCp = -2;  // end of string
957         } else if(rightCp == 0xfffe) {
958             rightCp = -1;  // U+FFFE: merge separator
959         } else {
960             rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
961         }
962         if(leftCp < rightCp) { return UCOL_LESS; }
963         if(leftCp > rightCp) { return UCOL_GREATER; }
964     }
965     return UCOL_EQUAL;
966 }
967 
968 }  // namespace
969 
970 UCollationResult
doCompare(const char16_t * left,int32_t leftLength,const char16_t * right,int32_t rightLength,UErrorCode & errorCode) const971 RuleBasedCollator::doCompare(const char16_t *left, int32_t leftLength,
972                              const char16_t *right, int32_t rightLength,
973                              UErrorCode &errorCode) const {
974     // U_FAILURE(errorCode) checked by caller.
975     if(left == right && leftLength == rightLength) {
976         return UCOL_EQUAL;
977     }
978 
979     // Identical-prefix test.
980     const char16_t *leftLimit;
981     const char16_t *rightLimit;
982     int32_t equalPrefixLength = 0;
983     if(leftLength < 0) {
984         leftLimit = nullptr;
985         rightLimit = nullptr;
986         char16_t c;
987         while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
988             if(c == 0) { return UCOL_EQUAL; }
989             ++equalPrefixLength;
990         }
991     } else {
992         leftLimit = left + leftLength;
993         rightLimit = right + rightLength;
994         for(;;) {
995             if(equalPrefixLength == leftLength) {
996                 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
997                 break;
998             } else if(equalPrefixLength == rightLength ||
999                       left[equalPrefixLength] != right[equalPrefixLength]) {
1000                 break;
1001             }
1002             ++equalPrefixLength;
1003         }
1004     }
1005 
1006     UBool numeric = settings->isNumeric();
1007     if(equalPrefixLength > 0) {
1008         if((equalPrefixLength != leftLength &&
1009                     data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
1010                 (equalPrefixLength != rightLength &&
1011                     data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
1012             // Identical prefix: Back up to the start of a contraction or reordering sequence.
1013             while(--equalPrefixLength > 0 &&
1014                     data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
1015         }
1016         // Notes:
1017         // - A longer string can compare equal to a prefix of it if only ignorables follow.
1018         // - With a backward level, a longer string can compare less-than a prefix of it.
1019 
1020         // Pass the actual start of each string into the CollationIterators,
1021         // plus the equalPrefixLength position,
1022         // so that prefix matches back into the equal prefix work.
1023     }
1024 
1025     int32_t result;
1026     int32_t fastLatinOptions = settings->fastLatinOptions;
1027     if(fastLatinOptions >= 0 &&
1028             (equalPrefixLength == leftLength ||
1029                 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
1030             (equalPrefixLength == rightLength ||
1031                 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
1032         if(leftLength >= 0) {
1033             result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1034                                                       settings->fastLatinPrimaries,
1035                                                       fastLatinOptions,
1036                                                       left + equalPrefixLength,
1037                                                       leftLength - equalPrefixLength,
1038                                                       right + equalPrefixLength,
1039                                                       rightLength - equalPrefixLength);
1040         } else {
1041             result = CollationFastLatin::compareUTF16(data->fastLatinTable,
1042                                                       settings->fastLatinPrimaries,
1043                                                       fastLatinOptions,
1044                                                       left + equalPrefixLength, -1,
1045                                                       right + equalPrefixLength, -1);
1046         }
1047     } else {
1048         result = CollationFastLatin::BAIL_OUT_RESULT;
1049     }
1050 
1051     if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1052         if(settings->dontCheckFCD()) {
1053             UTF16CollationIterator leftIter(data, numeric,
1054                                             left, left + equalPrefixLength, leftLimit);
1055             UTF16CollationIterator rightIter(data, numeric,
1056                                             right, right + equalPrefixLength, rightLimit);
1057             result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1058         } else {
1059             FCDUTF16CollationIterator leftIter(data, numeric,
1060                                               left, left + equalPrefixLength, leftLimit);
1061             FCDUTF16CollationIterator rightIter(data, numeric,
1062                                                 right, right + equalPrefixLength, rightLimit);
1063             result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1064         }
1065     }
1066     if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1067         return (UCollationResult)result;
1068     }
1069 
1070     // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1071     // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1072     // and the benefit seems unlikely to be measurable.
1073 
1074     // Compare identical level.
1075     const Normalizer2Impl &nfcImpl = data->nfcImpl;
1076     left += equalPrefixLength;
1077     right += equalPrefixLength;
1078     if(settings->dontCheckFCD()) {
1079         UTF16NFDIterator leftIter(left, leftLimit);
1080         UTF16NFDIterator rightIter(right, rightLimit);
1081         return compareNFDIter(nfcImpl, leftIter, rightIter);
1082     } else {
1083         FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
1084         FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
1085         return compareNFDIter(nfcImpl, leftIter, rightIter);
1086     }
1087 }
1088 
1089 UCollationResult
doCompare(const uint8_t * left,int32_t leftLength,const uint8_t * right,int32_t rightLength,UErrorCode & errorCode) const1090 RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
1091                              const uint8_t *right, int32_t rightLength,
1092                              UErrorCode &errorCode) const {
1093     // U_FAILURE(errorCode) checked by caller.
1094     if(left == right && leftLength == rightLength) {
1095         return UCOL_EQUAL;
1096     }
1097 
1098     // Identical-prefix test.
1099     int32_t equalPrefixLength = 0;
1100     if(leftLength < 0) {
1101         uint8_t c;
1102         while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
1103             if(c == 0) { return UCOL_EQUAL; }
1104             ++equalPrefixLength;
1105         }
1106     } else {
1107         for(;;) {
1108             if(equalPrefixLength == leftLength) {
1109                 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
1110                 break;
1111             } else if(equalPrefixLength == rightLength ||
1112                       left[equalPrefixLength] != right[equalPrefixLength]) {
1113                 break;
1114             }
1115             ++equalPrefixLength;
1116         }
1117     }
1118     // Back up to the start of a partially-equal code point.
1119     if(equalPrefixLength > 0 &&
1120             ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
1121             (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
1122         while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
1123     }
1124 
1125     UBool numeric = settings->isNumeric();
1126     if(equalPrefixLength > 0) {
1127         UBool unsafe = false;
1128         if(equalPrefixLength != leftLength) {
1129             int32_t i = equalPrefixLength;
1130             UChar32 c;
1131             U8_NEXT_OR_FFFD(left, i, leftLength, c);
1132             unsafe = data->isUnsafeBackward(c, numeric);
1133         }
1134         if(!unsafe && equalPrefixLength != rightLength) {
1135             int32_t i = equalPrefixLength;
1136             UChar32 c;
1137             U8_NEXT_OR_FFFD(right, i, rightLength, c);
1138             unsafe = data->isUnsafeBackward(c, numeric);
1139         }
1140         if(unsafe) {
1141             // Identical prefix: Back up to the start of a contraction or reordering sequence.
1142             UChar32 c;
1143             do {
1144                 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
1145             } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
1146         }
1147         // See the notes in the UTF-16 version.
1148 
1149         // Pass the actual start of each string into the CollationIterators,
1150         // plus the equalPrefixLength position,
1151         // so that prefix matches back into the equal prefix work.
1152     }
1153 
1154     int32_t result;
1155     int32_t fastLatinOptions = settings->fastLatinOptions;
1156     if(fastLatinOptions >= 0 &&
1157             (equalPrefixLength == leftLength ||
1158                 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
1159             (equalPrefixLength == rightLength ||
1160                 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
1161         if(leftLength >= 0) {
1162             result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1163                                                      settings->fastLatinPrimaries,
1164                                                      fastLatinOptions,
1165                                                      left + equalPrefixLength,
1166                                                      leftLength - equalPrefixLength,
1167                                                      right + equalPrefixLength,
1168                                                      rightLength - equalPrefixLength);
1169         } else {
1170             result = CollationFastLatin::compareUTF8(data->fastLatinTable,
1171                                                      settings->fastLatinPrimaries,
1172                                                      fastLatinOptions,
1173                                                      left + equalPrefixLength, -1,
1174                                                      right + equalPrefixLength, -1);
1175         }
1176     } else {
1177         result = CollationFastLatin::BAIL_OUT_RESULT;
1178     }
1179 
1180     if(result == CollationFastLatin::BAIL_OUT_RESULT) {
1181         if(settings->dontCheckFCD()) {
1182             UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1183             UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1184             result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1185         } else {
1186             FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
1187             FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
1188             result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1189         }
1190     }
1191     if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1192         return (UCollationResult)result;
1193     }
1194 
1195     // Note: If NUL-terminated, we could get the actual limits from the iterators now.
1196     // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
1197     // and the benefit seems unlikely to be measurable.
1198 
1199     // Compare identical level.
1200     const Normalizer2Impl &nfcImpl = data->nfcImpl;
1201     left += equalPrefixLength;
1202     right += equalPrefixLength;
1203     if(leftLength > 0) {
1204         leftLength -= equalPrefixLength;
1205         rightLength -= equalPrefixLength;
1206     }
1207     if(settings->dontCheckFCD()) {
1208         UTF8NFDIterator leftIter(left, leftLength);
1209         UTF8NFDIterator rightIter(right, rightLength);
1210         return compareNFDIter(nfcImpl, leftIter, rightIter);
1211     } else {
1212         FCDUTF8NFDIterator leftIter(data, left, leftLength);
1213         FCDUTF8NFDIterator rightIter(data, right, rightLength);
1214         return compareNFDIter(nfcImpl, leftIter, rightIter);
1215     }
1216 }
1217 
1218 UCollationResult
compare(UCharIterator & left,UCharIterator & right,UErrorCode & errorCode) const1219 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
1220                            UErrorCode &errorCode) const {
1221     if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
1222     UBool numeric = settings->isNumeric();
1223 
1224     // Identical-prefix test.
1225     int32_t equalPrefixLength = 0;
1226     {
1227         UChar32 leftUnit;
1228         UChar32 rightUnit;
1229         while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
1230             if(leftUnit < 0) { return UCOL_EQUAL; }
1231             ++equalPrefixLength;
1232         }
1233 
1234         // Back out the code units that differed, for the real collation comparison.
1235         if(leftUnit >= 0) { left.previous(&left); }
1236         if(rightUnit >= 0) { right.previous(&right); }
1237 
1238         if(equalPrefixLength > 0) {
1239             if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
1240                     (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
1241                 // Identical prefix: Back up to the start of a contraction or reordering sequence.
1242                 do {
1243                     --equalPrefixLength;
1244                     leftUnit = left.previous(&left);
1245                     right.previous(&right);
1246                 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
1247             }
1248             // See the notes in the UTF-16 version.
1249         }
1250     }
1251 
1252     UCollationResult result;
1253     if(settings->dontCheckFCD()) {
1254         UIterCollationIterator leftIter(data, numeric, left);
1255         UIterCollationIterator rightIter(data, numeric, right);
1256         result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1257     } else {
1258         FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
1259         FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
1260         result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
1261     }
1262     if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
1263         return result;
1264     }
1265 
1266     // Compare identical level.
1267     left.move(&left, equalPrefixLength, UITER_ZERO);
1268     right.move(&right, equalPrefixLength, UITER_ZERO);
1269     const Normalizer2Impl &nfcImpl = data->nfcImpl;
1270     if(settings->dontCheckFCD()) {
1271         UIterNFDIterator leftIter(left);
1272         UIterNFDIterator rightIter(right);
1273         return compareNFDIter(nfcImpl, leftIter, rightIter);
1274     } else {
1275         FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
1276         FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
1277         return compareNFDIter(nfcImpl, leftIter, rightIter);
1278     }
1279 }
1280 
1281 CollationKey &
getCollationKey(const UnicodeString & s,CollationKey & key,UErrorCode & errorCode) const1282 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
1283                                    UErrorCode &errorCode) const {
1284     return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
1285 }
1286 
1287 CollationKey &
getCollationKey(const char16_t * s,int32_t length,CollationKey & key,UErrorCode & errorCode) const1288 RuleBasedCollator::getCollationKey(const char16_t *s, int32_t length, CollationKey& key,
1289                                    UErrorCode &errorCode) const {
1290     if(U_FAILURE(errorCode)) {
1291         return key.setToBogus();
1292     }
1293     if(s == nullptr && length != 0) {
1294         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1295         return key.setToBogus();
1296     }
1297     key.reset();  // resets the "bogus" state
1298     CollationKeyByteSink sink(key);
1299     writeSortKey(s, length, sink, errorCode);
1300     if(U_FAILURE(errorCode)) {
1301         key.setToBogus();
1302     } else if(key.isBogus()) {
1303         errorCode = U_MEMORY_ALLOCATION_ERROR;
1304     } else {
1305         key.setLength(sink.NumberOfBytesAppended());
1306     }
1307     return key;
1308 }
1309 
1310 int32_t
getSortKey(const UnicodeString & s,uint8_t * dest,int32_t capacity) const1311 RuleBasedCollator::getSortKey(const UnicodeString &s,
1312                               uint8_t *dest, int32_t capacity) const {
1313     return getSortKey(s.getBuffer(), s.length(), dest, capacity);
1314 }
1315 
1316 int32_t
getSortKey(const char16_t * s,int32_t length,uint8_t * dest,int32_t capacity) const1317 RuleBasedCollator::getSortKey(const char16_t *s, int32_t length,
1318                               uint8_t *dest, int32_t capacity) const {
1319     if((s == nullptr && length != 0) || capacity < 0 || (dest == nullptr && capacity > 0)) {
1320         return 0;
1321     }
1322     uint8_t noDest[1] = { 0 };
1323     if(dest == nullptr) {
1324         // Distinguish pure preflighting from an allocation error.
1325         dest = noDest;
1326         capacity = 0;
1327     }
1328     FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
1329     UErrorCode errorCode = U_ZERO_ERROR;
1330     writeSortKey(s, length, sink, errorCode);
1331     return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
1332 }
1333 
1334 void
writeSortKey(const char16_t * s,int32_t length,SortKeyByteSink & sink,UErrorCode & errorCode) const1335 RuleBasedCollator::writeSortKey(const char16_t *s, int32_t length,
1336                                 SortKeyByteSink &sink, UErrorCode &errorCode) const {
1337     if(U_FAILURE(errorCode)) { return; }
1338     const char16_t *limit = (length >= 0) ? s + length : nullptr;
1339     UBool numeric = settings->isNumeric();
1340     CollationKeys::LevelCallback callback;
1341     if(settings->dontCheckFCD()) {
1342         UTF16CollationIterator iter(data, numeric, s, s, limit);
1343         CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1344                                                   sink, Collation::PRIMARY_LEVEL,
1345                                                   callback, true, errorCode);
1346     } else {
1347         FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1348         CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
1349                                                   sink, Collation::PRIMARY_LEVEL,
1350                                                   callback, true, errorCode);
1351     }
1352     if(settings->getStrength() == UCOL_IDENTICAL) {
1353         writeIdenticalLevel(s, limit, sink, errorCode);
1354     }
1355     static const char terminator = 0;  // TERMINATOR_BYTE
1356     sink.Append(&terminator, 1);
1357 }
1358 
1359 void
writeIdenticalLevel(const char16_t * s,const char16_t * limit,SortKeyByteSink & sink,UErrorCode & errorCode) const1360 RuleBasedCollator::writeIdenticalLevel(const char16_t *s, const char16_t *limit,
1361                                        SortKeyByteSink &sink, UErrorCode &errorCode) const {
1362     // NFD quick check
1363     const char16_t *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, nullptr, errorCode);
1364     if(U_FAILURE(errorCode)) { return; }
1365     sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
1366     UChar32 prev = 0;
1367     if(nfdQCYesLimit != s) {
1368         prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
1369     }
1370     // Is there non-NFD text?
1371     int32_t destLengthEstimate;
1372     if(limit != nullptr) {
1373         if(nfdQCYesLimit == limit) { return; }
1374         destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
1375     } else {
1376         // s is NUL-terminated
1377         if(*nfdQCYesLimit == 0) { return; }
1378         destLengthEstimate = -1;
1379     }
1380     UnicodeString nfd;
1381     data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
1382     u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
1383 }
1384 
1385 namespace {
1386 
1387 /**
1388  * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
1389  * with an instance of this callback class.
1390  * When another level is about to be written, the callback
1391  * records the level and the number of bytes that will be written until
1392  * the sink (which is actually a FixedSortKeyByteSink) fills up.
1393  *
1394  * When internalNextSortKeyPart() is called again, it restarts with the last level
1395  * and ignores as many bytes as were written previously for that level.
1396  */
1397 class PartLevelCallback : public CollationKeys::LevelCallback {
1398 public:
PartLevelCallback(const SortKeyByteSink & s)1399     PartLevelCallback(const SortKeyByteSink &s)
1400             : sink(s), level(Collation::PRIMARY_LEVEL) {
1401         levelCapacity = sink.GetRemainingCapacity();
1402     }
~PartLevelCallback()1403     virtual ~PartLevelCallback() {}
needToWrite(Collation::Level l)1404     virtual UBool needToWrite(Collation::Level l) override {
1405         if(!sink.Overflowed()) {
1406             // Remember a level that will be at least partially written.
1407             level = l;
1408             levelCapacity = sink.GetRemainingCapacity();
1409             return true;
1410         } else {
1411             return false;
1412         }
1413     }
getLevel() const1414     Collation::Level getLevel() const { return level; }
getLevelCapacity() const1415     int32_t getLevelCapacity() const { return levelCapacity; }
1416 
1417 private:
1418     const SortKeyByteSink &sink;
1419     Collation::Level level;
1420     int32_t levelCapacity;
1421 };
1422 
1423 }  // namespace
1424 
1425 int32_t
internalNextSortKeyPart(UCharIterator * iter,uint32_t state[2],uint8_t * dest,int32_t count,UErrorCode & errorCode) const1426 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
1427                                            uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
1428     if(U_FAILURE(errorCode)) { return 0; }
1429     if(iter == nullptr || state == nullptr || count < 0 || (count > 0 && dest == nullptr)) {
1430         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1431         return 0;
1432     }
1433     if(count == 0) { return 0; }
1434 
1435     FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
1436     sink.IgnoreBytes((int32_t)state[1]);
1437     iter->move(iter, 0, UITER_START);
1438 
1439     Collation::Level level = (Collation::Level)state[0];
1440     if(level <= Collation::QUATERNARY_LEVEL) {
1441         UBool numeric = settings->isNumeric();
1442         PartLevelCallback callback(sink);
1443         if(settings->dontCheckFCD()) {
1444             UIterCollationIterator ci(data, numeric, *iter);
1445             CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1446                                                       sink, level, callback, false, errorCode);
1447         } else {
1448             FCDUIterCollationIterator ci(data, numeric, *iter, 0);
1449             CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
1450                                                       sink, level, callback, false, errorCode);
1451         }
1452         if(U_FAILURE(errorCode)) { return 0; }
1453         if(sink.NumberOfBytesAppended() > count) {
1454             state[0] = (uint32_t)callback.getLevel();
1455             state[1] = (uint32_t)callback.getLevelCapacity();
1456             return count;
1457         }
1458         // All of the normal levels are done.
1459         if(settings->getStrength() == UCOL_IDENTICAL) {
1460             level = Collation::IDENTICAL_LEVEL;
1461             iter->move(iter, 0, UITER_START);
1462         }
1463         // else fall through to setting ZERO_LEVEL
1464     }
1465 
1466     if(level == Collation::IDENTICAL_LEVEL) {
1467         int32_t levelCapacity = sink.GetRemainingCapacity();
1468         UnicodeString s;
1469         for(;;) {
1470             UChar32 c = iter->next(iter);
1471             if(c < 0) { break; }
1472             s.append((char16_t)c);
1473         }
1474         const char16_t *sArray = s.getBuffer();
1475         writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
1476         if(U_FAILURE(errorCode)) { return 0; }
1477         if(sink.NumberOfBytesAppended() > count) {
1478             state[0] = (uint32_t)level;
1479             state[1] = (uint32_t)levelCapacity;
1480             return count;
1481         }
1482     }
1483 
1484     // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
1485     state[0] = (uint32_t)Collation::ZERO_LEVEL;
1486     state[1] = 0;
1487     int32_t length = sink.NumberOfBytesAppended();
1488     int32_t i = length;
1489     while(i < count) { dest[i++] = 0; }
1490     return length;
1491 }
1492 
1493 void
internalGetCEs(const UnicodeString & str,UVector64 & ces,UErrorCode & errorCode) const1494 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
1495                                   UErrorCode &errorCode) const {
1496     if(U_FAILURE(errorCode)) { return; }
1497     const char16_t *s = str.getBuffer();
1498     const char16_t *limit = s + str.length();
1499     UBool numeric = settings->isNumeric();
1500     if(settings->dontCheckFCD()) {
1501         UTF16CollationIterator iter(data, numeric, s, s, limit);
1502         int64_t ce;
1503         while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1504             ces.addElement(ce, errorCode);
1505         }
1506     } else {
1507         FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
1508         int64_t ce;
1509         while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
1510             ces.addElement(ce, errorCode);
1511         }
1512     }
1513 }
1514 
1515 namespace {
1516 
appendSubtag(CharString & s,char letter,const char * subtag,int32_t length,UErrorCode & errorCode)1517 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
1518                   UErrorCode &errorCode) {
1519     if(U_FAILURE(errorCode) || length == 0) { return; }
1520     if(!s.isEmpty()) {
1521         s.append('_', errorCode);
1522     }
1523     s.append(letter, errorCode);
1524     for(int32_t i = 0; i < length; ++i) {
1525         s.append(uprv_toupper(subtag[i]), errorCode);
1526     }
1527 }
1528 
appendAttribute(CharString & s,char letter,UColAttributeValue value,UErrorCode & errorCode)1529 void appendAttribute(CharString &s, char letter, UColAttributeValue value,
1530                      UErrorCode &errorCode) {
1531     if(U_FAILURE(errorCode)) { return; }
1532     if(!s.isEmpty()) {
1533         s.append('_', errorCode);
1534     }
1535     static const char *valueChars = "1234...........IXO..SN..LU......";
1536     s.append(letter, errorCode);
1537     s.append(valueChars[value], errorCode);
1538 }
1539 
1540 }  // namespace
1541 
1542 int32_t
internalGetShortDefinitionString(const char * locale,char * buffer,int32_t capacity,UErrorCode & errorCode) const1543 RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
1544                                                     char *buffer, int32_t capacity,
1545                                                     UErrorCode &errorCode) const {
1546     if(U_FAILURE(errorCode)) { return 0; }
1547     if(buffer == nullptr ? capacity != 0 : capacity < 0) {
1548         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
1549         return 0;
1550     }
1551     if(locale == nullptr) {
1552         locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
1553     }
1554 
1555     char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
1556     int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
1557                                                   "collation", locale,
1558                                                   nullptr, &errorCode);
1559     if(U_FAILURE(errorCode)) { return 0; }
1560     resultLocale[length] = 0;
1561 
1562     // Append items in alphabetic order of their short definition letters.
1563     CharString result;
1564     char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
1565 
1566     if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
1567         appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
1568     }
1569     // ATTR_VARIABLE_TOP not supported because 'B' was broken.
1570     // See ICU tickets #10372 and #10386.
1571     if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
1572         appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
1573     }
1574     if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
1575         appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
1576     }
1577     if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
1578         appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
1579     }
1580     if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
1581         appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
1582     }
1583     // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
1584     {
1585         CharString collation;
1586         CharStringByteSink sink(&collation);
1587         ulocimp_getKeywordValue(resultLocale, "collation", sink, &errorCode);
1588         appendSubtag(result, 'K', collation.data(), collation.length(), errorCode);
1589     }
1590     length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1591     if (length == 0) {
1592         appendSubtag(result, 'L', "root", 4, errorCode);
1593     } else {
1594         appendSubtag(result, 'L', subtag, length, errorCode);
1595     }
1596     if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
1597         appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
1598     }
1599     length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1600     appendSubtag(result, 'R', subtag, length, errorCode);
1601     if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
1602         appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
1603     }
1604     length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1605     appendSubtag(result, 'V', subtag, length, errorCode);
1606     length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
1607     appendSubtag(result, 'Z', subtag, length, errorCode);
1608 
1609     if(U_FAILURE(errorCode)) { return 0; }
1610     return result.extract(buffer, capacity, errorCode);
1611 }
1612 
1613 UBool
isUnsafe(UChar32 c) const1614 RuleBasedCollator::isUnsafe(UChar32 c) const {
1615     return data->isUnsafeBackward(c, settings->isNumeric());
1616 }
1617 
1618 void U_CALLCONV
computeMaxExpansions(const CollationTailoring * t,UErrorCode & errorCode)1619 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
1620     t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
1621 }
1622 
1623 UBool
initMaxExpansions(UErrorCode & errorCode) const1624 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
1625     umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
1626     return U_SUCCESS(errorCode);
1627 }
1628 
1629 CollationElementIterator *
createCollationElementIterator(const UnicodeString & source) const1630 RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
1631     UErrorCode errorCode = U_ZERO_ERROR;
1632     if(!initMaxExpansions(errorCode)) { return nullptr; }
1633     CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1634     if(U_FAILURE(errorCode)) {
1635         delete cei;
1636         return nullptr;
1637     }
1638     return cei;
1639 }
1640 
1641 CollationElementIterator *
createCollationElementIterator(const CharacterIterator & source) const1642 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
1643     UErrorCode errorCode = U_ZERO_ERROR;
1644     if(!initMaxExpansions(errorCode)) { return nullptr; }
1645     CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
1646     if(U_FAILURE(errorCode)) {
1647         delete cei;
1648         return nullptr;
1649     }
1650     return cei;
1651 }
1652 
1653 int32_t
getMaxExpansion(int32_t order) const1654 RuleBasedCollator::getMaxExpansion(int32_t order) const {
1655     UErrorCode errorCode = U_ZERO_ERROR;
1656     (void)initMaxExpansions(errorCode);
1657     return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
1658 }
1659 
1660 U_NAMESPACE_END
1661 
1662 #endif  // !UCONFIG_NO_COLLATION
1663