xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/ucase.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "uassert.h"
28 #include "ucase.h"
29 #include "umutex.h"
30 #include "utrie2.h"
31 
32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33 #define INCLUDED_FROM_UCASE_CPP
34 #include "ucase_props_data.h"
35 
36 /* set of property starts for UnicodeSet ------------------------------------ */
37 
38 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40     /* add the start code point to the USet */
41     const USetAdder *sa=(const USetAdder *)context;
42     sa->add(sa->set, start);
43     return true;
44 }
45 
46 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48     if(U_FAILURE(*pErrorCode)) {
49         return;
50     }
51 
52     /* add the start code point of each same-value range of the trie */
53     utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
54 
55     /* add code points with hardcoded properties, plus the ones following them */
56 
57     /* (none right now, see comment below) */
58 
59     /*
60      * Omit code points with hardcoded specialcasing properties
61      * because we do not build property UnicodeSets for them right now.
62      */
63 }
64 
65 /* data access primitives --------------------------------------------------- */
66 
67 U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t * pExceptionsLength,int32_t * pUnfoldLength)68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69     *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70     *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71     return &ucase_props_singleton;
72 }
73 
74 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()75 ucase_getTrie() {
76     return &ucase_props_singleton.trie;
77 }
78 
/*
 * Pointer into (csp)->exceptions for a properties word:
 * the array index is the part of props above UCASE_EXC_SHIFT.
 */
79 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
80 
81 /*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * SLOT_OFFSET() indexes this table with the flag bits below a given slot bit,
 * so the result counts the set flag bits that precede that slot.
 */
82 static const uint8_t flagsOffset[256]={
83     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
84     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
85     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
86     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
87     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
88     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
89     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
90     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
91     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
94     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
95     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
98     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
99 };
100 
/* Nonzero if bit idx is set in flags, that is, if the optional slot idx is present. */
101 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of flag bits set below bit idx, looked up in flagsOffset[]. */
102 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
103 
/*
 * Get the value of an optional-value slot.
 * Must only be used where HAS_SLOT(excWord, idx): the macro itself does not
 * test for the slot and always writes to value.
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t.
 * Otherwise each slot is two uint16_t which are combined with the first one
 * in the upper 16 bits.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output
 *
 * Every use of pExc16 and value is parenthesized so that an argument that is
 * an expression rather than a plain identifier is still parsed as intended.
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    } \
} UPRV_BLOCK_MACRO_END
123 
124 /* simple case mappings ----------------------------------------------------- */
125 
126 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)127 ucase_tolower(UChar32 c) {
128     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129     if(!UCASE_HAS_EXCEPTION(props)) {
130         if(UCASE_IS_UPPER_OR_TITLE(props)) {
131             c+=UCASE_GET_DELTA(props);
132         }
133     } else {
134         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135         uint16_t excWord=*pe++;
136         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137             int32_t delta;
138             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140         }
141         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143         }
144     }
145     return c;
146 }
147 
148 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)149 ucase_toupper(UChar32 c) {
150     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151     if(!UCASE_HAS_EXCEPTION(props)) {
152         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153             c+=UCASE_GET_DELTA(props);
154         }
155     } else {
156         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157         uint16_t excWord=*pe++;
158         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159             int32_t delta;
160             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162         }
163         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165         }
166     }
167     return c;
168 }
169 
170 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)171 ucase_totitle(UChar32 c) {
172     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173     if(!UCASE_HAS_EXCEPTION(props)) {
174         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175             c+=UCASE_GET_DELTA(props);
176         }
177     } else {
178         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179         uint16_t excWord=*pe++;
180         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181             int32_t delta;
182             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184         }
185         int32_t idx;
186         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187             idx=UCASE_EXC_TITLE;
188         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189             idx=UCASE_EXC_UPPER;
190         } else {
191             return c;
192         }
193         GET_SLOT_VALUE(excWord, idx, pe, c);
194     }
195     return c;
196 }
197 
/*
 * Constant UTF-16 sequences: a base letter (U+0069, U+006A or U+012F)
 * followed by U+0307 (combining dot above), and for the last three
 * a further U+0300, U+0301 or U+0303.
 * The arrays are not NUL-terminated; users pass an explicit length
 * (for example, ucase_addCaseClosure() adds iDot with length 2).
 */
198 static const char16_t iDot[2] = { 0x69, 0x307 };
199 static const char16_t jDot[2] = { 0x6a, 0x307 };
/* NOTE(review): declared with 3 elements but only 2 initializers, so the third element is 0 - confirm whether [2] was intended. */
200 static const char16_t iOgonekDot[3] = { 0x12f, 0x307 };
201 static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
202 static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
203 static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
204 
205 
206 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)207 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
209     if(!UCASE_HAS_EXCEPTION(props)) {
210         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
211             /* add the one simple case mapping, no matter what type it is */
212             int32_t delta=UCASE_GET_DELTA(props);
213             if(delta!=0) {
214                 sa->add(sa->set, c+delta);
215             }
216         }
217     } else {
218         /*
219          * c has exceptions, so there may be multiple simple and/or
220          * full case mappings. Add them all.
221          */
222         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
223         uint16_t excWord=*pe++;
224         const uint16_t *pe0=pe;
225 
226         // Hardcode the case closure of i and its relatives and ignore the
227         // data file data for these characters.
228         // The Turkic dotless i and dotted I with their case mapping conditions
229         // and case folding option make the related characters behave specially.
230         // This code matches their closure behavior to their case folding behavior.
231         if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
232             // These characters have Turkic case foldings. Hardcode their closure.
233             if (c == 0x49) {
234                 // Regular i and I are in one equivalence class.
235                 sa->add(sa->set, 0x69);
236                 return;
237             } else if (c == 0x130) {
238                 // Dotted I is in a class with <0069 0307>
239                 // (for canonical equivalence with <0049 0307>).
240                 sa->addString(sa->set, iDot, 2);
241                 return;
242             }
243         } else if (c == 0x69) {
244             sa->add(sa->set, 0x49);
245             return;
246         } else if (c == 0x131) {
247             // Dotless i is in a class by itself.
248             return;
249         }
250 
251         /* add all simple case mappings */
252         for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253             if(HAS_SLOT(excWord, idx)) {
254                 pe=pe0;
255                 UChar32 mapping;
256                 GET_SLOT_VALUE(excWord, idx, pe, mapping);
257                 sa->add(sa->set, mapping);
258             }
259         }
260         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
261             pe=pe0;
262             int32_t delta;
263             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
264             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
265         }
266 
267         /* get the closure string pointer & length */
268         const char16_t *closure;
269         int32_t closureLength;
270         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
271             pe=pe0;
272             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
273             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
274             closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
275         } else {
276             closureLength=0;
277             closure=nullptr;
278         }
279 
280         /* add the full case folding */
281         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
282             pe=pe0;
283             int32_t fullLength;
284             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
285 
286             /* start of full case mapping strings */
287             ++pe;
288 
289             fullLength&=0xffff; /* bits 16 and higher are reserved */
290 
291             /* skip the lowercase result string */
292             pe+=fullLength&UCASE_FULL_LOWER;
293             fullLength>>=4;
294 
295             /* add the full case folding string */
296             int32_t length=fullLength&0xf;
297             if(length!=0) {
298                 sa->addString(sa->set, (const char16_t *)pe, length);
299                 pe+=length;
300             }
301 
302             /* skip the uppercase and titlecase strings */
303             fullLength>>=4;
304             pe+=fullLength&0xf;
305             fullLength>>=4;
306             pe+=fullLength;
307 
308             closure=(const char16_t *)pe; /* behind full case mappings */
309         }
310 
311         /* add each code point in the closure string */
312         for(int32_t idx=0; idx<closureLength;) {
313             UChar32 mapping;
314             U16_NEXT_UNSAFE(closure, idx, mapping);
315             sa->add(sa->set, mapping);
316         }
317     }
318 }
319 
320 U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c,const USetAdder * sa)321 ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
322     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
323     if(!UCASE_HAS_EXCEPTION(props)) {
324         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
325             /* add the one simple case mapping, no matter what type it is */
326             int32_t delta=UCASE_GET_DELTA(props);
327             if(delta!=0) {
328                 sa->add(sa->set, c+delta);
329             }
330         }
331     } else {
332         // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
333         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
334         uint16_t excWord=*pe++;
335         const uint16_t *pe0=pe;
336 
337         // Hardcode the case closure of i and its relatives and ignore the
338         // data file data for these characters, like in ucase_addCaseClosure().
339         if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
340             // These characters have Turkic case foldings. Hardcode their closure.
341             if (c == 0x49) {
342                 // Regular i and I are in one equivalence class.
343                 sa->add(sa->set, 0x69);
344                 return;
345             } else if (c == 0x130) {
346                 // For scf=Simple_Case_Folding, dotted I is in a class by itself.
347                 return;
348             }
349         } else if (c == 0x69) {
350             sa->add(sa->set, 0x49);
351             return;
352         } else if (c == 0x131) {
353             // Dotless i is in a class by itself.
354             return;
355         }
356 
357         // Add all simple case mappings.
358         for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
359             if(HAS_SLOT(excWord, idx)) {
360                 pe=pe0;
361                 UChar32 mapping;
362                 GET_SLOT_VALUE(excWord, idx, pe, mapping);
363                 sa->add(sa->set, mapping);
364             }
365         }
366         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
367             pe=pe0;
368             int32_t delta;
369             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
370             UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
371             sa->add(sa->set, mapping);
372         }
373 
374         /* get the closure string pointer & length */
375         const char16_t *closure;
376         int32_t closureLength;
377         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
378             pe=pe0;
379             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
380             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
381             closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
382         } else {
383             closureLength=0;
384             closure=nullptr;
385         }
386 
387         // Skip the full case mappings.
388         if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
389             pe=pe0;
390             int32_t fullLength;
391             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
392 
393             /* start of full case mapping strings */
394             ++pe;
395 
396             fullLength&=0xffff; /* bits 16 and higher are reserved */
397 
398             // Skip all 4 full case mappings.
399             pe+=fullLength&UCASE_FULL_LOWER;
400             fullLength>>=4;
401             pe+=fullLength&0xf;
402             fullLength>>=4;
403             pe+=fullLength&0xf;
404             fullLength>>=4;
405             pe+=fullLength;
406 
407             closure=(const char16_t *)pe; /* behind full case mappings */
408         }
409 
410         // Add each code point in the closure string whose scf maps back to c.
411         for(int32_t idx=0; idx<closureLength;) {
412             UChar32 mapping;
413             U16_NEXT_UNSAFE(closure, idx, mapping);
414             sa->add(sa->set, mapping);
415         }
416     }
417 }
418 
/*
 * Compare s, which has exactly length code units, with t, which has at most
 * max code units or is NUL-terminated before that.
 * Requires length>0 and max>0 and length<=max.
 *
 * Returns 0 if both strings are equal,
 * the difference of the first pair of differing code units,
 * 1 if t ends before s does,
 * or the negative value length-max if t continues beyond the end of s.
 */
static inline int32_t
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    /* length<=max is required, so the remaining room in t is fixed up front */
    const int32_t surplus=max-length;
    int32_t i=0;

    do {
        const int32_t unitS=s[i];
        const int32_t unitT=t[i];
        ++i;
        if(unitT==0) {
            return 1; /* reached the end of t but not of s */
        }
        if(unitS!=unitT) {
            return unitS-unitT; /* difference result */
        }
    } while(i<length);

    /* all of s matched: t must end here as well for the strings to be equal */
    if(surplus==0 || t[i]==0) {
        return 0;
    }
    return -surplus; /* length difference */
}
447 
448 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const char16_t * s,int32_t length,const USetAdder * sa)449 ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
450     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
451 
452     if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
453         return false; /* no reverse case folding data, or no string */
454     }
455     if(length<=1) {
456         /* the string is too short to find any match */
457         /*
458          * more precise would be:
459          * if(!u_strHasMoreChar32Than(s, length, 1))
460          * but this does not make much practical difference because
461          * a single supplementary code point would just not be found
462          */
463         return false;
464     }
465 
466     const uint16_t *unfold=ucase_props_singleton.unfold;
467     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
468     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
469     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
470     unfold+=unfoldRowWidth;
471 
472     if(length>unfoldStringWidth) {
473         /* the string is too long to find any match */
474         return false;
475     }
476 
477     /* do a binary search for the string */
478     start=0;
479     limit=unfoldRows;
480     while(start<limit) {
481         i=(start+limit)/2;
482         const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
483         result=strcmpMax(s, length, p, unfoldStringWidth);
484 
485         if(result==0) {
486             /* found the string: add each code point, and its case closure */
487             UChar32 c;
488 
489             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
490                 U16_NEXT_UNSAFE(p, i, c);
491                 sa->add(sa->set, c);
492                 ucase_addCaseClosure(c, sa);
493             }
494             return true;
495         } else if(result<0) {
496             limit=i;
497         } else /* result>0 */ {
498             start=i+1;
499         }
500     }
501 
502     return false; /* string not found */
503 }
504 
505 U_NAMESPACE_BEGIN
506 
/*
 * Sets up iteration over the unfold table of the case properties singleton.
 * The row count, row width and string width are read from the start of the
 * table; the unfold pointer is then advanced by one row width so that it
 * points at the first data row. Iteration starts in row 0 at the first
 * code unit behind the string column.
 *
 * NOTE(review): the initializers of unfoldRows, unfoldRowWidth and
 * unfoldStringWidth read through the member `unfold`, so they presumably rely
 * on `unfold` being declared before them in the class - confirm in ucase.h.
 * NOTE(review): unlike ucase_addStringCaseClosure(), nothing here checks the
 * singleton's unfold pointer for nullptr.
 */
FullCaseFoldingIterator()507 FullCaseFoldingIterator::FullCaseFoldingIterator()
508         : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
509           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
510           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
511           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
512           currentRow(0),
513           rowCpIndex(unfoldStringWidth) {
    // Skip the header row.
514     unfold+=unfoldRowWidth;
515 }
516 
517 UChar32
next(UnicodeString & full)518 FullCaseFoldingIterator::next(UnicodeString &full) {
519     // Advance past the last-delivered code point.
520     const char16_t *p=unfold+(currentRow*unfoldRowWidth);
521     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
522         ++currentRow;
523         p+=unfoldRowWidth;
524         rowCpIndex=unfoldStringWidth;
525     }
526     if(currentRow>=unfoldRows) { return U_SENTINEL; }
527     // Set "full" to the NUL-terminated string in the first unfold column.
528     int32_t length=unfoldStringWidth;
529     while(length>0 && p[length-1]==0) { --length; }
530     full.setTo(false, p, length);
531     // Return the code point.
532     UChar32 c;
533     U16_NEXT_UNSAFE(p, rowCpIndex, c);
534     return c;
535 }
536 
/*
 * Case mapping tables with LIMIT entries each, written 16 entries per row.
 * The layout suggests that the index is the code point and that an entry is
 * the signed delta to add to it (for example, 32 at indexes 0x41..0x5A of
 * the lowercasing tables), with 0 for "no change".
 * NOTE(review): EXC presumably marks code points that cannot be handled with
 * a plain delta; EXC and LIMIT are defined outside this file - confirm in ucase.h.
 */
537 namespace LatinCase {
538
/* Lowercasing deltas; NOTE(review): presumably for locales without special rules - confirm. */
539 const int8_t TO_LOWER_NORMAL[LIMIT] = {
540     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544
545     0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
546     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
547     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549
550     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554
555     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
556     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
557     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
560     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
561     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
562     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
563     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
564
565     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
566     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
567     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
568     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
569 };
570
/*
 * Lowercasing deltas, differing from TO_LOWER_NORMAL only by additional EXC entries
 * at 0x49, 0x4A, 0xCC, 0xCD, 0x128 and 0x12E.
 * NOTE(review): the name suggests Turkish (tr) and Lithuanian (lt) - confirm.
 */
571 const int8_t TO_LOWER_TR_LT[LIMIT] = {
572     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
573     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
574     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
575     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
576
577     0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
578     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
579     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
582     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
584     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
585     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
586
587     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
588     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
589     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
590     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591
592     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
593     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
594     1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
595     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
596
597     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
598     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
599     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
600     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
601 };
602
/* Uppercasing deltas; NOTE(review): presumably for locales without special rules - confirm. */
603 const int8_t TO_UPPER_NORMAL[LIMIT] = {
604     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
605     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
606     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
607     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
608
609     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
610     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611     0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
612     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
613
614     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
615     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
616     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618
619     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
620     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
621     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
622     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
623
624     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
625     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
626     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
627     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
628
629     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
630     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
631     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
632     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
633 };
634
/*
 * Uppercasing deltas, differing from TO_UPPER_NORMAL only by the EXC entry at 0x69.
 * NOTE(review): the name suggests Turkish (tr) - confirm.
 */
635 const int8_t TO_UPPER_TR[LIMIT] = {
636     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
637     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640
641     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
643     0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
644     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
645
646     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
647     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
648     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
649     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
650
651     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
652     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
653     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
654     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
655
656     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
657     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
658     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
659     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
660
661     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
662     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
663     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
664     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
665 };
666
667 }  // namespace LatinCase
668 
669 U_NAMESPACE_END
670 
671 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
672 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)673 ucase_getType(UChar32 c) {
674     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
675     return UCASE_GET_TYPE(props);
676 }
677 
678 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
679 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)680 ucase_getTypeOrIgnorable(UChar32 c) {
681     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
682     return UCASE_GET_TYPE_AND_IGNORABLE(props);
683 }
684 
685 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
686 static inline int32_t
getDotType(UChar32 c)687 getDotType(UChar32 c) {
688     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
689     if(!UCASE_HAS_EXCEPTION(props)) {
690         return props&UCASE_DOT_MASK;
691     } else {
692         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
693         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
694     }
695 }
696 
697 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)698 ucase_isSoftDotted(UChar32 c) {
699     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
700 }
701 
702 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)703 ucase_isCaseSensitive(UChar32 c) {
704     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
705     if(!UCASE_HAS_EXCEPTION(props)) {
706         return (UBool)((props&UCASE_SENSITIVE)!=0);
707     } else {
708         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
709         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
710     }
711 }
712 
713 /* string casing ------------------------------------------------------------ */
714 
715 /*
716  * These internal functions form the core of string case mappings.
717  * They map single code points to result code points or strings and take
718  * all necessary conditions (context, locale ID, options) into account.
719  *
720  * They do not iterate over the source or write to the destination
721  * so that the same functions are useful for non-standard string storage,
722  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
723  * For the same reason, the "surrounding text" context is passed in as a
724  * UCaseContextIterator which does not make any assumptions about
725  * the underlying storage.
726  *
727  * This section contains helper functions that check for conditions
728  * in the input text surrounding the current code point
729  * according to SpecialCasing.txt.
730  *
731  * Each helper function gets the index
732  * - after the current code point if it looks at following text
733  * - before the current code point if it looks at preceding text
734  *
735  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
736  *
737  * Final_Sigma
738  *   C is preceded by a sequence consisting of
739  *     a cased letter and a case-ignorable sequence,
740  *   and C is not followed by a sequence consisting of
741  *     an ignorable sequence and then a cased letter.
742  *
743  * More_Above
744  *   C is followed by one or more characters of combining class 230 (ABOVE)
745  *   in the combining character sequence.
746  *
747  * After_Soft_Dotted
748  *   The last preceding character with combining class of zero before C
749  *   was Soft_Dotted,
750  *   and there is no intervening combining character class 230 (ABOVE).
751  *
752  * Before_Dot
753  *   C is followed by combining dot above (U+0307).
754  *   Any sequence of characters with a combining class that is neither 0 nor 230
755  *   may intervene between the current character and the combining dot above.
756  *
757  * The erratum from 2002-10-31 adds the condition
758  *
759  * After_I
760  *   The last preceding base character was an uppercase I, and there is no
761  *   intervening combining character class 230 (ABOVE).
762  *
763  *   (See Jitterbug 2344 and the comments on After_I below.)
764  *
765  * Helper definitions in Unicode 3.2 UAX 21:
766  *
767  * D1. A character C is defined to be cased
768  *     if it meets any of the following criteria:
769  *
770  *   - The general category of C is Titlecase Letter (Lt)
771  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
772  *   - Given D = NFD(C), then it is not the case that:
773  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
774  *     (This third criterion does not add any characters to the list
775  *      for Unicode 3.2. Ignored.)
776  *
777  * D2. A character C is defined to be case-ignorable
778  *     if it meets either of the following criteria:
779  *
780  *   - The general category of C is
781  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
782  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
783  *   - C is one of the following characters
784  *     U+0027 APOSTROPHE
785  *     U+00AD SOFT HYPHEN (SHY)
786  *     U+2019 RIGHT SINGLE QUOTATION MARK
787  *            (the preferred character for apostrophe)
788  *
789  * D3. A case-ignorable sequence is a sequence of
790  *     zero or more case-ignorable characters.
791  */
792 
/*
 * Case-insensitive single-letter tests for the bytes of a locale ID.
 * Each test accepts exactly the lowercase and the uppercase form of one letter.
 * The argument is evaluated twice, so pass only side-effect-free expressions.
 */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) is_either(c, 'd', 'D')
#define is_e(c) is_either(c, 'e', 'E')
#define is_i(c) is_either(c, 'i', 'I')
#define is_l(c) is_either(c, 'l', 'L')
#define is_r(c) is_either(c, 'r', 'R')
#define is_t(c) is_either(c, 't', 'T')
#define is_u(c) is_either(c, 'u', 'U')
#define is_y(c) is_either(c, 'y', 'Y')
#define is_z(c) is_either(c, 'z', 'Z')

/* Does c end a language code? True for '_', '-' and the terminating NUL. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
805 
806 /**
807  * Requires non-nullptr locale ID but otherwise does the equivalent of
808  * checking for language codes as if uloc_getLanguage() were called:
809  * Accepts both 2- and 3-letter codes and accepts case variants.
810  */
811 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)812 ucase_getCaseLocale(const char *locale) {
813     /*
814      * This function used to use uloc_getLanguage(), but the current code
815      * removes the dependency of this low-level code on uloc implementation code
816      * and is faster because not the whole locale ID has to be
817      * examined and copied/transformed.
818      *
819      * Because this code does not want to depend on uloc, the caller must
820      * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
821      */
822     char c=*locale++;
823     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
824     // and for Chinese "zh": Very common but no special case mapping behavior.
825     // Then check lowercase vs. uppercase to reduce the number of comparisons
826     // for other locales without special behavior.
827     if(c=='e') {
828         /* el or ell? */
829         c=*locale++;
830         if(is_l(c)) {
831             c=*locale++;
832             if(is_l(c)) {
833                 c=*locale;
834             }
835             if(is_sep(c)) {
836                 return UCASE_LOC_GREEK;
837             }
838         }
839         // en, es, ... -> root
840     } else if(c=='z') {
841         return UCASE_LOC_ROOT;
842 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
843     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
844 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
845     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
846 #else
847 #   error Unknown charset family!
848 #endif
849         // lowercase c
850         if(c=='t') {
851             /* tr or tur? */
852             c=*locale++;
853             if(is_u(c)) {
854                 c=*locale++;
855             }
856             if(is_r(c)) {
857                 c=*locale;
858                 if(is_sep(c)) {
859                     return UCASE_LOC_TURKISH;
860                 }
861             }
862         } else if(c=='a') {
863             /* az or aze? */
864             c=*locale++;
865             if(is_z(c)) {
866                 c=*locale++;
867                 if(is_e(c)) {
868                     c=*locale;
869                 }
870                 if(is_sep(c)) {
871                     return UCASE_LOC_TURKISH;
872                 }
873             }
874         } else if(c=='l') {
875             /* lt or lit? */
876             c=*locale++;
877             if(is_i(c)) {
878                 c=*locale++;
879             }
880             if(is_t(c)) {
881                 c=*locale;
882                 if(is_sep(c)) {
883                     return UCASE_LOC_LITHUANIAN;
884                 }
885             }
886         } else if(c=='n') {
887             /* nl or nld? */
888             c=*locale++;
889             if(is_l(c)) {
890                 c=*locale++;
891                 if(is_d(c)) {
892                     c=*locale;
893                 }
894                 if(is_sep(c)) {
895                     return UCASE_LOC_DUTCH;
896                 }
897             }
898         } else if(c=='h') {
899             /* hy or hye? *not* hyw */
900             c=*locale++;
901             if(is_y(c)) {
902                 c=*locale++;
903                 if(is_e(c)) {
904                     c=*locale;
905                 }
906                 if(is_sep(c)) {
907                     return UCASE_LOC_ARMENIAN;
908                 }
909             }
910         }
911     } else {
912         // uppercase c
913         // Same code as for lowercase c but also check for 'E'.
914         if(c=='T') {
915             /* tr or tur? */
916             c=*locale++;
917             if(is_u(c)) {
918                 c=*locale++;
919             }
920             if(is_r(c)) {
921                 c=*locale;
922                 if(is_sep(c)) {
923                     return UCASE_LOC_TURKISH;
924                 }
925             }
926         } else if(c=='A') {
927             /* az or aze? */
928             c=*locale++;
929             if(is_z(c)) {
930                 c=*locale++;
931                 if(is_e(c)) {
932                     c=*locale;
933                 }
934                 if(is_sep(c)) {
935                     return UCASE_LOC_TURKISH;
936                 }
937             }
938         } else if(c=='L') {
939             /* lt or lit? */
940             c=*locale++;
941             if(is_i(c)) {
942                 c=*locale++;
943             }
944             if(is_t(c)) {
945                 c=*locale;
946                 if(is_sep(c)) {
947                     return UCASE_LOC_LITHUANIAN;
948                 }
949             }
950         } else if(c=='E') {
951             /* el or ell? */
952             c=*locale++;
953             if(is_l(c)) {
954                 c=*locale++;
955                 if(is_l(c)) {
956                     c=*locale;
957                 }
958                 if(is_sep(c)) {
959                     return UCASE_LOC_GREEK;
960                 }
961             }
962         } else if(c=='N') {
963             /* nl or nld? */
964             c=*locale++;
965             if(is_l(c)) {
966                 c=*locale++;
967                 if(is_d(c)) {
968                     c=*locale;
969                 }
970                 if(is_sep(c)) {
971                     return UCASE_LOC_DUTCH;
972                 }
973             }
974         } else if(c=='H') {
975             /* hy or hye? *not* hyw */
976             c=*locale++;
977             if(is_y(c)) {
978                 c=*locale++;
979                 if(is_e(c)) {
980                     c=*locale;
981                 }
982                 if(is_sep(c)) {
983                     return UCASE_LOC_ARMENIAN;
984                 }
985             }
986         }
987     }
988     return UCASE_LOC_ROOT;
989 }
990 
991 /*
992  * Is followed by
993  *   {case-ignorable}* cased
994  * ?
995  * (dir determines looking forward/backward)
996  * If a character is case-ignorable, it is skipped regardless of whether
997  * it is also cased or not.
998  */
999 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)1000 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
1001     UChar32 c;
1002 
1003     if(iter==nullptr) {
1004         return false;
1005     }
1006 
1007     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
1008         int32_t type=ucase_getTypeOrIgnorable(c);
1009         if(type&4) {
1010             /* case-ignorable, continue with the loop */
1011         } else if(type!=UCASE_NONE) {
1012             return true; /* followed by cased letter */
1013         } else {
1014             return false; /* uncased and not case-ignorable */
1015         }
1016     }
1017 
1018     return false; /* not followed by cased letter */
1019 }
1020 
1021 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
1022 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)1023 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
1024     UChar32 c;
1025     int32_t dotType;
1026     int8_t dir;
1027 
1028     if(iter==nullptr) {
1029         return false;
1030     }
1031 
1032     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1033         dotType=getDotType(c);
1034         if(dotType==UCASE_SOFT_DOTTED) {
1035             return true; /* preceded by TYPE_i */
1036         } else if(dotType!=UCASE_OTHER_ACCENT) {
1037             return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
1038         }
1039     }
1040 
1041     return false; /* not preceded by TYPE_i */
1042 }
1043 
1044 /*
1045  * See Jitterbug 2344:
1046  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
1047  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
1048  * we made those releases compatible with Unicode 3.2 which had not fixed
1049  * a related bug in SpecialCasing.txt.
1050  *
1051  * From the Jitterbug 2344 text:
1052  * ... this bug is listed as a Unicode erratum
1053  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
1054  * <quote>
1055  * There are two errors in SpecialCasing.txt.
1056  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
1057  * 2. An incorrect context definition. Correct as follows:
1058  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
1059  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
1060  * ---
1061  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1062  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1063  * where the context After_I is defined as:
1064  * The last preceding base character was an uppercase I, and there is no
1065  * intervening combining character class 230 (ABOVE).
1066  * </quote>
1067  *
1068  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
1069  *
1070  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1071  * # This matches the behavior of the canonically equivalent I-dot_above
1072  *
1073  * See also the description in this place in older versions of uchar.c (revision 1.100).
1074  *
1075  * Markus W. Scherer 2003-feb-15
1076  */
1077 
1078 /* Is preceded by base character 'I' with no intervening cc=230 ? */
1079 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)1080 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
1081     UChar32 c;
1082     int32_t dotType;
1083     int8_t dir;
1084 
1085     if(iter==nullptr) {
1086         return false;
1087     }
1088 
1089     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1090         if(c==0x49) {
1091             return true; /* preceded by I */
1092         }
1093         dotType=getDotType(c);
1094         if(dotType!=UCASE_OTHER_ACCENT) {
1095             return false; /* preceded by different base character (not I), or intervening cc==230 */
1096         }
1097     }
1098 
1099     return false; /* not preceded by I */
1100 }
1101 
1102 /* Is followed by one or more cc==230 ? */
1103 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)1104 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1105     UChar32 c;
1106     int32_t dotType;
1107     int8_t dir;
1108 
1109     if(iter==nullptr) {
1110         return false;
1111     }
1112 
1113     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1114         dotType=getDotType(c);
1115         if(dotType==UCASE_ABOVE) {
1116             return true; /* at least one cc==230 following */
1117         } else if(dotType!=UCASE_OTHER_ACCENT) {
1118             return false; /* next base character, no more cc==230 following */
1119         }
1120     }
1121 
1122     return false; /* no more cc==230 following */
1123 }
1124 
1125 /* Is followed by a dot above (without cc==230 in between) ? */
1126 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1127 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1128     UChar32 c;
1129     int32_t dotType;
1130     int8_t dir;
1131 
1132     if(iter==nullptr) {
1133         return false;
1134     }
1135 
1136     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1137         if(c==0x307) {
1138             return true;
1139         }
1140         dotType=getDotType(c);
1141         if(dotType!=UCASE_OTHER_ACCENT) {
1142             return false; /* next base character or cc==230 in between */
1143         }
1144     }
1145 
1146     return false; /* no dot above following */
1147 }
1148 
1149 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc)1150 ucase_toFullLower(UChar32 c,
1151                   UCaseContextIterator *iter, void *context,
1152                   const char16_t **pString,
1153                   int32_t loc) {
1154     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1155     U_ASSERT(c >= 0);
1156     UChar32 result=c;
1157     // Reset the output pointer in case it was uninitialized.
1158     *pString=nullptr;
1159     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1160     if(!UCASE_HAS_EXCEPTION(props)) {
1161         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1162             result=c+UCASE_GET_DELTA(props);
1163         }
1164     } else {
1165         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1166         uint16_t excWord=*pe++;
1167         int32_t full;
1168 
1169         pe2=pe;
1170 
1171         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1172             /* use hardcoded conditions and mappings */
1173 
1174             /*
1175              * Test for conditional mappings first
1176              *   (otherwise the unconditional default mappings are always taken),
1177              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1178              * then get the UnicodeData.txt mappings.
1179              */
1180             if( loc==UCASE_LOC_LITHUANIAN &&
1181                     /* base characters, find accents above */
1182                     (((c==0x49 || c==0x4a || c==0x12e) &&
1183                         isFollowedByMoreAbove(iter, context)) ||
1184                     /* precomposed with accent above, no need to find one */
1185                     (c==0xcc || c==0xcd || c==0x128))
1186             ) {
1187                 /*
1188                     # Lithuanian
1189 
1190                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1191 
1192                     # Introduce an explicit dot above when lowercasing capital I's and J's
1193                     # whenever there are more accents above.
1194                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1195 
1196                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1197                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1198                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1199                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1200                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1201                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1202                  */
1203                 switch(c) {
1204                 case 0x49:  /* LATIN CAPITAL LETTER I */
1205                     *pString=iDot;
1206                     return 2;
1207                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1208                     *pString=jDot;
1209                     return 2;
1210                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1211                     *pString=iOgonekDot;
1212                     return 2;
1213                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1214                     *pString=iDotGrave;
1215                     return 3;
1216                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1217                     *pString=iDotAcute;
1218                     return 3;
1219                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1220                     *pString=iDotTilde;
1221                     return 3;
1222                 default:
1223                     return 0; /* will not occur */
1224                 }
1225             /* # Turkish and Azeri */
1226             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1227                 /*
1228                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1229                     # The following rules handle those cases.
1230 
1231                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1232                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1233                  */
1234                 return 0x69;
1235             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1236                 /*
1237                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1238                     # This matches the behavior of the canonically equivalent I-dot_above
1239 
1240                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1241                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1242                  */
1243                 return 0; /* remove the dot (continue without output) */
1244             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1245                 /*
1246                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1247 
1248                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1249                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1250                  */
1251                 return 0x131;
1252             } else if(c==0x130) {
1253                 /*
1254                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1255 
1256                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1257                  */
1258                 *pString=iDot;
1259                 return 2;
1260             } else if(  c==0x3a3 &&
1261                         !isFollowedByCasedLetter(iter, context, 1) &&
1262                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1263             ) {
1264                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1265                 /*
1266                     # Special case for final form of sigma
1267 
1268                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1269                  */
1270                 return 0x3c2; /* greek small final sigma */
1271             } else {
1272                 /* no known conditional special case mapping, use a normal mapping */
1273             }
1274         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1276             full&=UCASE_FULL_LOWER;
1277             if(full!=0) {
1278                 /* set the output pointer to the lowercase mapping */
1279                 *pString=reinterpret_cast<const char16_t *>(pe+1);
1280 
1281                 /* return the string length */
1282                 return full;
1283             }
1284         }
1285 
1286         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1287             int32_t delta;
1288             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1289             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1290         }
1291         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1292             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1293         }
1294     }
1295 
1296     return (result==c) ? ~result : result;
1297 }
1298 
1299 /* internal */
1300 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc,UBool upperNotTitle)1301 toUpperOrTitle(UChar32 c,
1302                UCaseContextIterator *iter, void *context,
1303                const char16_t **pString,
1304                int32_t loc,
1305                UBool upperNotTitle) {
1306     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1307     U_ASSERT(c >= 0);
1308     UChar32 result=c;
1309     // Reset the output pointer in case it was uninitialized.
1310     *pString=nullptr;
1311     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1312     if(!UCASE_HAS_EXCEPTION(props)) {
1313         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1314             result=c+UCASE_GET_DELTA(props);
1315         }
1316     } else {
1317         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1318         uint16_t excWord=*pe++;
1319         int32_t full, idx;
1320 
1321         pe2=pe;
1322 
1323         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1324             /* use hardcoded conditions and mappings */
1325             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1326                 /*
1327                     # Turkish and Azeri
1328 
1329                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1330                     # The following rules handle those cases.
1331 
1332                     # When uppercasing, i turns into a dotted capital I
1333 
1334                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1335                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1336                 */
1337                 return 0x130;
1338             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1339                 /*
1340                     # Lithuanian
1341 
1342                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1343 
1344                     # Remove DOT ABOVE after "i" with upper or titlecase
1345 
1346                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1347                  */
1348                 return 0; /* remove the dot (continue without output) */
1349             } else if(c==0x0587) {
1350                 // See ICU-13416:
1351                 // և ligature ech-yiwn
1352                 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1353                 // but to ԵՎ=ech+vew in Eastern Armenian.
1354                 if(loc==UCASE_LOC_ARMENIAN) {
1355                     *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1356                 } else {
1357                     *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1358                 }
1359                 return 2;
1360             } else {
1361                 /* no known conditional special case mapping, use a normal mapping */
1362             }
1363         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1364             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1365 
1366             /* start of full case mapping strings */
1367             ++pe;
1368 
1369             /* skip the lowercase and case-folding result strings */
1370             pe+=full&UCASE_FULL_LOWER;
1371             full>>=4;
1372             pe+=full&0xf;
1373             full>>=4;
1374 
1375             if(upperNotTitle) {
1376                 full&=0xf;
1377             } else {
1378                 /* skip the uppercase result string */
1379                 pe+=full&0xf;
1380                 full=(full>>4)&0xf;
1381             }
1382 
1383             if(full!=0) {
1384                 /* set the output pointer to the result string */
1385                 *pString=reinterpret_cast<const char16_t *>(pe);
1386 
1387                 /* return the string length */
1388                 return full;
1389             }
1390         }
1391 
1392         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1393             int32_t delta;
1394             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1395             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1396         }
1397         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1398             idx=UCASE_EXC_TITLE;
1399         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1400             /* here, titlecase is same as uppercase */
1401             idx=UCASE_EXC_UPPER;
1402         } else {
1403             return ~c;
1404         }
1405         GET_SLOT_VALUE(excWord, idx, pe2, result);
1406     }
1407 
1408     return (result==c) ? ~result : result;
1409 }
1410 
1411 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1412 ucase_toFullUpper(UChar32 c,
1413                   UCaseContextIterator *iter, void *context,
1414                   const char16_t **pString,
1415                   int32_t caseLocale) {
1416     return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
1417 }
1418 
1419 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1420 ucase_toFullTitle(UChar32 c,
1421                   UCaseContextIterator *iter, void *context,
1422                   const char16_t **pString,
1423                   int32_t caseLocale) {
1424     return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
1425 }
1426 
1427 /* case folding ------------------------------------------------------------- */
1428 
1429 /*
1430  * Case folding is similar to lowercasing.
1431  * The result may be a simple mapping, i.e., a single code point, or
1432  * a full mapping, i.e., a string.
1433  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1434  * then only the lowercase mapping is stored.
1435  *
1436  * Some special cases are hardcoded because their conditions cannot be
1437  * parsed and processed from CaseFolding.txt.
1438  *
1439  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1440 
1441 # C: common case folding, common mappings shared by both simple and full mappings.
1442 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1443 # S: simple case folding, mappings to single characters where different from F.
1444 # T: special case for uppercase I and dotted uppercase I
1445 #    - For non-Turkic languages, this mapping is normally not used.
1446 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1447 #
1448 # Usage:
1449 #  A. To do a simple case folding, use the mappings with status C + S.
1450 #  B. To do a full case folding, use the mappings with status C + F.
1451 #
1452 #    The mappings with status T can be used or omitted depending on the desired case-folding
1453 #    behavior. (The default option is to exclude them.)
1454 
1455  * Unicode 3.2 has 'T' mappings as follows:
1456 
1457 0049; T; 0131; # LATIN CAPITAL LETTER I
1458 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1459 
1460  * while the default mappings for these code points are:
1461 
1462 0049; C; 0069; # LATIN CAPITAL LETTER I
1463 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1464 
1465  * U+0130 has no simple case folding (simple-case-folds to itself).
1466  */
1467 
1468 /* return the simple case folding mapping for c */
1469 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1470 ucase_fold(UChar32 c, uint32_t options) {
1471     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1472     if(!UCASE_HAS_EXCEPTION(props)) {
1473         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1474             c+=UCASE_GET_DELTA(props);
1475         }
1476     } else {
1477         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1478         uint16_t excWord=*pe++;
1479         int32_t idx;
1480         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1481             /* special case folding mappings, hardcoded */
1482             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1483                 /* default mappings */
1484                 if(c==0x49) {
1485                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1486                     return 0x69;
1487                 } else if(c==0x130) {
1488                     /* no simple case folding for U+0130 */
1489                     return c;
1490                 }
1491             } else {
1492                 /* Turkic mappings */
1493                 if(c==0x49) {
1494                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1495                     return 0x131;
1496                 } else if(c==0x130) {
1497                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1498                     return 0x69;
1499                 }
1500             }
1501         }
1502         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1503             return c;
1504         }
1505         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1506             int32_t delta;
1507             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1508             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1509         }
1510         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1511             idx=UCASE_EXC_FOLD;
1512         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1513             idx=UCASE_EXC_LOWER;
1514         } else {
1515             return c;
1516         }
1517         GET_SLOT_VALUE(excWord, idx, pe, c);
1518     }
1519     return c;
1520 }
1521 
1522 /*
1523  * Issue for canonical caseless match (UAX #21):
1524  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1525  * canonical equivalence, unlike default-option casefolding.
1526  * For example, I-grave and I + grave fold to strings that are not canonically
1527  * equivalent.
1528  * For more details, see the comment in unorm_compare() in unorm.cpp
1529  * and the intermediate prototype changes for Jitterbug 2021.
1530  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1531  *
1532  * This did not get fixed because it appears that it is not possible to fix
1533  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1534  * together in a way that they still fold to common result strings.
1535  */
1536 
1537 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const char16_t ** pString,uint32_t options)1538 ucase_toFullFolding(UChar32 c,
1539                     const char16_t **pString,
1540                     uint32_t options) {
1541     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1542     U_ASSERT(c >= 0);
1543     UChar32 result=c;
1544     // Reset the output pointer in case it was uninitialized.
1545     *pString=nullptr;
1546     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1547     if(!UCASE_HAS_EXCEPTION(props)) {
1548         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1549             result=c+UCASE_GET_DELTA(props);
1550         }
1551     } else {
1552         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1553         uint16_t excWord=*pe++;
1554         int32_t full, idx;
1555 
1556         pe2=pe;
1557 
1558         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1559             /* use hardcoded conditions and mappings */
1560             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1561                 /* default mappings */
1562                 if(c==0x49) {
1563                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1564                     return 0x69;
1565                 } else if(c==0x130) {
1566                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1567                     *pString=iDot;
1568                     return 2;
1569                 }
1570             } else {
1571                 /* Turkic mappings */
1572                 if(c==0x49) {
1573                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1574                     return 0x131;
1575                 } else if(c==0x130) {
1576                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1577                     return 0x69;
1578                 }
1579             }
1580         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1581             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1582 
1583             /* start of full case mapping strings */
1584             ++pe;
1585 
1586             /* skip the lowercase result string */
1587             pe+=full&UCASE_FULL_LOWER;
1588             full=(full>>4)&0xf;
1589 
1590             if(full!=0) {
1591                 /* set the output pointer to the result string */
1592                 *pString=reinterpret_cast<const char16_t *>(pe);
1593 
1594                 /* return the string length */
1595                 return full;
1596             }
1597         }
1598 
1599         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1600             return ~c;
1601         }
1602         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1603             int32_t delta;
1604             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1605             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1606         }
1607         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1608             idx=UCASE_EXC_FOLD;
1609         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1610             idx=UCASE_EXC_LOWER;
1611         } else {
1612             return ~c;
1613         }
1614         GET_SLOT_VALUE(excWord, idx, pe2, result);
1615     }
1616 
1617     return (result==c) ? ~result : result;
1618 }
1619 
1620 /* case mapping properties API ---------------------------------------------- */
1621 
1622 /* public API (see uchar.h) */
1623 
1624 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1625 u_isULowercase(UChar32 c) {
1626     return (UBool)(UCASE_LOWER==ucase_getType(c));
1627 }
1628 
1629 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1630 u_isUUppercase(UChar32 c) {
1631     return (UBool)(UCASE_UPPER==ucase_getType(c));
1632 }
1633 
1634 /* Transforms the Unicode character to its lower case equivalent.*/
1635 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1636 u_tolower(UChar32 c) {
1637     return ucase_tolower(c);
1638 }
1639 
1640 /* Transforms the Unicode character to its upper case equivalent.*/
1641 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1642 u_toupper(UChar32 c) {
1643     return ucase_toupper(c);
1644 }
1645 
1646 /* Transforms the Unicode character to its title case equivalent.*/
1647 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1648 u_totitle(UChar32 c) {
1649     return ucase_totitle(c);
1650 }
1651 
1652 /* return the simple case folding mapping for c */
1653 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1654 u_foldCase(UChar32 c, uint32_t options) {
1655     return ucase_fold(c, options);
1656 }
1657 
1658 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1659 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1660     /* case mapping properties */
1661     const char16_t *resultString;
1662     switch(which) {
1663     case UCHAR_LOWERCASE:
1664         return (UBool)(UCASE_LOWER==ucase_getType(c));
1665     case UCHAR_UPPERCASE:
1666         return (UBool)(UCASE_UPPER==ucase_getType(c));
1667     case UCHAR_SOFT_DOTTED:
1668         return ucase_isSoftDotted(c);
1669     case UCHAR_CASE_SENSITIVE:
1670         return ucase_isCaseSensitive(c);
1671     case UCHAR_CASED:
1672         return (UBool)(UCASE_NONE!=ucase_getType(c));
1673     case UCHAR_CASE_IGNORABLE:
1674         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1675     /*
1676      * Note: The following Changes_When_Xyz are defined as testing whether
1677      * the NFD form of the input changes when Xyz-case-mapped.
1678      * However, this simpler implementation of these properties,
1679      * ignoring NFD, passes the tests.
1680      * The implementation needs to be changed if the tests start failing.
1681      * When that happens, optimizations should be used to work with the
1682      * per-single-code point ucase_toFullXyz() functions unless
1683      * the NFD form has more than one code point,
1684      * and the property starts set needs to be the union of the
1685      * start sets for normalization and case mappings.
1686      */
1687     case UCHAR_CHANGES_WHEN_LOWERCASED:
1688         return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1689     case UCHAR_CHANGES_WHEN_UPPERCASED:
1690         return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1691     case UCHAR_CHANGES_WHEN_TITLECASED:
1692         return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1693     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1694     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1695         return (UBool)(
1696             ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1697             ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1698             ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1699     default:
1700         return false;
1701     }
1702 }
1703