1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2004-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucase.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2004aug30
16 * created by: Markus W. Scherer
17 *
18 * Low-level Unicode character/string case mapping code.
19 * Much code moved here (and modified) from uchar.c.
20 */
21
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "uassert.h"
28 #include "ucase.h"
29 #include "umutex.h"
30 #include "utrie2.h"
31
32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33 #define INCLUDED_FROM_UCASE_CPP
34 #include "ucase_props_data.h"
35
36 /* set of property starts for UnicodeSet ------------------------------------ */
37
38 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40 /* add the start code point to the USet */
41 const USetAdder *sa=(const USetAdder *)context;
42 sa->add(sa->set, start);
43 return true;
44 }
45
46 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48 if(U_FAILURE(*pErrorCode)) {
49 return;
50 }
51
52 /* add the start code point of each same-value range of the trie */
53 utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
54
55 /* add code points with hardcoded properties, plus the ones following them */
56
57 /* (none right now, see comment below) */
58
59 /*
60 * Omit code points with hardcoded specialcasing properties
61 * because we do not build property UnicodeSets for them right now.
62 */
63 }
64
65 /* data access primitives --------------------------------------------------- */
66
67 U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t * pExceptionsLength,int32_t * pUnfoldLength)68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69 *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70 *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71 return &ucase_props_singleton;
72 }
73
74 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()75 ucase_getTrie() {
76 return &ucase_props_singleton.trie;
77 }
78
/*
 * Address of the exceptions entry for a trie word:
 * the index into csp->exceptions is stored in props above UCASE_EXC_SHIFT.
 */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])
80
/*
 * flagsOffset[b] is the number of set bits in the 8-bit value b.
 * The 256 entries are generated by nesting: each helper emits the counts for
 * two more low-order bits (00, 01, 10, 11 add 0, 1, 1, 2 to the count).
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};
#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2
100
/* Nonzero if the exceptions word has the optional slot with index idx. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of present slots that precede slot idx (bits set below bit idx). */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Slots are one uint16_t wide unless UCASE_EXC_DOUBLE_SLOTS is set in excWord,
 * in which case each slot is two uint16_t, read high half first.
 *
 * Every use of pExc16 is parenthesized so that the argument may be any
 * modifiable pointer expression (for example *pp), not just an identifier.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    } \
} UPRV_BLOCK_MACRO_END
123
124 /* simple case mappings ----------------------------------------------------- */
125
126 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)127 ucase_tolower(UChar32 c) {
128 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129 if(!UCASE_HAS_EXCEPTION(props)) {
130 if(UCASE_IS_UPPER_OR_TITLE(props)) {
131 c+=UCASE_GET_DELTA(props);
132 }
133 } else {
134 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135 uint16_t excWord=*pe++;
136 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137 int32_t delta;
138 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140 }
141 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143 }
144 }
145 return c;
146 }
147
148 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)149 ucase_toupper(UChar32 c) {
150 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151 if(!UCASE_HAS_EXCEPTION(props)) {
152 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153 c+=UCASE_GET_DELTA(props);
154 }
155 } else {
156 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157 uint16_t excWord=*pe++;
158 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159 int32_t delta;
160 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162 }
163 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165 }
166 }
167 return c;
168 }
169
170 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)171 ucase_totitle(UChar32 c) {
172 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173 if(!UCASE_HAS_EXCEPTION(props)) {
174 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175 c+=UCASE_GET_DELTA(props);
176 }
177 } else {
178 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179 uint16_t excWord=*pe++;
180 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181 int32_t delta;
182 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184 }
185 int32_t idx;
186 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187 idx=UCASE_EXC_TITLE;
188 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189 idx=UCASE_EXC_UPPER;
190 } else {
191 return c;
192 }
193 GET_SLOT_VALUE(excWord, idx, pe, c);
194 }
195 return c;
196 }
197
/* Fixed UTF-16 sequences built from i/j/i-ogonek and combining marks. */
static const char16_t iDot[2] = { 0x0069, 0x0307 };               /* i, dot above */
static const char16_t jDot[2] = { 0x006a, 0x0307 };               /* j, dot above */
/* NOTE(review): declared with 3 elements but only 2 are given; the third is zero. */
static const char16_t iOgonekDot[3] = { 0x012f, 0x0307, 0 };      /* i-ogonek, dot above */
static const char16_t iDotGrave[3] = { 0x0069, 0x0307, 0x0300 };  /* i, dot above, grave */
static const char16_t iDotAcute[3] = { 0x0069, 0x0307, 0x0301 };  /* i, dot above, acute */
static const char16_t iDotTilde[3] = { 0x0069, 0x0307, 0x0303 };  /* i, dot above, tilde */
204
205
206 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)207 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
209 if(!UCASE_HAS_EXCEPTION(props)) {
210 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
211 /* add the one simple case mapping, no matter what type it is */
212 int32_t delta=UCASE_GET_DELTA(props);
213 if(delta!=0) {
214 sa->add(sa->set, c+delta);
215 }
216 }
217 } else {
218 /*
219 * c has exceptions, so there may be multiple simple and/or
220 * full case mappings. Add them all.
221 */
222 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
223 uint16_t excWord=*pe++;
224 const uint16_t *pe0=pe;
225
226 // Hardcode the case closure of i and its relatives and ignore the
227 // data file data for these characters.
228 // The Turkic dotless i and dotted I with their case mapping conditions
229 // and case folding option make the related characters behave specially.
230 // This code matches their closure behavior to their case folding behavior.
231 if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
232 // These characters have Turkic case foldings. Hardcode their closure.
233 if (c == 0x49) {
234 // Regular i and I are in one equivalence class.
235 sa->add(sa->set, 0x69);
236 return;
237 } else if (c == 0x130) {
238 // Dotted I is in a class with <0069 0307>
239 // (for canonical equivalence with <0049 0307>).
240 sa->addString(sa->set, iDot, 2);
241 return;
242 }
243 } else if (c == 0x69) {
244 sa->add(sa->set, 0x49);
245 return;
246 } else if (c == 0x131) {
247 // Dotless i is in a class by itself.
248 return;
249 }
250
251 /* add all simple case mappings */
252 for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253 if(HAS_SLOT(excWord, idx)) {
254 pe=pe0;
255 UChar32 mapping;
256 GET_SLOT_VALUE(excWord, idx, pe, mapping);
257 sa->add(sa->set, mapping);
258 }
259 }
260 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
261 pe=pe0;
262 int32_t delta;
263 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
264 sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
265 }
266
267 /* get the closure string pointer & length */
268 const char16_t *closure;
269 int32_t closureLength;
270 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
271 pe=pe0;
272 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
273 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
274 closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
275 } else {
276 closureLength=0;
277 closure=nullptr;
278 }
279
280 /* add the full case folding */
281 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
282 pe=pe0;
283 int32_t fullLength;
284 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
285
286 /* start of full case mapping strings */
287 ++pe;
288
289 fullLength&=0xffff; /* bits 16 and higher are reserved */
290
291 /* skip the lowercase result string */
292 pe+=fullLength&UCASE_FULL_LOWER;
293 fullLength>>=4;
294
295 /* add the full case folding string */
296 int32_t length=fullLength&0xf;
297 if(length!=0) {
298 sa->addString(sa->set, (const char16_t *)pe, length);
299 pe+=length;
300 }
301
302 /* skip the uppercase and titlecase strings */
303 fullLength>>=4;
304 pe+=fullLength&0xf;
305 fullLength>>=4;
306 pe+=fullLength;
307
308 closure=(const char16_t *)pe; /* behind full case mappings */
309 }
310
311 /* add each code point in the closure string */
312 for(int32_t idx=0; idx<closureLength;) {
313 UChar32 mapping;
314 U16_NEXT_UNSAFE(closure, idx, mapping);
315 sa->add(sa->set, mapping);
316 }
317 }
318 }
319
320 U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c,const USetAdder * sa)321 ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
322 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
323 if(!UCASE_HAS_EXCEPTION(props)) {
324 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
325 /* add the one simple case mapping, no matter what type it is */
326 int32_t delta=UCASE_GET_DELTA(props);
327 if(delta!=0) {
328 sa->add(sa->set, c+delta);
329 }
330 }
331 } else {
332 // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
333 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
334 uint16_t excWord=*pe++;
335 const uint16_t *pe0=pe;
336
337 // Hardcode the case closure of i and its relatives and ignore the
338 // data file data for these characters, like in ucase_addCaseClosure().
339 if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
340 // These characters have Turkic case foldings. Hardcode their closure.
341 if (c == 0x49) {
342 // Regular i and I are in one equivalence class.
343 sa->add(sa->set, 0x69);
344 return;
345 } else if (c == 0x130) {
346 // For scf=Simple_Case_Folding, dotted I is in a class by itself.
347 return;
348 }
349 } else if (c == 0x69) {
350 sa->add(sa->set, 0x49);
351 return;
352 } else if (c == 0x131) {
353 // Dotless i is in a class by itself.
354 return;
355 }
356
357 // Add all simple case mappings.
358 for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
359 if(HAS_SLOT(excWord, idx)) {
360 pe=pe0;
361 UChar32 mapping;
362 GET_SLOT_VALUE(excWord, idx, pe, mapping);
363 sa->add(sa->set, mapping);
364 }
365 }
366 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
367 pe=pe0;
368 int32_t delta;
369 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
370 UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
371 sa->add(sa->set, mapping);
372 }
373
374 /* get the closure string pointer & length */
375 const char16_t *closure;
376 int32_t closureLength;
377 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
378 pe=pe0;
379 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
380 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
381 closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
382 } else {
383 closureLength=0;
384 closure=nullptr;
385 }
386
387 // Skip the full case mappings.
388 if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
389 pe=pe0;
390 int32_t fullLength;
391 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
392
393 /* start of full case mapping strings */
394 ++pe;
395
396 fullLength&=0xffff; /* bits 16 and higher are reserved */
397
398 // Skip all 4 full case mappings.
399 pe+=fullLength&UCASE_FULL_LOWER;
400 fullLength>>=4;
401 pe+=fullLength&0xf;
402 fullLength>>=4;
403 pe+=fullLength&0xf;
404 fullLength>>=4;
405 pe+=fullLength;
406
407 closure=(const char16_t *)pe; /* behind full case mappings */
408 }
409
410 // Add each code point in the closure string whose scf maps back to c.
411 for(int32_t idx=0; idx<closureLength;) {
412 UChar32 mapping;
413 U16_NEXT_UNSAFE(closure, idx, mapping);
414 sa->add(sa->set, mapping);
415 }
416 }
417 }
418
/*
 * Compares s, which has an explicit length, with t, which is either
 * NUL-terminated or exactly max code units long.
 * Preconditions: length>0 and max>0 and length<=max.
 *
 * Returns
 *   1                    if t ends (NUL) before s does,
 *   s[i]-t[i]            at the first differing code unit,
 *   0                    if both strings are equal,
 *   -(max-length)        if t continues beyond the end of s.
 */
static inline int32_t
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    /* length<=max is required, so the remainder never needs updating in the loop */
    const int32_t remainder=max-length;
    int32_t i=0;

    for(;;) {
        const int32_t su=s[i];
        const int32_t tu=t[i];
        if(tu==0) {
            return 1; /* reached the end of t but not of s */
        }
        if(su!=tu) {
            return su-tu; /* difference of the code units */
        }
        if(++i>=length) {
            break; /* all of s was compared */
        }
    }

    /* t[i] is only read when t has room for more code units */
    return (remainder==0 || t[i]==0) ? 0 : -remainder;
}
447
448 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const char16_t * s,int32_t length,const USetAdder * sa)449 ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
450 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
451
452 if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
453 return false; /* no reverse case folding data, or no string */
454 }
455 if(length<=1) {
456 /* the string is too short to find any match */
457 /*
458 * more precise would be:
459 * if(!u_strHasMoreChar32Than(s, length, 1))
460 * but this does not make much practical difference because
461 * a single supplementary code point would just not be found
462 */
463 return false;
464 }
465
466 const uint16_t *unfold=ucase_props_singleton.unfold;
467 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
468 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
469 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
470 unfold+=unfoldRowWidth;
471
472 if(length>unfoldStringWidth) {
473 /* the string is too long to find any match */
474 return false;
475 }
476
477 /* do a binary search for the string */
478 start=0;
479 limit=unfoldRows;
480 while(start<limit) {
481 i=(start+limit)/2;
482 const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
483 result=strcmpMax(s, length, p, unfoldStringWidth);
484
485 if(result==0) {
486 /* found the string: add each code point, and its case closure */
487 UChar32 c;
488
489 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
490 U16_NEXT_UNSAFE(p, i, c);
491 sa->add(sa->set, c);
492 ucase_addCaseClosure(c, sa);
493 }
494 return true;
495 } else if(result<0) {
496 limit=i;
497 } else /* result>0 */ {
498 start=i+1;
499 }
500 }
501
502 return false; /* string not found */
503 }
504
505 U_NAMESPACE_BEGIN
506
FullCaseFoldingIterator()507 FullCaseFoldingIterator::FullCaseFoldingIterator()
508 : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
509 unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
510 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
511 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
512 currentRow(0),
513 rowCpIndex(unfoldStringWidth) {
514 unfold+=unfoldRowWidth;
515 }
516
517 UChar32
next(UnicodeString & full)518 FullCaseFoldingIterator::next(UnicodeString &full) {
519 // Advance past the last-delivered code point.
520 const char16_t *p=unfold+(currentRow*unfoldRowWidth);
521 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
522 ++currentRow;
523 p+=unfoldRowWidth;
524 rowCpIndex=unfoldStringWidth;
525 }
526 if(currentRow>=unfoldRows) { return U_SENTINEL; }
527 // Set "full" to the NUL-terminated string in the first unfold column.
528 int32_t length=unfoldStringWidth;
529 while(length>0 && p[length-1]==0) { --length; }
530 full.setTo(false, p, length);
531 // Return the code point.
532 UChar32 c;
533 U16_NEXT_UNSAFE(p, rowCpIndex, c);
534 return c;
535 }
536
537 namespace LatinCase {
538
539 const int8_t TO_LOWER_NORMAL[LIMIT] = {
540 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544
545 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
546 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
547 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549
550 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554
555 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
556 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
557 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
560 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
561 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
562 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
563 EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
564
565 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
566 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
567 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
568 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
569 };
570
571 const int8_t TO_LOWER_TR_LT[LIMIT] = {
572 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
573 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
574 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
575 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
576
577 0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
578 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
579 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
582 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
584 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
585 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
586
587 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
588 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
589 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
590 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591
592 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
593 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
594 1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
595 EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
596
597 0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
598 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
599 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
600 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
601 };
602
603 const int8_t TO_UPPER_NORMAL[LIMIT] = {
604 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
605 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
606 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
607 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
608
609 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
610 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
612 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
613
614 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
615 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
616 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618
619 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
620 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
621 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
622 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
623
624 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
625 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
626 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
627 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
628
629 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
630 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
631 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
632 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
633 };
634
635 const int8_t TO_UPPER_TR[LIMIT] = {
636 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640
641 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
643 0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
644 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
645
646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
648 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
649 0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
650
651 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
652 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
653 -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
654 -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
655
656 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
657 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
658 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
659 0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
660
661 -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
662 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
663 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
664 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
665 };
666
667 } // namespace LatinCase
668
669 U_NAMESPACE_END
670
671 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
672 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)673 ucase_getType(UChar32 c) {
674 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
675 return UCASE_GET_TYPE(props);
676 }
677
678 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
679 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)680 ucase_getTypeOrIgnorable(UChar32 c) {
681 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
682 return UCASE_GET_TYPE_AND_IGNORABLE(props);
683 }
684
685 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
686 static inline int32_t
getDotType(UChar32 c)687 getDotType(UChar32 c) {
688 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
689 if(!UCASE_HAS_EXCEPTION(props)) {
690 return props&UCASE_DOT_MASK;
691 } else {
692 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
693 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
694 }
695 }
696
697 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)698 ucase_isSoftDotted(UChar32 c) {
699 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
700 }
701
702 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)703 ucase_isCaseSensitive(UChar32 c) {
704 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
705 if(!UCASE_HAS_EXCEPTION(props)) {
706 return (UBool)((props&UCASE_SENSITIVE)!=0);
707 } else {
708 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
709 return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
710 }
711 }
712
713 /* string casing ------------------------------------------------------------ */
714
715 /*
716 * These internal functions form the core of string case mappings.
717 * They map single code points to result code points or strings and take
718 * all necessary conditions (context, locale ID, options) into account.
719 *
720 * They do not iterate over the source or write to the destination
721 * so that the same functions are useful for non-standard string storage,
722 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
723 * For the same reason, the "surrounding text" context is passed in as a
724 * UCaseContextIterator which does not make any assumptions about
725 * the underlying storage.
726 *
727 * This section contains helper functions that check for conditions
728 * in the input text surrounding the current code point
729 * according to SpecialCasing.txt.
730 *
731 * Each helper function gets the index
732 * - after the current code point if it looks at following text
733 * - before the current code point if it looks at preceding text
734 *
735 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
736 *
737 * Final_Sigma
738 * C is preceded by a sequence consisting of
739 * a cased letter and a case-ignorable sequence,
740 * and C is not followed by a sequence consisting of
741 * an ignorable sequence and then a cased letter.
742 *
743 * More_Above
744 * C is followed by one or more characters of combining class 230 (ABOVE)
745 * in the combining character sequence.
746 *
747 * After_Soft_Dotted
748 * The last preceding character with combining class of zero before C
749 * was Soft_Dotted,
750 * and there is no intervening combining character class 230 (ABOVE).
751 *
752 * Before_Dot
753 * C is followed by combining dot above (U+0307).
754 * Any sequence of characters with a combining class that is neither 0 nor 230
755 * may intervene between the current character and the combining dot above.
756 *
757 * The erratum from 2002-10-31 adds the condition
758 *
759 * After_I
760 * The last preceding base character was an uppercase I, and there is no
761 * intervening combining character class 230 (ABOVE).
762 *
763 * (See Jitterbug 2344 and the comments on After_I below.)
764 *
765 * Helper definitions in Unicode 3.2 UAX 21:
766 *
767 * D1. A character C is defined to be cased
768 * if it meets any of the following criteria:
769 *
770 * - The general category of C is Titlecase Letter (Lt)
771 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
772 * - Given D = NFD(C), then it is not the case that:
773 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
774 * (This third criterion does not add any characters to the list
775 * for Unicode 3.2. Ignored.)
776 *
777 * D2. A character C is defined to be case-ignorable
778 * if it meets either of the following criteria:
779 *
780 * - The general category of C is
781 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
782 * Letter Modifier (Lm), or Symbol Modifier (Sk)
783 * - C is one of the following characters
784 * U+0027 APOSTROPHE
785 * U+00AD SOFT HYPHEN (SHY)
786 * U+2019 RIGHT SINGLE QUOTATION MARK
787 * (the preferred character for apostrophe)
788 *
789 * D3. A case-ignorable sequence is a sequence of
790 * zero or more case-ignorable characters.
791 */
792
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Both cases are compared explicitly, so the tests do not depend on the
 * arrangement of letters in the charset. The argument is evaluated twice.
 */
#define UCASE_IS_EITHER(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) UCASE_IS_EITHER(c, 'd', 'D')
#define is_e(c) UCASE_IS_EITHER(c, 'e', 'E')
#define is_i(c) UCASE_IS_EITHER(c, 'i', 'I')
#define is_l(c) UCASE_IS_EITHER(c, 'l', 'L')
#define is_r(c) UCASE_IS_EITHER(c, 'r', 'R')
#define is_t(c) UCASE_IS_EITHER(c, 't', 'T')
#define is_u(c) UCASE_IS_EITHER(c, 'u', 'U')
#define is_y(c) UCASE_IS_EITHER(c, 'y', 'Y')
#define is_z(c) UCASE_IS_EITHER(c, 'z', 'Z')

/* separator: underscore, hyphen, or the terminating NUL */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
805
806 /**
807 * Requires non-nullptr locale ID but otherwise does the equivalent of
808 * checking for language codes as if uloc_getLanguage() were called:
809 * Accepts both 2- and 3-letter codes and accepts case variants.
810 */
811 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)812 ucase_getCaseLocale(const char *locale) {
813 /*
814 * This function used to use uloc_getLanguage(), but the current code
815 * removes the dependency of this low-level code on uloc implementation code
816 * and is faster because not the whole locale ID has to be
817 * examined and copied/transformed.
818 *
819 * Because this code does not want to depend on uloc, the caller must
820 * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
821 */
822 char c=*locale++;
823 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
824 // and for Chinese "zh": Very common but no special case mapping behavior.
825 // Then check lowercase vs. uppercase to reduce the number of comparisons
826 // for other locales without special behavior.
827 if(c=='e') {
828 /* el or ell? */
829 c=*locale++;
830 if(is_l(c)) {
831 c=*locale++;
832 if(is_l(c)) {
833 c=*locale;
834 }
835 if(is_sep(c)) {
836 return UCASE_LOC_GREEK;
837 }
838 }
839 // en, es, ... -> root
840 } else if(c=='z') {
841 return UCASE_LOC_ROOT;
842 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
843 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
844 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
845 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
846 #else
847 # error Unknown charset family!
848 #endif
849 // lowercase c
850 if(c=='t') {
851 /* tr or tur? */
852 c=*locale++;
853 if(is_u(c)) {
854 c=*locale++;
855 }
856 if(is_r(c)) {
857 c=*locale;
858 if(is_sep(c)) {
859 return UCASE_LOC_TURKISH;
860 }
861 }
862 } else if(c=='a') {
863 /* az or aze? */
864 c=*locale++;
865 if(is_z(c)) {
866 c=*locale++;
867 if(is_e(c)) {
868 c=*locale;
869 }
870 if(is_sep(c)) {
871 return UCASE_LOC_TURKISH;
872 }
873 }
874 } else if(c=='l') {
875 /* lt or lit? */
876 c=*locale++;
877 if(is_i(c)) {
878 c=*locale++;
879 }
880 if(is_t(c)) {
881 c=*locale;
882 if(is_sep(c)) {
883 return UCASE_LOC_LITHUANIAN;
884 }
885 }
886 } else if(c=='n') {
887 /* nl or nld? */
888 c=*locale++;
889 if(is_l(c)) {
890 c=*locale++;
891 if(is_d(c)) {
892 c=*locale;
893 }
894 if(is_sep(c)) {
895 return UCASE_LOC_DUTCH;
896 }
897 }
898 } else if(c=='h') {
899 /* hy or hye? *not* hyw */
900 c=*locale++;
901 if(is_y(c)) {
902 c=*locale++;
903 if(is_e(c)) {
904 c=*locale;
905 }
906 if(is_sep(c)) {
907 return UCASE_LOC_ARMENIAN;
908 }
909 }
910 }
911 } else {
912 // uppercase c
913 // Same code as for lowercase c but also check for 'E'.
914 if(c=='T') {
915 /* tr or tur? */
916 c=*locale++;
917 if(is_u(c)) {
918 c=*locale++;
919 }
920 if(is_r(c)) {
921 c=*locale;
922 if(is_sep(c)) {
923 return UCASE_LOC_TURKISH;
924 }
925 }
926 } else if(c=='A') {
927 /* az or aze? */
928 c=*locale++;
929 if(is_z(c)) {
930 c=*locale++;
931 if(is_e(c)) {
932 c=*locale;
933 }
934 if(is_sep(c)) {
935 return UCASE_LOC_TURKISH;
936 }
937 }
938 } else if(c=='L') {
939 /* lt or lit? */
940 c=*locale++;
941 if(is_i(c)) {
942 c=*locale++;
943 }
944 if(is_t(c)) {
945 c=*locale;
946 if(is_sep(c)) {
947 return UCASE_LOC_LITHUANIAN;
948 }
949 }
950 } else if(c=='E') {
951 /* el or ell? */
952 c=*locale++;
953 if(is_l(c)) {
954 c=*locale++;
955 if(is_l(c)) {
956 c=*locale;
957 }
958 if(is_sep(c)) {
959 return UCASE_LOC_GREEK;
960 }
961 }
962 } else if(c=='N') {
963 /* nl or nld? */
964 c=*locale++;
965 if(is_l(c)) {
966 c=*locale++;
967 if(is_d(c)) {
968 c=*locale;
969 }
970 if(is_sep(c)) {
971 return UCASE_LOC_DUTCH;
972 }
973 }
974 } else if(c=='H') {
975 /* hy or hye? *not* hyw */
976 c=*locale++;
977 if(is_y(c)) {
978 c=*locale++;
979 if(is_e(c)) {
980 c=*locale;
981 }
982 if(is_sep(c)) {
983 return UCASE_LOC_ARMENIAN;
984 }
985 }
986 }
987 }
988 return UCASE_LOC_ROOT;
989 }
990
991 /*
992 * Is followed by
993 * {case-ignorable}* cased
994 * ?
995 * (dir determines looking forward/backward)
996 * If a character is case-ignorable, it is skipped regardless of whether
997 * it is also cased or not.
998 */
999 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)1000 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
1001 UChar32 c;
1002
1003 if(iter==nullptr) {
1004 return false;
1005 }
1006
1007 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
1008 int32_t type=ucase_getTypeOrIgnorable(c);
1009 if(type&4) {
1010 /* case-ignorable, continue with the loop */
1011 } else if(type!=UCASE_NONE) {
1012 return true; /* followed by cased letter */
1013 } else {
1014 return false; /* uncased and not case-ignorable */
1015 }
1016 }
1017
1018 return false; /* not followed by cased letter */
1019 }
1020
1021 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
1022 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)1023 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
1024 UChar32 c;
1025 int32_t dotType;
1026 int8_t dir;
1027
1028 if(iter==nullptr) {
1029 return false;
1030 }
1031
1032 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1033 dotType=getDotType(c);
1034 if(dotType==UCASE_SOFT_DOTTED) {
1035 return true; /* preceded by TYPE_i */
1036 } else if(dotType!=UCASE_OTHER_ACCENT) {
1037 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
1038 }
1039 }
1040
1041 return false; /* not preceded by TYPE_i */
1042 }
1043
1044 /*
1045 * See Jitterbug 2344:
1046 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
1047 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
1048 * we made those releases compatible with Unicode 3.2 which had not fixed
1049 * a related bug in SpecialCasing.txt.
1050 *
1051 * From the Jitterbug 2344 text:
1052 * ... this bug is listed as a Unicode erratum
1053 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
1054 * <quote>
1055 * There are two errors in SpecialCasing.txt.
1056 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
1057 * 2. An incorrect context definition. Correct as follows:
1058 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
1059 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
1060 * ---
1061 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1062 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1063 * where the context After_I is defined as:
1064 * The last preceding base character was an uppercase I, and there is no
1065 * intervening combining character class 230 (ABOVE).
1066 * </quote>
1067 *
1068 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
1069 *
1070 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1071 * # This matches the behavior of the canonically equivalent I-dot_above
1072 *
1073 * See also the description in this place in older versions of uchar.c (revision 1.100).
1074 *
1075 * Markus W. Scherer 2003-feb-15
1076 */
1077
1078 /* Is preceded by base character 'I' with no intervening cc=230 ? */
1079 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)1080 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
1081 UChar32 c;
1082 int32_t dotType;
1083 int8_t dir;
1084
1085 if(iter==nullptr) {
1086 return false;
1087 }
1088
1089 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1090 if(c==0x49) {
1091 return true; /* preceded by I */
1092 }
1093 dotType=getDotType(c);
1094 if(dotType!=UCASE_OTHER_ACCENT) {
1095 return false; /* preceded by different base character (not I), or intervening cc==230 */
1096 }
1097 }
1098
1099 return false; /* not preceded by I */
1100 }
1101
1102 /* Is followed by one or more cc==230 ? */
1103 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)1104 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1105 UChar32 c;
1106 int32_t dotType;
1107 int8_t dir;
1108
1109 if(iter==nullptr) {
1110 return false;
1111 }
1112
1113 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1114 dotType=getDotType(c);
1115 if(dotType==UCASE_ABOVE) {
1116 return true; /* at least one cc==230 following */
1117 } else if(dotType!=UCASE_OTHER_ACCENT) {
1118 return false; /* next base character, no more cc==230 following */
1119 }
1120 }
1121
1122 return false; /* no more cc==230 following */
1123 }
1124
1125 /* Is followed by a dot above (without cc==230 in between) ? */
1126 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1127 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1128 UChar32 c;
1129 int32_t dotType;
1130 int8_t dir;
1131
1132 if(iter==nullptr) {
1133 return false;
1134 }
1135
1136 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1137 if(c==0x307) {
1138 return true;
1139 }
1140 dotType=getDotType(c);
1141 if(dotType!=UCASE_OTHER_ACCENT) {
1142 return false; /* next base character or cc==230 in between */
1143 }
1144 }
1145
1146 return false; /* no dot above following */
1147 }
1148
1149 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc)1150 ucase_toFullLower(UChar32 c,
1151 UCaseContextIterator *iter, void *context,
1152 const char16_t **pString,
1153 int32_t loc) {
1154 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1155 U_ASSERT(c >= 0);
1156 UChar32 result=c;
1157 // Reset the output pointer in case it was uninitialized.
1158 *pString=nullptr;
1159 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1160 if(!UCASE_HAS_EXCEPTION(props)) {
1161 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1162 result=c+UCASE_GET_DELTA(props);
1163 }
1164 } else {
1165 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1166 uint16_t excWord=*pe++;
1167 int32_t full;
1168
1169 pe2=pe;
1170
1171 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1172 /* use hardcoded conditions and mappings */
1173
1174 /*
1175 * Test for conditional mappings first
1176 * (otherwise the unconditional default mappings are always taken),
1177 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1178 * then get the UnicodeData.txt mappings.
1179 */
1180 if( loc==UCASE_LOC_LITHUANIAN &&
1181 /* base characters, find accents above */
1182 (((c==0x49 || c==0x4a || c==0x12e) &&
1183 isFollowedByMoreAbove(iter, context)) ||
1184 /* precomposed with accent above, no need to find one */
1185 (c==0xcc || c==0xcd || c==0x128))
1186 ) {
1187 /*
1188 # Lithuanian
1189
1190 # Lithuanian retains the dot in a lowercase i when followed by accents.
1191
1192 # Introduce an explicit dot above when lowercasing capital I's and J's
1193 # whenever there are more accents above.
1194 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1195
1196 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1197 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1198 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1199 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1200 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1201 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1202 */
1203 switch(c) {
1204 case 0x49: /* LATIN CAPITAL LETTER I */
1205 *pString=iDot;
1206 return 2;
1207 case 0x4a: /* LATIN CAPITAL LETTER J */
1208 *pString=jDot;
1209 return 2;
1210 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1211 *pString=iOgonekDot;
1212 return 2;
1213 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1214 *pString=iDotGrave;
1215 return 3;
1216 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1217 *pString=iDotAcute;
1218 return 3;
1219 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1220 *pString=iDotTilde;
1221 return 3;
1222 default:
1223 return 0; /* will not occur */
1224 }
1225 /* # Turkish and Azeri */
1226 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1227 /*
1228 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1229 # The following rules handle those cases.
1230
1231 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1232 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1233 */
1234 return 0x69;
1235 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1236 /*
1237 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1238 # This matches the behavior of the canonically equivalent I-dot_above
1239
1240 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1241 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1242 */
1243 return 0; /* remove the dot (continue without output) */
1244 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1245 /*
1246 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1247
1248 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1249 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1250 */
1251 return 0x131;
1252 } else if(c==0x130) {
1253 /*
1254 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1255
1256 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1257 */
1258 *pString=iDot;
1259 return 2;
1260 } else if( c==0x3a3 &&
1261 !isFollowedByCasedLetter(iter, context, 1) &&
1262 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1263 ) {
1264 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1265 /*
1266 # Special case for final form of sigma
1267
1268 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1269 */
1270 return 0x3c2; /* greek small final sigma */
1271 } else {
1272 /* no known conditional special case mapping, use a normal mapping */
1273 }
1274 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1275 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1276 full&=UCASE_FULL_LOWER;
1277 if(full!=0) {
1278 /* set the output pointer to the lowercase mapping */
1279 *pString=reinterpret_cast<const char16_t *>(pe+1);
1280
1281 /* return the string length */
1282 return full;
1283 }
1284 }
1285
1286 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1287 int32_t delta;
1288 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1289 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1290 }
1291 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1292 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1293 }
1294 }
1295
1296 return (result==c) ? ~result : result;
1297 }
1298
1299 /* internal */
1300 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc,UBool upperNotTitle)1301 toUpperOrTitle(UChar32 c,
1302 UCaseContextIterator *iter, void *context,
1303 const char16_t **pString,
1304 int32_t loc,
1305 UBool upperNotTitle) {
1306 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1307 U_ASSERT(c >= 0);
1308 UChar32 result=c;
1309 // Reset the output pointer in case it was uninitialized.
1310 *pString=nullptr;
1311 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1312 if(!UCASE_HAS_EXCEPTION(props)) {
1313 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1314 result=c+UCASE_GET_DELTA(props);
1315 }
1316 } else {
1317 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1318 uint16_t excWord=*pe++;
1319 int32_t full, idx;
1320
1321 pe2=pe;
1322
1323 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1324 /* use hardcoded conditions and mappings */
1325 if(loc==UCASE_LOC_TURKISH && c==0x69) {
1326 /*
1327 # Turkish and Azeri
1328
1329 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1330 # The following rules handle those cases.
1331
1332 # When uppercasing, i turns into a dotted capital I
1333
1334 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1335 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1336 */
1337 return 0x130;
1338 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1339 /*
1340 # Lithuanian
1341
1342 # Lithuanian retains the dot in a lowercase i when followed by accents.
1343
1344 # Remove DOT ABOVE after "i" with upper or titlecase
1345
1346 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1347 */
1348 return 0; /* remove the dot (continue without output) */
1349 } else if(c==0x0587) {
1350 // See ICU-13416:
1351 // և ligature ech-yiwn
1352 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1353 // but to ԵՎ=ech+vew in Eastern Armenian.
1354 if(loc==UCASE_LOC_ARMENIAN) {
1355 *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1356 } else {
1357 *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1358 }
1359 return 2;
1360 } else {
1361 /* no known conditional special case mapping, use a normal mapping */
1362 }
1363 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1364 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1365
1366 /* start of full case mapping strings */
1367 ++pe;
1368
1369 /* skip the lowercase and case-folding result strings */
1370 pe+=full&UCASE_FULL_LOWER;
1371 full>>=4;
1372 pe+=full&0xf;
1373 full>>=4;
1374
1375 if(upperNotTitle) {
1376 full&=0xf;
1377 } else {
1378 /* skip the uppercase result string */
1379 pe+=full&0xf;
1380 full=(full>>4)&0xf;
1381 }
1382
1383 if(full!=0) {
1384 /* set the output pointer to the result string */
1385 *pString=reinterpret_cast<const char16_t *>(pe);
1386
1387 /* return the string length */
1388 return full;
1389 }
1390 }
1391
1392 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1393 int32_t delta;
1394 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1395 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1396 }
1397 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1398 idx=UCASE_EXC_TITLE;
1399 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1400 /* here, titlecase is same as uppercase */
1401 idx=UCASE_EXC_UPPER;
1402 } else {
1403 return ~c;
1404 }
1405 GET_SLOT_VALUE(excWord, idx, pe2, result);
1406 }
1407
1408 return (result==c) ? ~result : result;
1409 }
1410
1411 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1412 ucase_toFullUpper(UChar32 c,
1413 UCaseContextIterator *iter, void *context,
1414 const char16_t **pString,
1415 int32_t caseLocale) {
1416 return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
1417 }
1418
1419 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1420 ucase_toFullTitle(UChar32 c,
1421 UCaseContextIterator *iter, void *context,
1422 const char16_t **pString,
1423 int32_t caseLocale) {
1424 return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
1425 }
1426
1427 /* case folding ------------------------------------------------------------- */
1428
1429 /*
1430 * Case folding is similar to lowercasing.
1431 * The result may be a simple mapping, i.e., a single code point, or
1432 * a full mapping, i.e., a string.
1433 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1434 * then only the lowercase mapping is stored.
1435 *
1436 * Some special cases are hardcoded because their conditions cannot be
1437 * parsed and processed from CaseFolding.txt.
1438 *
1439 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1440
1441 # C: common case folding, common mappings shared by both simple and full mappings.
1442 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1443 # S: simple case folding, mappings to single characters where different from F.
1444 # T: special case for uppercase I and dotted uppercase I
1445 # - For non-Turkic languages, this mapping is normally not used.
1446 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1447 #
1448 # Usage:
1449 # A. To do a simple case folding, use the mappings with status C + S.
1450 # B. To do a full case folding, use the mappings with status C + F.
1451 #
1452 # The mappings with status T can be used or omitted depending on the desired case-folding
1453 # behavior. (The default option is to exclude them.)
1454
1455 * Unicode 3.2 has 'T' mappings as follows:
1456
1457 0049; T; 0131; # LATIN CAPITAL LETTER I
1458 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1459
1460 * while the default mappings for these code points are:
1461
1462 0049; C; 0069; # LATIN CAPITAL LETTER I
1463 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1464
1465 * U+0130 has no simple case folding (simple-case-folds to itself).
1466 */
1467
1468 /* return the simple case folding mapping for c */
1469 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1470 ucase_fold(UChar32 c, uint32_t options) {
1471 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1472 if(!UCASE_HAS_EXCEPTION(props)) {
1473 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1474 c+=UCASE_GET_DELTA(props);
1475 }
1476 } else {
1477 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1478 uint16_t excWord=*pe++;
1479 int32_t idx;
1480 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1481 /* special case folding mappings, hardcoded */
1482 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1483 /* default mappings */
1484 if(c==0x49) {
1485 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1486 return 0x69;
1487 } else if(c==0x130) {
1488 /* no simple case folding for U+0130 */
1489 return c;
1490 }
1491 } else {
1492 /* Turkic mappings */
1493 if(c==0x49) {
1494 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1495 return 0x131;
1496 } else if(c==0x130) {
1497 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1498 return 0x69;
1499 }
1500 }
1501 }
1502 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1503 return c;
1504 }
1505 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1506 int32_t delta;
1507 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1508 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1509 }
1510 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1511 idx=UCASE_EXC_FOLD;
1512 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1513 idx=UCASE_EXC_LOWER;
1514 } else {
1515 return c;
1516 }
1517 GET_SLOT_VALUE(excWord, idx, pe, c);
1518 }
1519 return c;
1520 }
1521
1522 /*
1523 * Issue for canonical caseless match (UAX #21):
1524 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1525 * canonical equivalence, unlike default-option casefolding.
1526 * For example, I-grave and I + grave fold to strings that are not canonically
1527 * equivalent.
1528 * For more details, see the comment in unorm_compare() in unorm.cpp
1529 * and the intermediate prototype changes for Jitterbug 2021.
1530 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1531 *
1532 * This did not get fixed because it appears that it is not possible to fix
1533 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1534 * together in a way that they still fold to common result strings.
1535 */
1536
1537 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const char16_t ** pString,uint32_t options)1538 ucase_toFullFolding(UChar32 c,
1539 const char16_t **pString,
1540 uint32_t options) {
1541 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1542 U_ASSERT(c >= 0);
1543 UChar32 result=c;
1544 // Reset the output pointer in case it was uninitialized.
1545 *pString=nullptr;
1546 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1547 if(!UCASE_HAS_EXCEPTION(props)) {
1548 if(UCASE_IS_UPPER_OR_TITLE(props)) {
1549 result=c+UCASE_GET_DELTA(props);
1550 }
1551 } else {
1552 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1553 uint16_t excWord=*pe++;
1554 int32_t full, idx;
1555
1556 pe2=pe;
1557
1558 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1559 /* use hardcoded conditions and mappings */
1560 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1561 /* default mappings */
1562 if(c==0x49) {
1563 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1564 return 0x69;
1565 } else if(c==0x130) {
1566 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1567 *pString=iDot;
1568 return 2;
1569 }
1570 } else {
1571 /* Turkic mappings */
1572 if(c==0x49) {
1573 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1574 return 0x131;
1575 } else if(c==0x130) {
1576 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1577 return 0x69;
1578 }
1579 }
1580 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1581 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1582
1583 /* start of full case mapping strings */
1584 ++pe;
1585
1586 /* skip the lowercase result string */
1587 pe+=full&UCASE_FULL_LOWER;
1588 full=(full>>4)&0xf;
1589
1590 if(full!=0) {
1591 /* set the output pointer to the result string */
1592 *pString=reinterpret_cast<const char16_t *>(pe);
1593
1594 /* return the string length */
1595 return full;
1596 }
1597 }
1598
1599 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1600 return ~c;
1601 }
1602 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1603 int32_t delta;
1604 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1605 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1606 }
1607 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1608 idx=UCASE_EXC_FOLD;
1609 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1610 idx=UCASE_EXC_LOWER;
1611 } else {
1612 return ~c;
1613 }
1614 GET_SLOT_VALUE(excWord, idx, pe2, result);
1615 }
1616
1617 return (result==c) ? ~result : result;
1618 }
1619
1620 /* case mapping properties API ---------------------------------------------- */
1621
1622 /* public API (see uchar.h) */
1623
1624 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1625 u_isULowercase(UChar32 c) {
1626 return (UBool)(UCASE_LOWER==ucase_getType(c));
1627 }
1628
1629 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1630 u_isUUppercase(UChar32 c) {
1631 return (UBool)(UCASE_UPPER==ucase_getType(c));
1632 }
1633
1634 /* Transforms the Unicode character to its lower case equivalent.*/
1635 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1636 u_tolower(UChar32 c) {
1637 return ucase_tolower(c);
1638 }
1639
1640 /* Transforms the Unicode character to its upper case equivalent.*/
1641 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1642 u_toupper(UChar32 c) {
1643 return ucase_toupper(c);
1644 }
1645
1646 /* Transforms the Unicode character to its title case equivalent.*/
1647 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1648 u_totitle(UChar32 c) {
1649 return ucase_totitle(c);
1650 }
1651
1652 /* return the simple case folding mapping for c */
1653 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1654 u_foldCase(UChar32 c, uint32_t options) {
1655 return ucase_fold(c, options);
1656 }
1657
1658 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1659 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1660 /* case mapping properties */
1661 const char16_t *resultString;
1662 switch(which) {
1663 case UCHAR_LOWERCASE:
1664 return (UBool)(UCASE_LOWER==ucase_getType(c));
1665 case UCHAR_UPPERCASE:
1666 return (UBool)(UCASE_UPPER==ucase_getType(c));
1667 case UCHAR_SOFT_DOTTED:
1668 return ucase_isSoftDotted(c);
1669 case UCHAR_CASE_SENSITIVE:
1670 return ucase_isCaseSensitive(c);
1671 case UCHAR_CASED:
1672 return (UBool)(UCASE_NONE!=ucase_getType(c));
1673 case UCHAR_CASE_IGNORABLE:
1674 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1675 /*
1676 * Note: The following Changes_When_Xyz are defined as testing whether
1677 * the NFD form of the input changes when Xyz-case-mapped.
1678 * However, this simpler implementation of these properties,
1679 * ignoring NFD, passes the tests.
1680 * The implementation needs to be changed if the tests start failing.
1681 * When that happens, optimizations should be used to work with the
1682 * per-single-code point ucase_toFullXyz() functions unless
1683 * the NFD form has more than one code point,
1684 * and the property starts set needs to be the union of the
1685 * start sets for normalization and case mappings.
1686 */
1687 case UCHAR_CHANGES_WHEN_LOWERCASED:
1688 return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1689 case UCHAR_CHANGES_WHEN_UPPERCASED:
1690 return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1691 case UCHAR_CHANGES_WHEN_TITLECASED:
1692 return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1693 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1694 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1695 return (UBool)(
1696 ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1697 ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1698 ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1699 default:
1700 return false;
1701 }
1702 }
1703