xref: /aosp_15_r20/external/icu/libicu/cts_headers/normalizer2impl.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker *
6*0e209d39SAndroid Build Coastguard Worker *   Copyright (C) 2009-2014, International Business Machines
7*0e209d39SAndroid Build Coastguard Worker *   Corporation and others.  All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker *
9*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
10*0e209d39SAndroid Build Coastguard Worker *   file name:  normalizer2impl.h
11*0e209d39SAndroid Build Coastguard Worker *   encoding:   UTF-8
12*0e209d39SAndroid Build Coastguard Worker *   tab size:   8 (not used)
13*0e209d39SAndroid Build Coastguard Worker *   indentation:4
14*0e209d39SAndroid Build Coastguard Worker *
15*0e209d39SAndroid Build Coastguard Worker *   created on: 2009nov22
16*0e209d39SAndroid Build Coastguard Worker *   created by: Markus W. Scherer
17*0e209d39SAndroid Build Coastguard Worker */
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #ifndef __NORMALIZER2IMPL_H__
20*0e209d39SAndroid Build Coastguard Worker #define __NORMALIZER2IMPL_H__
21*0e209d39SAndroid Build Coastguard Worker 
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
23*0e209d39SAndroid Build Coastguard Worker 
24*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_NORMALIZATION
25*0e209d39SAndroid Build Coastguard Worker 
26*0e209d39SAndroid Build Coastguard Worker #include "unicode/normalizer2.h"
27*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucptrie.h"
28*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h"
29*0e209d39SAndroid Build Coastguard Worker #include "unicode/unorm.h"
30*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf.h"
31*0e209d39SAndroid Build Coastguard Worker #include "unicode/utf16.h"
32*0e209d39SAndroid Build Coastguard Worker #include "mutex.h"
33*0e209d39SAndroid Build Coastguard Worker #include "udataswp.h"
34*0e209d39SAndroid Build Coastguard Worker #include "uset_imp.h"
35*0e209d39SAndroid Build Coastguard Worker 
36*0e209d39SAndroid Build Coastguard Worker // When the nfc.nrm data is *not* hardcoded into the common library
37*0e209d39SAndroid Build Coastguard Worker // (with this constant set to 0),
38*0e209d39SAndroid Build Coastguard Worker // then it needs to be built into the data package:
39*0e209d39SAndroid Build Coastguard Worker // Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
40*0e209d39SAndroid Build Coastguard Worker #define NORM2_HARDCODE_NFC_DATA 1
41*0e209d39SAndroid Build Coastguard Worker 
42*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
43*0e209d39SAndroid Build Coastguard Worker 
44*0e209d39SAndroid Build Coastguard Worker struct CanonIterData;
45*0e209d39SAndroid Build Coastguard Worker 
46*0e209d39SAndroid Build Coastguard Worker class ByteSink;
47*0e209d39SAndroid Build Coastguard Worker class Edits;
48*0e209d39SAndroid Build Coastguard Worker class InitCanonIterData;
49*0e209d39SAndroid Build Coastguard Worker class LcccContext;
50*0e209d39SAndroid Build Coastguard Worker 
51*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API Hangul {
52*0e209d39SAndroid Build Coastguard Worker public:
53*0e209d39SAndroid Build Coastguard Worker     /* Korean Hangul and Jamo constants */
54*0e209d39SAndroid Build Coastguard Worker     enum {
55*0e209d39SAndroid Build Coastguard Worker         JAMO_L_BASE=0x1100,     /* "lead" jamo */
56*0e209d39SAndroid Build Coastguard Worker         JAMO_L_END=0x1112,
57*0e209d39SAndroid Build Coastguard Worker         JAMO_V_BASE=0x1161,     /* "vowel" jamo */
58*0e209d39SAndroid Build Coastguard Worker         JAMO_V_END=0x1175,
59*0e209d39SAndroid Build Coastguard Worker         JAMO_T_BASE=0x11a7,     /* "trail" jamo */
60*0e209d39SAndroid Build Coastguard Worker         JAMO_T_END=0x11c2,
61*0e209d39SAndroid Build Coastguard Worker 
62*0e209d39SAndroid Build Coastguard Worker         HANGUL_BASE=0xac00,
63*0e209d39SAndroid Build Coastguard Worker         HANGUL_END=0xd7a3,
64*0e209d39SAndroid Build Coastguard Worker 
65*0e209d39SAndroid Build Coastguard Worker         JAMO_L_COUNT=19,
66*0e209d39SAndroid Build Coastguard Worker         JAMO_V_COUNT=21,
67*0e209d39SAndroid Build Coastguard Worker         JAMO_T_COUNT=28,
68*0e209d39SAndroid Build Coastguard Worker 
69*0e209d39SAndroid Build Coastguard Worker         JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,
70*0e209d39SAndroid Build Coastguard Worker 
71*0e209d39SAndroid Build Coastguard Worker         HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
72*0e209d39SAndroid Build Coastguard Worker         HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
73*0e209d39SAndroid Build Coastguard Worker     };
74*0e209d39SAndroid Build Coastguard Worker 
isHangul(UChar32 c)75*0e209d39SAndroid Build Coastguard Worker     static inline UBool isHangul(UChar32 c) {
76*0e209d39SAndroid Build Coastguard Worker         return HANGUL_BASE<=c && c<HANGUL_LIMIT;
77*0e209d39SAndroid Build Coastguard Worker     }
78*0e209d39SAndroid Build Coastguard Worker     static inline UBool
isHangulLV(UChar32 c)79*0e209d39SAndroid Build Coastguard Worker     isHangulLV(UChar32 c) {
80*0e209d39SAndroid Build Coastguard Worker         c-=HANGUL_BASE;
81*0e209d39SAndroid Build Coastguard Worker         return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
82*0e209d39SAndroid Build Coastguard Worker     }
isJamoL(UChar32 c)83*0e209d39SAndroid Build Coastguard Worker     static inline UBool isJamoL(UChar32 c) {
84*0e209d39SAndroid Build Coastguard Worker         return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
85*0e209d39SAndroid Build Coastguard Worker     }
isJamoV(UChar32 c)86*0e209d39SAndroid Build Coastguard Worker     static inline UBool isJamoV(UChar32 c) {
87*0e209d39SAndroid Build Coastguard Worker         return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
88*0e209d39SAndroid Build Coastguard Worker     }
isJamoT(UChar32 c)89*0e209d39SAndroid Build Coastguard Worker     static inline UBool isJamoT(UChar32 c) {
90*0e209d39SAndroid Build Coastguard Worker         int32_t t=c-JAMO_T_BASE;
91*0e209d39SAndroid Build Coastguard Worker         return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
92*0e209d39SAndroid Build Coastguard Worker     }
isJamo(UChar32 c)93*0e209d39SAndroid Build Coastguard Worker     static UBool isJamo(UChar32 c) {
94*0e209d39SAndroid Build Coastguard Worker         return JAMO_L_BASE<=c && c<=JAMO_T_END &&
95*0e209d39SAndroid Build Coastguard Worker             (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
96*0e209d39SAndroid Build Coastguard Worker     }
97*0e209d39SAndroid Build Coastguard Worker 
98*0e209d39SAndroid Build Coastguard Worker     /**
99*0e209d39SAndroid Build Coastguard Worker      * Decomposes c, which must be a Hangul syllable, into buffer
100*0e209d39SAndroid Build Coastguard Worker      * and returns the length of the decomposition (2 or 3).
101*0e209d39SAndroid Build Coastguard Worker      */
decompose(UChar32 c,char16_t buffer[3])102*0e209d39SAndroid Build Coastguard Worker     static inline int32_t decompose(UChar32 c, char16_t buffer[3]) {
103*0e209d39SAndroid Build Coastguard Worker         c-=HANGUL_BASE;
104*0e209d39SAndroid Build Coastguard Worker         UChar32 c2=c%JAMO_T_COUNT;
105*0e209d39SAndroid Build Coastguard Worker         c/=JAMO_T_COUNT;
106*0e209d39SAndroid Build Coastguard Worker         buffer[0]=(char16_t)(JAMO_L_BASE+c/JAMO_V_COUNT);
107*0e209d39SAndroid Build Coastguard Worker         buffer[1]=(char16_t)(JAMO_V_BASE+c%JAMO_V_COUNT);
108*0e209d39SAndroid Build Coastguard Worker         if(c2==0) {
109*0e209d39SAndroid Build Coastguard Worker             return 2;
110*0e209d39SAndroid Build Coastguard Worker         } else {
111*0e209d39SAndroid Build Coastguard Worker             buffer[2]=(char16_t)(JAMO_T_BASE+c2);
112*0e209d39SAndroid Build Coastguard Worker             return 3;
113*0e209d39SAndroid Build Coastguard Worker         }
114*0e209d39SAndroid Build Coastguard Worker     }
115*0e209d39SAndroid Build Coastguard Worker 
116*0e209d39SAndroid Build Coastguard Worker     /**
117*0e209d39SAndroid Build Coastguard Worker      * Decomposes c, which must be a Hangul syllable, into buffer.
118*0e209d39SAndroid Build Coastguard Worker      * This is the raw, not recursive, decomposition. Its length is always 2.
119*0e209d39SAndroid Build Coastguard Worker      */
getRawDecomposition(UChar32 c,char16_t buffer[2])120*0e209d39SAndroid Build Coastguard Worker     static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) {
121*0e209d39SAndroid Build Coastguard Worker         UChar32 orig=c;
122*0e209d39SAndroid Build Coastguard Worker         c-=HANGUL_BASE;
123*0e209d39SAndroid Build Coastguard Worker         UChar32 c2=c%JAMO_T_COUNT;
124*0e209d39SAndroid Build Coastguard Worker         if(c2==0) {
125*0e209d39SAndroid Build Coastguard Worker             c/=JAMO_T_COUNT;
126*0e209d39SAndroid Build Coastguard Worker             buffer[0]=(char16_t)(JAMO_L_BASE+c/JAMO_V_COUNT);
127*0e209d39SAndroid Build Coastguard Worker             buffer[1]=(char16_t)(JAMO_V_BASE+c%JAMO_V_COUNT);
128*0e209d39SAndroid Build Coastguard Worker         } else {
129*0e209d39SAndroid Build Coastguard Worker             buffer[0]=(char16_t)(orig-c2);  // LV syllable
130*0e209d39SAndroid Build Coastguard Worker             buffer[1]=(char16_t)(JAMO_T_BASE+c2);
131*0e209d39SAndroid Build Coastguard Worker         }
132*0e209d39SAndroid Build Coastguard Worker     }
133*0e209d39SAndroid Build Coastguard Worker private:
134*0e209d39SAndroid Build Coastguard Worker     Hangul() = delete;  // no instantiation
135*0e209d39SAndroid Build Coastguard Worker };
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker class Normalizer2Impl;
138*0e209d39SAndroid Build Coastguard Worker 
139*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API ReorderingBuffer : public UMemory {
140*0e209d39SAndroid Build Coastguard Worker public:
141*0e209d39SAndroid Build Coastguard Worker     /** Constructs only; init() should be called. */
ReorderingBuffer(const Normalizer2Impl & ni,UnicodeString & dest)142*0e209d39SAndroid Build Coastguard Worker     ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
143*0e209d39SAndroid Build Coastguard Worker         impl(ni), str(dest),
144*0e209d39SAndroid Build Coastguard Worker         start(nullptr), reorderStart(nullptr), limit(nullptr),
145*0e209d39SAndroid Build Coastguard Worker         remainingCapacity(0), lastCC(0) {}
146*0e209d39SAndroid Build Coastguard Worker     /** Constructs, removes the string contents, and initializes for a small initial capacity. */
147*0e209d39SAndroid Build Coastguard Worker     ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
~ReorderingBuffer()148*0e209d39SAndroid Build Coastguard Worker     ~ReorderingBuffer() {
149*0e209d39SAndroid Build Coastguard Worker         if (start != nullptr) {
150*0e209d39SAndroid Build Coastguard Worker             str.releaseBuffer((int32_t)(limit-start));
151*0e209d39SAndroid Build Coastguard Worker         }
152*0e209d39SAndroid Build Coastguard Worker     }
153*0e209d39SAndroid Build Coastguard Worker     UBool init(int32_t destCapacity, UErrorCode &errorCode);
154*0e209d39SAndroid Build Coastguard Worker 
isEmpty()155*0e209d39SAndroid Build Coastguard Worker     UBool isEmpty() const { return start==limit; }
length()156*0e209d39SAndroid Build Coastguard Worker     int32_t length() const { return (int32_t)(limit-start); }
getStart()157*0e209d39SAndroid Build Coastguard Worker     char16_t *getStart() { return start; }
getLimit()158*0e209d39SAndroid Build Coastguard Worker     char16_t *getLimit() { return limit; }
getLastCC()159*0e209d39SAndroid Build Coastguard Worker     uint8_t getLastCC() const { return lastCC; }
160*0e209d39SAndroid Build Coastguard Worker 
161*0e209d39SAndroid Build Coastguard Worker     UBool equals(const char16_t *start, const char16_t *limit) const;
162*0e209d39SAndroid Build Coastguard Worker     UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
163*0e209d39SAndroid Build Coastguard Worker 
append(UChar32 c,uint8_t cc,UErrorCode & errorCode)164*0e209d39SAndroid Build Coastguard Worker     UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
165*0e209d39SAndroid Build Coastguard Worker         return (c<=0xffff) ?
166*0e209d39SAndroid Build Coastguard Worker             appendBMP((char16_t)c, cc, errorCode) :
167*0e209d39SAndroid Build Coastguard Worker             appendSupplementary(c, cc, errorCode);
168*0e209d39SAndroid Build Coastguard Worker     }
169*0e209d39SAndroid Build Coastguard Worker     UBool append(const char16_t *s, int32_t length, UBool isNFD,
170*0e209d39SAndroid Build Coastguard Worker                  uint8_t leadCC, uint8_t trailCC,
171*0e209d39SAndroid Build Coastguard Worker                  UErrorCode &errorCode);
appendBMP(char16_t c,uint8_t cc,UErrorCode & errorCode)172*0e209d39SAndroid Build Coastguard Worker     UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) {
173*0e209d39SAndroid Build Coastguard Worker         if(remainingCapacity==0 && !resize(1, errorCode)) {
174*0e209d39SAndroid Build Coastguard Worker             return false;
175*0e209d39SAndroid Build Coastguard Worker         }
176*0e209d39SAndroid Build Coastguard Worker         if(lastCC<=cc || cc==0) {
177*0e209d39SAndroid Build Coastguard Worker             *limit++=c;
178*0e209d39SAndroid Build Coastguard Worker             lastCC=cc;
179*0e209d39SAndroid Build Coastguard Worker             if(cc<=1) {
180*0e209d39SAndroid Build Coastguard Worker                 reorderStart=limit;
181*0e209d39SAndroid Build Coastguard Worker             }
182*0e209d39SAndroid Build Coastguard Worker         } else {
183*0e209d39SAndroid Build Coastguard Worker             insert(c, cc);
184*0e209d39SAndroid Build Coastguard Worker         }
185*0e209d39SAndroid Build Coastguard Worker         --remainingCapacity;
186*0e209d39SAndroid Build Coastguard Worker         return true;
187*0e209d39SAndroid Build Coastguard Worker     }
188*0e209d39SAndroid Build Coastguard Worker     UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
189*0e209d39SAndroid Build Coastguard Worker     UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode);
190*0e209d39SAndroid Build Coastguard Worker     void remove();
191*0e209d39SAndroid Build Coastguard Worker     void removeSuffix(int32_t suffixLength);
setReorderingLimit(char16_t * newLimit)192*0e209d39SAndroid Build Coastguard Worker     void setReorderingLimit(char16_t *newLimit) {
193*0e209d39SAndroid Build Coastguard Worker         remainingCapacity+=(int32_t)(limit-newLimit);
194*0e209d39SAndroid Build Coastguard Worker         reorderStart=limit=newLimit;
195*0e209d39SAndroid Build Coastguard Worker         lastCC=0;
196*0e209d39SAndroid Build Coastguard Worker     }
copyReorderableSuffixTo(UnicodeString & s)197*0e209d39SAndroid Build Coastguard Worker     void copyReorderableSuffixTo(UnicodeString &s) const {
198*0e209d39SAndroid Build Coastguard Worker         s.setTo(ConstChar16Ptr(reorderStart), (int32_t)(limit-reorderStart));
199*0e209d39SAndroid Build Coastguard Worker     }
200*0e209d39SAndroid Build Coastguard Worker private:
201*0e209d39SAndroid Build Coastguard Worker     /*
202*0e209d39SAndroid Build Coastguard Worker      * TODO: Revisit whether it makes sense to track reorderStart.
203*0e209d39SAndroid Build Coastguard Worker      * It is set to after the last known character with cc<=1,
204*0e209d39SAndroid Build Coastguard Worker      * which stops previousCC() before it reads that character and looks up its cc.
205*0e209d39SAndroid Build Coastguard Worker      * previousCC() is normally only called from insert().
206*0e209d39SAndroid Build Coastguard Worker      * In other words, reorderStart speeds up the insertion of a combining mark
207*0e209d39SAndroid Build Coastguard Worker      * into a multi-combining mark sequence where it does not belong at the end.
208*0e209d39SAndroid Build Coastguard Worker      * This might not be worth the trouble.
209*0e209d39SAndroid Build Coastguard Worker      * On the other hand, it's not a huge amount of trouble.
210*0e209d39SAndroid Build Coastguard Worker      *
211*0e209d39SAndroid Build Coastguard Worker      * We probably need it for UNORM_SIMPLE_APPEND.
212*0e209d39SAndroid Build Coastguard Worker      */
213*0e209d39SAndroid Build Coastguard Worker 
214*0e209d39SAndroid Build Coastguard Worker     UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
215*0e209d39SAndroid Build Coastguard Worker     void insert(UChar32 c, uint8_t cc);
writeCodePoint(char16_t * p,UChar32 c)216*0e209d39SAndroid Build Coastguard Worker     static void writeCodePoint(char16_t *p, UChar32 c) {
217*0e209d39SAndroid Build Coastguard Worker         if(c<=0xffff) {
218*0e209d39SAndroid Build Coastguard Worker             *p=(char16_t)c;
219*0e209d39SAndroid Build Coastguard Worker         } else {
220*0e209d39SAndroid Build Coastguard Worker             p[0]=U16_LEAD(c);
221*0e209d39SAndroid Build Coastguard Worker             p[1]=U16_TRAIL(c);
222*0e209d39SAndroid Build Coastguard Worker         }
223*0e209d39SAndroid Build Coastguard Worker     }
224*0e209d39SAndroid Build Coastguard Worker     UBool resize(int32_t appendLength, UErrorCode &errorCode);
225*0e209d39SAndroid Build Coastguard Worker 
226*0e209d39SAndroid Build Coastguard Worker     const Normalizer2Impl &impl;
227*0e209d39SAndroid Build Coastguard Worker     UnicodeString &str;
228*0e209d39SAndroid Build Coastguard Worker     char16_t *start, *reorderStart, *limit;
229*0e209d39SAndroid Build Coastguard Worker     int32_t remainingCapacity;
230*0e209d39SAndroid Build Coastguard Worker     uint8_t lastCC;
231*0e209d39SAndroid Build Coastguard Worker 
232*0e209d39SAndroid Build Coastguard Worker     // private backward iterator
setIterator()233*0e209d39SAndroid Build Coastguard Worker     void setIterator() { codePointStart=limit; }
234*0e209d39SAndroid Build Coastguard Worker     void skipPrevious();  // Requires start<codePointStart.
235*0e209d39SAndroid Build Coastguard Worker     uint8_t previousCC();  // Returns 0 if there is no previous character.
236*0e209d39SAndroid Build Coastguard Worker 
237*0e209d39SAndroid Build Coastguard Worker     char16_t *codePointStart, *codePointLimit;
238*0e209d39SAndroid Build Coastguard Worker };
239*0e209d39SAndroid Build Coastguard Worker 
240*0e209d39SAndroid Build Coastguard Worker /**
241*0e209d39SAndroid Build Coastguard Worker  * Low-level implementation of the Unicode Normalization Algorithm.
242*0e209d39SAndroid Build Coastguard Worker  * For the data structure and details see the documentation at the end of
243*0e209d39SAndroid Build Coastguard Worker  * this normalizer2impl.h and in the design doc at
244*0e209d39SAndroid Build Coastguard Worker  * https://icu.unicode.org/design/normalization/custom
245*0e209d39SAndroid Build Coastguard Worker  */
246*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API Normalizer2Impl : public UObject {
247*0e209d39SAndroid Build Coastguard Worker public:
Normalizer2Impl()248*0e209d39SAndroid Build Coastguard Worker     Normalizer2Impl() : normTrie(nullptr), fCanonIterData(nullptr) {}
249*0e209d39SAndroid Build Coastguard Worker     virtual ~Normalizer2Impl();
250*0e209d39SAndroid Build Coastguard Worker 
251*0e209d39SAndroid Build Coastguard Worker     void init(const int32_t *inIndexes, const UCPTrie *inTrie,
252*0e209d39SAndroid Build Coastguard Worker               const uint16_t *inExtraData, const uint8_t *inSmallFCD);
253*0e209d39SAndroid Build Coastguard Worker 
254*0e209d39SAndroid Build Coastguard Worker     void addLcccChars(UnicodeSet &set) const;
255*0e209d39SAndroid Build Coastguard Worker     void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
256*0e209d39SAndroid Build Coastguard Worker     void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
257*0e209d39SAndroid Build Coastguard Worker 
258*0e209d39SAndroid Build Coastguard Worker     // low-level properties ------------------------------------------------ ***
259*0e209d39SAndroid Build Coastguard Worker 
260*0e209d39SAndroid Build Coastguard Worker     UBool ensureCanonIterData(UErrorCode &errorCode) const;
261*0e209d39SAndroid Build Coastguard Worker 
262*0e209d39SAndroid Build Coastguard Worker     // The trie stores values for lead surrogate code *units*.
263*0e209d39SAndroid Build Coastguard Worker     // Surrogate code *points* are inert.
getNorm16(UChar32 c)264*0e209d39SAndroid Build Coastguard Worker     uint16_t getNorm16(UChar32 c) const {
265*0e209d39SAndroid Build Coastguard Worker         return U_IS_LEAD(c) ?
266*0e209d39SAndroid Build Coastguard Worker             static_cast<uint16_t>(INERT) :
267*0e209d39SAndroid Build Coastguard Worker             UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
268*0e209d39SAndroid Build Coastguard Worker     }
getRawNorm16(UChar32 c)269*0e209d39SAndroid Build Coastguard Worker     uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); }
270*0e209d39SAndroid Build Coastguard Worker 
getCompQuickCheck(uint16_t norm16)271*0e209d39SAndroid Build Coastguard Worker     UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
272*0e209d39SAndroid Build Coastguard Worker         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
273*0e209d39SAndroid Build Coastguard Worker             return UNORM_YES;
274*0e209d39SAndroid Build Coastguard Worker         } else if(minMaybeYes<=norm16) {
275*0e209d39SAndroid Build Coastguard Worker             return UNORM_MAYBE;
276*0e209d39SAndroid Build Coastguard Worker         } else {
277*0e209d39SAndroid Build Coastguard Worker             return UNORM_NO;
278*0e209d39SAndroid Build Coastguard Worker         }
279*0e209d39SAndroid Build Coastguard Worker     }
isAlgorithmicNoNo(uint16_t norm16)280*0e209d39SAndroid Build Coastguard Worker     UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; }
isCompNo(uint16_t norm16)281*0e209d39SAndroid Build Coastguard Worker     UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
isDecompYes(uint16_t norm16)282*0e209d39SAndroid Build Coastguard Worker     UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
283*0e209d39SAndroid Build Coastguard Worker 
getCC(uint16_t norm16)284*0e209d39SAndroid Build Coastguard Worker     uint8_t getCC(uint16_t norm16) const {
285*0e209d39SAndroid Build Coastguard Worker         if(norm16>=MIN_NORMAL_MAYBE_YES) {
286*0e209d39SAndroid Build Coastguard Worker             return getCCFromNormalYesOrMaybe(norm16);
287*0e209d39SAndroid Build Coastguard Worker         }
288*0e209d39SAndroid Build Coastguard Worker         if(norm16<minNoNo || limitNoNo<=norm16) {
289*0e209d39SAndroid Build Coastguard Worker             return 0;
290*0e209d39SAndroid Build Coastguard Worker         }
291*0e209d39SAndroid Build Coastguard Worker         return getCCFromNoNo(norm16);
292*0e209d39SAndroid Build Coastguard Worker     }
getCCFromNormalYesOrMaybe(uint16_t norm16)293*0e209d39SAndroid Build Coastguard Worker     static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
294*0e209d39SAndroid Build Coastguard Worker         return (uint8_t)(norm16 >> OFFSET_SHIFT);
295*0e209d39SAndroid Build Coastguard Worker     }
getCCFromYesOrMaybe(uint16_t norm16)296*0e209d39SAndroid Build Coastguard Worker     static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
297*0e209d39SAndroid Build Coastguard Worker         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
298*0e209d39SAndroid Build Coastguard Worker     }
getCCFromYesOrMaybeCP(UChar32 c)299*0e209d39SAndroid Build Coastguard Worker     uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
300*0e209d39SAndroid Build Coastguard Worker         if (c < minCompNoMaybeCP) { return 0; }
301*0e209d39SAndroid Build Coastguard Worker         return getCCFromYesOrMaybe(getNorm16(c));
302*0e209d39SAndroid Build Coastguard Worker     }
303*0e209d39SAndroid Build Coastguard Worker 
304*0e209d39SAndroid Build Coastguard Worker     /**
305*0e209d39SAndroid Build Coastguard Worker      * Returns the FCD data for code point c.
306*0e209d39SAndroid Build Coastguard Worker      * @param c A Unicode code point.
307*0e209d39SAndroid Build Coastguard Worker      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
308*0e209d39SAndroid Build Coastguard Worker      */
getFCD16(UChar32 c)309*0e209d39SAndroid Build Coastguard Worker     uint16_t getFCD16(UChar32 c) const {
310*0e209d39SAndroid Build Coastguard Worker         if(c<minDecompNoCP) {
311*0e209d39SAndroid Build Coastguard Worker             return 0;
312*0e209d39SAndroid Build Coastguard Worker         } else if(c<=0xffff) {
313*0e209d39SAndroid Build Coastguard Worker             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
314*0e209d39SAndroid Build Coastguard Worker         }
315*0e209d39SAndroid Build Coastguard Worker         return getFCD16FromNormData(c);
316*0e209d39SAndroid Build Coastguard Worker     }
317*0e209d39SAndroid Build Coastguard Worker     /**
318*0e209d39SAndroid Build Coastguard Worker      * Returns the FCD data for the next code point (post-increment).
319*0e209d39SAndroid Build Coastguard Worker      * Might skip only a lead surrogate rather than the whole surrogate pair if none of
320*0e209d39SAndroid Build Coastguard Worker      * the supplementary code points associated with the lead surrogate have non-zero FCD data.
321*0e209d39SAndroid Build Coastguard Worker      * @param s A valid pointer into a string. Requires s!=limit.
322*0e209d39SAndroid Build Coastguard Worker      * @param limit The end of the string, or NULL.
323*0e209d39SAndroid Build Coastguard Worker      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
324*0e209d39SAndroid Build Coastguard Worker      */
nextFCD16(const char16_t * & s,const char16_t * limit)325*0e209d39SAndroid Build Coastguard Worker     uint16_t nextFCD16(const char16_t *&s, const char16_t *limit) const {
326*0e209d39SAndroid Build Coastguard Worker         UChar32 c=*s++;
327*0e209d39SAndroid Build Coastguard Worker         if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
328*0e209d39SAndroid Build Coastguard Worker             return 0;
329*0e209d39SAndroid Build Coastguard Worker         }
330*0e209d39SAndroid Build Coastguard Worker         char16_t c2;
331*0e209d39SAndroid Build Coastguard Worker         if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
332*0e209d39SAndroid Build Coastguard Worker             c=U16_GET_SUPPLEMENTARY(c, c2);
333*0e209d39SAndroid Build Coastguard Worker             ++s;
334*0e209d39SAndroid Build Coastguard Worker         }
335*0e209d39SAndroid Build Coastguard Worker         return getFCD16FromNormData(c);
336*0e209d39SAndroid Build Coastguard Worker     }
337*0e209d39SAndroid Build Coastguard Worker     /**
338*0e209d39SAndroid Build Coastguard Worker      * Returns the FCD data for the previous code point (pre-decrement).
339*0e209d39SAndroid Build Coastguard Worker      * @param start The start of the string.
340*0e209d39SAndroid Build Coastguard Worker      * @param s A valid pointer into a string. Requires start<s.
341*0e209d39SAndroid Build Coastguard Worker      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
342*0e209d39SAndroid Build Coastguard Worker      */
previousFCD16(const char16_t * start,const char16_t * & s)343*0e209d39SAndroid Build Coastguard Worker     uint16_t previousFCD16(const char16_t *start, const char16_t *&s) const {
344*0e209d39SAndroid Build Coastguard Worker         UChar32 c=*--s;
345*0e209d39SAndroid Build Coastguard Worker         if(c<minDecompNoCP) {
346*0e209d39SAndroid Build Coastguard Worker             return 0;
347*0e209d39SAndroid Build Coastguard Worker         }
348*0e209d39SAndroid Build Coastguard Worker         if(!U16_IS_TRAIL(c)) {
349*0e209d39SAndroid Build Coastguard Worker             if(!singleLeadMightHaveNonZeroFCD16(c)) {
350*0e209d39SAndroid Build Coastguard Worker                 return 0;
351*0e209d39SAndroid Build Coastguard Worker             }
352*0e209d39SAndroid Build Coastguard Worker         } else {
353*0e209d39SAndroid Build Coastguard Worker             char16_t c2;
354*0e209d39SAndroid Build Coastguard Worker             if(start<s && U16_IS_LEAD(c2=*(s-1))) {
355*0e209d39SAndroid Build Coastguard Worker                 c=U16_GET_SUPPLEMENTARY(c2, c);
356*0e209d39SAndroid Build Coastguard Worker                 --s;
357*0e209d39SAndroid Build Coastguard Worker             }
358*0e209d39SAndroid Build Coastguard Worker         }
359*0e209d39SAndroid Build Coastguard Worker         return getFCD16FromNormData(c);
360*0e209d39SAndroid Build Coastguard Worker     }
361*0e209d39SAndroid Build Coastguard Worker 
362*0e209d39SAndroid Build Coastguard Worker     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
singleLeadMightHaveNonZeroFCD16(UChar32 lead)363*0e209d39SAndroid Build Coastguard Worker     UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
364*0e209d39SAndroid Build Coastguard Worker         // 0<=lead<=0xffff
365*0e209d39SAndroid Build Coastguard Worker         uint8_t bits=smallFCD[lead>>8];
366*0e209d39SAndroid Build Coastguard Worker         if(bits==0) { return false; }
367*0e209d39SAndroid Build Coastguard Worker         return (UBool)((bits>>((lead>>5)&7))&1);
368*0e209d39SAndroid Build Coastguard Worker     }
369*0e209d39SAndroid Build Coastguard Worker     /** Returns the FCD value from the regular normalization data. */
370*0e209d39SAndroid Build Coastguard Worker     uint16_t getFCD16FromNormData(UChar32 c) const;
371*0e209d39SAndroid Build Coastguard Worker 
372*0e209d39SAndroid Build Coastguard Worker     /**
373*0e209d39SAndroid Build Coastguard Worker      * Gets the decomposition for one code point.
374*0e209d39SAndroid Build Coastguard Worker      * @param c code point
375*0e209d39SAndroid Build Coastguard Worker      * @param buffer out-only buffer for algorithmic decompositions
376*0e209d39SAndroid Build Coastguard Worker      * @param length out-only, takes the length of the decomposition, if any
377*0e209d39SAndroid Build Coastguard Worker      * @return pointer to the decomposition, or NULL if none
378*0e209d39SAndroid Build Coastguard Worker      */
379*0e209d39SAndroid Build Coastguard Worker     const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const;
380*0e209d39SAndroid Build Coastguard Worker 
381*0e209d39SAndroid Build Coastguard Worker     /**
382*0e209d39SAndroid Build Coastguard Worker      * Gets the raw decomposition for one code point.
383*0e209d39SAndroid Build Coastguard Worker      * @param c code point
384*0e209d39SAndroid Build Coastguard Worker      * @param buffer out-only buffer for algorithmic decompositions
385*0e209d39SAndroid Build Coastguard Worker      * @param length out-only, takes the length of the decomposition, if any
386*0e209d39SAndroid Build Coastguard Worker      * @return pointer to the decomposition, or NULL if none
387*0e209d39SAndroid Build Coastguard Worker      */
388*0e209d39SAndroid Build Coastguard Worker     const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const;
389*0e209d39SAndroid Build Coastguard Worker 
390*0e209d39SAndroid Build Coastguard Worker     UChar32 composePair(UChar32 a, UChar32 b) const;
391*0e209d39SAndroid Build Coastguard Worker 
392*0e209d39SAndroid Build Coastguard Worker     UBool isCanonSegmentStarter(UChar32 c) const;
393*0e209d39SAndroid Build Coastguard Worker     UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
394*0e209d39SAndroid Build Coastguard Worker 
395*0e209d39SAndroid Build Coastguard Worker     enum {
396*0e209d39SAndroid Build Coastguard Worker         // Fixed norm16 values.
397*0e209d39SAndroid Build Coastguard Worker         MIN_YES_YES_WITH_CC=0xfe02,
398*0e209d39SAndroid Build Coastguard Worker         JAMO_VT=0xfe00,
399*0e209d39SAndroid Build Coastguard Worker         MIN_NORMAL_MAYBE_YES=0xfc00,
400*0e209d39SAndroid Build Coastguard Worker         JAMO_L=2,  // offset=1 hasCompBoundaryAfter=false
401*0e209d39SAndroid Build Coastguard Worker         INERT=1,  // offset=0 hasCompBoundaryAfter=true
402*0e209d39SAndroid Build Coastguard Worker 
403*0e209d39SAndroid Build Coastguard Worker         // norm16 bit 0 is comp-boundary-after.
404*0e209d39SAndroid Build Coastguard Worker         HAS_COMP_BOUNDARY_AFTER=1,
405*0e209d39SAndroid Build Coastguard Worker         OFFSET_SHIFT=1,
406*0e209d39SAndroid Build Coastguard Worker 
407*0e209d39SAndroid Build Coastguard Worker         // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
408*0e209d39SAndroid Build Coastguard Worker         // tccc (0, 1, >1) for quick FCC boundary-after tests.
409*0e209d39SAndroid Build Coastguard Worker         DELTA_TCCC_0=0,
410*0e209d39SAndroid Build Coastguard Worker         DELTA_TCCC_1=2,
411*0e209d39SAndroid Build Coastguard Worker         DELTA_TCCC_GT_1=4,
412*0e209d39SAndroid Build Coastguard Worker         DELTA_TCCC_MASK=6,
413*0e209d39SAndroid Build Coastguard Worker         DELTA_SHIFT=3,
414*0e209d39SAndroid Build Coastguard Worker 
415*0e209d39SAndroid Build Coastguard Worker         MAX_DELTA=0x40
416*0e209d39SAndroid Build Coastguard Worker     };
417*0e209d39SAndroid Build Coastguard Worker 
418*0e209d39SAndroid Build Coastguard Worker     enum {
419*0e209d39SAndroid Build Coastguard Worker         // Byte offsets from the start of the data, after the generic header.
420*0e209d39SAndroid Build Coastguard Worker         IX_NORM_TRIE_OFFSET,
421*0e209d39SAndroid Build Coastguard Worker         IX_EXTRA_DATA_OFFSET,
422*0e209d39SAndroid Build Coastguard Worker         IX_SMALL_FCD_OFFSET,
423*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED3_OFFSET,
424*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED4_OFFSET,
425*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED5_OFFSET,
426*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED6_OFFSET,
427*0e209d39SAndroid Build Coastguard Worker         IX_TOTAL_SIZE,
428*0e209d39SAndroid Build Coastguard Worker 
429*0e209d39SAndroid Build Coastguard Worker         // Code point thresholds for quick check codes.
430*0e209d39SAndroid Build Coastguard Worker         IX_MIN_DECOMP_NO_CP,
431*0e209d39SAndroid Build Coastguard Worker         IX_MIN_COMP_NO_MAYBE_CP,
432*0e209d39SAndroid Build Coastguard Worker 
433*0e209d39SAndroid Build Coastguard Worker         // Norm16 value thresholds for quick check combinations and types of extra data.
434*0e209d39SAndroid Build Coastguard Worker 
435*0e209d39SAndroid Build Coastguard Worker         /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
436*0e209d39SAndroid Build Coastguard Worker         IX_MIN_YES_NO,
437*0e209d39SAndroid Build Coastguard Worker         /** Mappings are comp-normalized. */
438*0e209d39SAndroid Build Coastguard Worker         IX_MIN_NO_NO,
439*0e209d39SAndroid Build Coastguard Worker         IX_LIMIT_NO_NO,
440*0e209d39SAndroid Build Coastguard Worker         IX_MIN_MAYBE_YES,
441*0e209d39SAndroid Build Coastguard Worker 
442*0e209d39SAndroid Build Coastguard Worker         /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
443*0e209d39SAndroid Build Coastguard Worker         IX_MIN_YES_NO_MAPPINGS_ONLY,
444*0e209d39SAndroid Build Coastguard Worker         /** Mappings are not comp-normalized but have a comp boundary before. */
445*0e209d39SAndroid Build Coastguard Worker         IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
446*0e209d39SAndroid Build Coastguard Worker         /** Mappings do not have a comp boundary before. */
447*0e209d39SAndroid Build Coastguard Worker         IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
448*0e209d39SAndroid Build Coastguard Worker         /** Mappings to the empty string. */
449*0e209d39SAndroid Build Coastguard Worker         IX_MIN_NO_NO_EMPTY,
450*0e209d39SAndroid Build Coastguard Worker 
451*0e209d39SAndroid Build Coastguard Worker         IX_MIN_LCCC_CP,
452*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED19,
453*0e209d39SAndroid Build Coastguard Worker         IX_COUNT
454*0e209d39SAndroid Build Coastguard Worker     };
455*0e209d39SAndroid Build Coastguard Worker 
456*0e209d39SAndroid Build Coastguard Worker     enum {
457*0e209d39SAndroid Build Coastguard Worker         MAPPING_HAS_CCC_LCCC_WORD=0x80,
458*0e209d39SAndroid Build Coastguard Worker         MAPPING_HAS_RAW_MAPPING=0x40,
459*0e209d39SAndroid Build Coastguard Worker         // unused bit 0x20,
460*0e209d39SAndroid Build Coastguard Worker         MAPPING_LENGTH_MASK=0x1f
461*0e209d39SAndroid Build Coastguard Worker     };
462*0e209d39SAndroid Build Coastguard Worker 
463*0e209d39SAndroid Build Coastguard Worker     enum {
464*0e209d39SAndroid Build Coastguard Worker         COMP_1_LAST_TUPLE=0x8000,
465*0e209d39SAndroid Build Coastguard Worker         COMP_1_TRIPLE=1,
466*0e209d39SAndroid Build Coastguard Worker         COMP_1_TRAIL_LIMIT=0x3400,
467*0e209d39SAndroid Build Coastguard Worker         COMP_1_TRAIL_MASK=0x7ffe,
468*0e209d39SAndroid Build Coastguard Worker         COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
469*0e209d39SAndroid Build Coastguard Worker         COMP_2_TRAIL_SHIFT=6,
470*0e209d39SAndroid Build Coastguard Worker         COMP_2_TRAIL_MASK=0xffc0
471*0e209d39SAndroid Build Coastguard Worker     };
472*0e209d39SAndroid Build Coastguard Worker 
473*0e209d39SAndroid Build Coastguard Worker     // higher-level functionality ------------------------------------------ ***
474*0e209d39SAndroid Build Coastguard Worker 
475*0e209d39SAndroid Build Coastguard Worker     // NFD without an NFD Normalizer2 instance.
476*0e209d39SAndroid Build Coastguard Worker     UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest,
477*0e209d39SAndroid Build Coastguard Worker                              UErrorCode &errorCode) const;
478*0e209d39SAndroid Build Coastguard Worker     /**
479*0e209d39SAndroid Build Coastguard Worker      * Decomposes [src, limit[ and writes the result to dest.
480*0e209d39SAndroid Build Coastguard Worker      * limit can be NULL if src is NUL-terminated.
481*0e209d39SAndroid Build Coastguard Worker      * destLengthEstimate is the initial dest buffer capacity and can be -1.
482*0e209d39SAndroid Build Coastguard Worker      */
483*0e209d39SAndroid Build Coastguard Worker     void decompose(const char16_t *src, const char16_t *limit,
484*0e209d39SAndroid Build Coastguard Worker                    UnicodeString &dest, int32_t destLengthEstimate,
485*0e209d39SAndroid Build Coastguard Worker                    UErrorCode &errorCode) const;
486*0e209d39SAndroid Build Coastguard Worker 
487*0e209d39SAndroid Build Coastguard Worker     const char16_t *decompose(const char16_t *src, const char16_t *limit,
488*0e209d39SAndroid Build Coastguard Worker                            ReorderingBuffer *buffer, UErrorCode &errorCode) const;
489*0e209d39SAndroid Build Coastguard Worker     void decomposeAndAppend(const char16_t *src, const char16_t *limit,
490*0e209d39SAndroid Build Coastguard Worker                             UBool doDecompose,
491*0e209d39SAndroid Build Coastguard Worker                             UnicodeString &safeMiddle,
492*0e209d39SAndroid Build Coastguard Worker                             ReorderingBuffer &buffer,
493*0e209d39SAndroid Build Coastguard Worker                             UErrorCode &errorCode) const;
494*0e209d39SAndroid Build Coastguard Worker 
495*0e209d39SAndroid Build Coastguard Worker     /** sink==nullptr: isNormalized()/spanQuickCheckYes() */
496*0e209d39SAndroid Build Coastguard Worker     const uint8_t *decomposeUTF8(uint32_t options,
497*0e209d39SAndroid Build Coastguard Worker                                  const uint8_t *src, const uint8_t *limit,
498*0e209d39SAndroid Build Coastguard Worker                                  ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
499*0e209d39SAndroid Build Coastguard Worker 
500*0e209d39SAndroid Build Coastguard Worker     UBool compose(const char16_t *src, const char16_t *limit,
501*0e209d39SAndroid Build Coastguard Worker                   UBool onlyContiguous,
502*0e209d39SAndroid Build Coastguard Worker                   UBool doCompose,
503*0e209d39SAndroid Build Coastguard Worker                   ReorderingBuffer &buffer,
504*0e209d39SAndroid Build Coastguard Worker                   UErrorCode &errorCode) const;
505*0e209d39SAndroid Build Coastguard Worker     const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit,
506*0e209d39SAndroid Build Coastguard Worker                                    UBool onlyContiguous,
507*0e209d39SAndroid Build Coastguard Worker                                    UNormalizationCheckResult *pQCResult) const;
508*0e209d39SAndroid Build Coastguard Worker     void composeAndAppend(const char16_t *src, const char16_t *limit,
509*0e209d39SAndroid Build Coastguard Worker                           UBool doCompose,
510*0e209d39SAndroid Build Coastguard Worker                           UBool onlyContiguous,
511*0e209d39SAndroid Build Coastguard Worker                           UnicodeString &safeMiddle,
512*0e209d39SAndroid Build Coastguard Worker                           ReorderingBuffer &buffer,
513*0e209d39SAndroid Build Coastguard Worker                           UErrorCode &errorCode) const;
514*0e209d39SAndroid Build Coastguard Worker 
515*0e209d39SAndroid Build Coastguard Worker     /** sink==nullptr: isNormalized() */
516*0e209d39SAndroid Build Coastguard Worker     UBool composeUTF8(uint32_t options, UBool onlyContiguous,
517*0e209d39SAndroid Build Coastguard Worker                       const uint8_t *src, const uint8_t *limit,
518*0e209d39SAndroid Build Coastguard Worker                       ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
519*0e209d39SAndroid Build Coastguard Worker 
520*0e209d39SAndroid Build Coastguard Worker     const char16_t *makeFCD(const char16_t *src, const char16_t *limit,
521*0e209d39SAndroid Build Coastguard Worker                          ReorderingBuffer *buffer, UErrorCode &errorCode) const;
522*0e209d39SAndroid Build Coastguard Worker     void makeFCDAndAppend(const char16_t *src, const char16_t *limit,
523*0e209d39SAndroid Build Coastguard Worker                           UBool doMakeFCD,
524*0e209d39SAndroid Build Coastguard Worker                           UnicodeString &safeMiddle,
525*0e209d39SAndroid Build Coastguard Worker                           ReorderingBuffer &buffer,
526*0e209d39SAndroid Build Coastguard Worker                           UErrorCode &errorCode) const;
527*0e209d39SAndroid Build Coastguard Worker 
528*0e209d39SAndroid Build Coastguard Worker     UBool hasDecompBoundaryBefore(UChar32 c) const;
529*0e209d39SAndroid Build Coastguard Worker     UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
530*0e209d39SAndroid Build Coastguard Worker     UBool hasDecompBoundaryAfter(UChar32 c) const;
531*0e209d39SAndroid Build Coastguard Worker     UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
isDecompInert(UChar32 c)532*0e209d39SAndroid Build Coastguard Worker     UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
533*0e209d39SAndroid Build Coastguard Worker 
hasCompBoundaryBefore(UChar32 c)534*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryBefore(UChar32 c) const {
535*0e209d39SAndroid Build Coastguard Worker         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
536*0e209d39SAndroid Build Coastguard Worker     }
hasCompBoundaryAfter(UChar32 c,UBool onlyContiguous)537*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
538*0e209d39SAndroid Build Coastguard Worker         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
539*0e209d39SAndroid Build Coastguard Worker     }
isCompInert(UChar32 c,UBool onlyContiguous)540*0e209d39SAndroid Build Coastguard Worker     UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
541*0e209d39SAndroid Build Coastguard Worker         uint16_t norm16=getNorm16(c);
542*0e209d39SAndroid Build Coastguard Worker         return isCompYesAndZeroCC(norm16) &&
543*0e209d39SAndroid Build Coastguard Worker             (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
544*0e209d39SAndroid Build Coastguard Worker             (!onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff);
545*0e209d39SAndroid Build Coastguard Worker     }
546*0e209d39SAndroid Build Coastguard Worker 
hasFCDBoundaryBefore(UChar32 c)547*0e209d39SAndroid Build Coastguard Worker     UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
hasFCDBoundaryAfter(UChar32 c)548*0e209d39SAndroid Build Coastguard Worker     UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
isFCDInert(UChar32 c)549*0e209d39SAndroid Build Coastguard Worker     UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
550*0e209d39SAndroid Build Coastguard Worker private:
551*0e209d39SAndroid Build Coastguard Worker     friend class InitCanonIterData;
552*0e209d39SAndroid Build Coastguard Worker     friend class LcccContext;
553*0e209d39SAndroid Build Coastguard Worker 
isMaybe(uint16_t norm16)554*0e209d39SAndroid Build Coastguard Worker     UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
isMaybeOrNonZeroCC(uint16_t norm16)555*0e209d39SAndroid Build Coastguard Worker     UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
isInert(uint16_t norm16)556*0e209d39SAndroid Build Coastguard Worker     static UBool isInert(uint16_t norm16) { return norm16==INERT; }
isJamoL(uint16_t norm16)557*0e209d39SAndroid Build Coastguard Worker     static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
isJamoVT(uint16_t norm16)558*0e209d39SAndroid Build Coastguard Worker     static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
hangulLVT()559*0e209d39SAndroid Build Coastguard Worker     uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
isHangulLV(uint16_t norm16)560*0e209d39SAndroid Build Coastguard Worker     UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
isHangulLVT(uint16_t norm16)561*0e209d39SAndroid Build Coastguard Worker     UBool isHangulLVT(uint16_t norm16) const {
562*0e209d39SAndroid Build Coastguard Worker         return norm16==hangulLVT();
563*0e209d39SAndroid Build Coastguard Worker     }
isCompYesAndZeroCC(uint16_t norm16)564*0e209d39SAndroid Build Coastguard Worker     UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
565*0e209d39SAndroid Build Coastguard Worker     // UBool isCompYes(uint16_t norm16) const {
566*0e209d39SAndroid Build Coastguard Worker     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
567*0e209d39SAndroid Build Coastguard Worker     // }
568*0e209d39SAndroid Build Coastguard Worker     // UBool isCompYesOrMaybe(uint16_t norm16) const {
569*0e209d39SAndroid Build Coastguard Worker     //     return norm16<minNoNo || minMaybeYes<=norm16;
570*0e209d39SAndroid Build Coastguard Worker     // }
571*0e209d39SAndroid Build Coastguard Worker     // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
572*0e209d39SAndroid Build Coastguard Worker     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
573*0e209d39SAndroid Build Coastguard Worker     // }
isDecompYesAndZeroCC(uint16_t norm16)574*0e209d39SAndroid Build Coastguard Worker     UBool isDecompYesAndZeroCC(uint16_t norm16) const {
575*0e209d39SAndroid Build Coastguard Worker         return norm16<minYesNo ||
576*0e209d39SAndroid Build Coastguard Worker                norm16==JAMO_VT ||
577*0e209d39SAndroid Build Coastguard Worker                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
578*0e209d39SAndroid Build Coastguard Worker     }
579*0e209d39SAndroid Build Coastguard Worker     /**
580*0e209d39SAndroid Build Coastguard Worker      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
581*0e209d39SAndroid Build Coastguard Worker      * the MaybeYes which combine-forward and have ccc=0.
582*0e209d39SAndroid Build Coastguard Worker      * (Standard Unicode 10 normalization does not have such characters.)
583*0e209d39SAndroid Build Coastguard Worker      */
isMostDecompYesAndZeroCC(uint16_t norm16)584*0e209d39SAndroid Build Coastguard Worker     UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
585*0e209d39SAndroid Build Coastguard Worker         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
586*0e209d39SAndroid Build Coastguard Worker     }
isDecompNoAlgorithmic(uint16_t norm16)587*0e209d39SAndroid Build Coastguard Worker     UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
588*0e209d39SAndroid Build Coastguard Worker 
589*0e209d39SAndroid Build Coastguard Worker     // For use with isCompYes().
590*0e209d39SAndroid Build Coastguard Worker     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
591*0e209d39SAndroid Build Coastguard Worker     // static uint8_t getCCFromYes(uint16_t norm16) {
592*0e209d39SAndroid Build Coastguard Worker     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
593*0e209d39SAndroid Build Coastguard Worker     // }
getCCFromNoNo(uint16_t norm16)594*0e209d39SAndroid Build Coastguard Worker     uint8_t getCCFromNoNo(uint16_t norm16) const {
595*0e209d39SAndroid Build Coastguard Worker         const uint16_t *mapping=getMapping(norm16);
596*0e209d39SAndroid Build Coastguard Worker         if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
597*0e209d39SAndroid Build Coastguard Worker             return (uint8_t)*(mapping-1);
598*0e209d39SAndroid Build Coastguard Worker         } else {
599*0e209d39SAndroid Build Coastguard Worker             return 0;
600*0e209d39SAndroid Build Coastguard Worker         }
601*0e209d39SAndroid Build Coastguard Worker     }
602*0e209d39SAndroid Build Coastguard Worker     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
getTrailCCFromCompYesAndZeroCC(uint16_t norm16)603*0e209d39SAndroid Build Coastguard Worker     uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
604*0e209d39SAndroid Build Coastguard Worker         if(norm16<=minYesNo) {
605*0e209d39SAndroid Build Coastguard Worker             return 0;  // yesYes and Hangul LV have ccc=tccc=0
606*0e209d39SAndroid Build Coastguard Worker         } else {
607*0e209d39SAndroid Build Coastguard Worker             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
608*0e209d39SAndroid Build Coastguard Worker             return (uint8_t)(*getMapping(norm16)>>8);  // tccc from yesNo
609*0e209d39SAndroid Build Coastguard Worker         }
610*0e209d39SAndroid Build Coastguard Worker     }
611*0e209d39SAndroid Build Coastguard Worker     uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const;
612*0e209d39SAndroid Build Coastguard Worker     uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
613*0e209d39SAndroid Build Coastguard Worker 
614*0e209d39SAndroid Build Coastguard Worker     // Requires algorithmic-NoNo.
mapAlgorithmic(UChar32 c,uint16_t norm16)615*0e209d39SAndroid Build Coastguard Worker     UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
616*0e209d39SAndroid Build Coastguard Worker         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
617*0e209d39SAndroid Build Coastguard Worker     }
getAlgorithmicDelta(uint16_t norm16)618*0e209d39SAndroid Build Coastguard Worker     UChar32 getAlgorithmicDelta(uint16_t norm16) const {
619*0e209d39SAndroid Build Coastguard Worker         return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
620*0e209d39SAndroid Build Coastguard Worker     }
621*0e209d39SAndroid Build Coastguard Worker 
622*0e209d39SAndroid Build Coastguard Worker     // Requires minYesNo<norm16<limitNoNo.
getMapping(uint16_t norm16)623*0e209d39SAndroid Build Coastguard Worker     const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
getCompositionsListForDecompYes(uint16_t norm16)624*0e209d39SAndroid Build Coastguard Worker     const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
625*0e209d39SAndroid Build Coastguard Worker         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
626*0e209d39SAndroid Build Coastguard Worker             return nullptr;
627*0e209d39SAndroid Build Coastguard Worker         } else if(norm16<minMaybeYes) {
628*0e209d39SAndroid Build Coastguard Worker             return getMapping(norm16);  // for yesYes; if Jamo L: harmless empty list
629*0e209d39SAndroid Build Coastguard Worker         } else {
630*0e209d39SAndroid Build Coastguard Worker             return maybeYesCompositions+norm16-minMaybeYes;
631*0e209d39SAndroid Build Coastguard Worker         }
632*0e209d39SAndroid Build Coastguard Worker     }
getCompositionsListForComposite(uint16_t norm16)633*0e209d39SAndroid Build Coastguard Worker     const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
634*0e209d39SAndroid Build Coastguard Worker         // A composite has both mapping & compositions list.
635*0e209d39SAndroid Build Coastguard Worker         const uint16_t *list=getMapping(norm16);
636*0e209d39SAndroid Build Coastguard Worker         return list+  // mapping pointer
637*0e209d39SAndroid Build Coastguard Worker             1+  // +1 to skip the first unit with the mapping length
638*0e209d39SAndroid Build Coastguard Worker             (*list&MAPPING_LENGTH_MASK);  // + mapping length
639*0e209d39SAndroid Build Coastguard Worker     }
getCompositionsListForMaybe(uint16_t norm16)640*0e209d39SAndroid Build Coastguard Worker     const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
641*0e209d39SAndroid Build Coastguard Worker         // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
642*0e209d39SAndroid Build Coastguard Worker         return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
643*0e209d39SAndroid Build Coastguard Worker     }
644*0e209d39SAndroid Build Coastguard Worker     /**
645*0e209d39SAndroid Build Coastguard Worker      * @param c code point must have compositions
646*0e209d39SAndroid Build Coastguard Worker      * @return compositions list pointer
647*0e209d39SAndroid Build Coastguard Worker      */
getCompositionsList(uint16_t norm16)648*0e209d39SAndroid Build Coastguard Worker     const uint16_t *getCompositionsList(uint16_t norm16) const {
649*0e209d39SAndroid Build Coastguard Worker         return isDecompYes(norm16) ?
650*0e209d39SAndroid Build Coastguard Worker                 getCompositionsListForDecompYes(norm16) :
651*0e209d39SAndroid Build Coastguard Worker                 getCompositionsListForComposite(norm16);
652*0e209d39SAndroid Build Coastguard Worker     }
653*0e209d39SAndroid Build Coastguard Worker 
654*0e209d39SAndroid Build Coastguard Worker     const char16_t *copyLowPrefixFromNulTerminated(const char16_t *src,
655*0e209d39SAndroid Build Coastguard Worker                                                 UChar32 minNeedDataCP,
656*0e209d39SAndroid Build Coastguard Worker                                                 ReorderingBuffer *buffer,
657*0e209d39SAndroid Build Coastguard Worker                                                 UErrorCode &errorCode) const;
658*0e209d39SAndroid Build Coastguard Worker 
659*0e209d39SAndroid Build Coastguard Worker     enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };
660*0e209d39SAndroid Build Coastguard Worker 
661*0e209d39SAndroid Build Coastguard Worker     const char16_t *decomposeShort(const char16_t *src, const char16_t *limit,
662*0e209d39SAndroid Build Coastguard Worker                                 UBool stopAtCompBoundary, UBool onlyContiguous,
663*0e209d39SAndroid Build Coastguard Worker                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const;
664*0e209d39SAndroid Build Coastguard Worker     UBool decompose(UChar32 c, uint16_t norm16,
665*0e209d39SAndroid Build Coastguard Worker                     ReorderingBuffer &buffer, UErrorCode &errorCode) const;
666*0e209d39SAndroid Build Coastguard Worker 
667*0e209d39SAndroid Build Coastguard Worker     const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
668*0e209d39SAndroid Build Coastguard Worker                                   StopAt stopAt, UBool onlyContiguous,
669*0e209d39SAndroid Build Coastguard Worker                                   ReorderingBuffer &buffer, UErrorCode &errorCode) const;
670*0e209d39SAndroid Build Coastguard Worker 
671*0e209d39SAndroid Build Coastguard Worker     static int32_t combine(const uint16_t *list, UChar32 trail);
672*0e209d39SAndroid Build Coastguard Worker     void addComposites(const uint16_t *list, UnicodeSet &set) const;
673*0e209d39SAndroid Build Coastguard Worker     void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
674*0e209d39SAndroid Build Coastguard Worker                    UBool onlyContiguous) const;
675*0e209d39SAndroid Build Coastguard Worker 
hasCompBoundaryBefore(UChar32 c,uint16_t norm16)676*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
677*0e209d39SAndroid Build Coastguard Worker         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
678*0e209d39SAndroid Build Coastguard Worker     }
norm16HasCompBoundaryBefore(uint16_t norm16)679*0e209d39SAndroid Build Coastguard Worker     UBool norm16HasCompBoundaryBefore(uint16_t norm16) const  {
680*0e209d39SAndroid Build Coastguard Worker         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
681*0e209d39SAndroid Build Coastguard Worker     }
682*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryBefore(const char16_t *src, const char16_t *limit) const;
683*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
684*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryAfter(const char16_t *start, const char16_t *p,
685*0e209d39SAndroid Build Coastguard Worker                                UBool onlyContiguous) const;
686*0e209d39SAndroid Build Coastguard Worker     UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
687*0e209d39SAndroid Build Coastguard Worker                                UBool onlyContiguous) const;
norm16HasCompBoundaryAfter(uint16_t norm16,UBool onlyContiguous)688*0e209d39SAndroid Build Coastguard Worker     UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
689*0e209d39SAndroid Build Coastguard Worker         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
690*0e209d39SAndroid Build Coastguard Worker             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
691*0e209d39SAndroid Build Coastguard Worker     }
692*0e209d39SAndroid Build Coastguard Worker     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
isTrailCC01ForCompBoundaryAfter(uint16_t norm16)693*0e209d39SAndroid Build Coastguard Worker     UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
694*0e209d39SAndroid Build Coastguard Worker         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
695*0e209d39SAndroid Build Coastguard Worker             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
696*0e209d39SAndroid Build Coastguard Worker     }
697*0e209d39SAndroid Build Coastguard Worker 
698*0e209d39SAndroid Build Coastguard Worker     const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p, UBool onlyContiguous) const;
699*0e209d39SAndroid Build Coastguard Worker     const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit, UBool onlyContiguous) const;
700*0e209d39SAndroid Build Coastguard Worker 
701*0e209d39SAndroid Build Coastguard Worker     const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
702*0e209d39SAndroid Build Coastguard Worker     const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;
703*0e209d39SAndroid Build Coastguard Worker 
704*0e209d39SAndroid Build Coastguard Worker     void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
705*0e209d39SAndroid Build Coastguard Worker                                      CanonIterData &newData, UErrorCode &errorCode) const;
706*0e209d39SAndroid Build Coastguard Worker 
707*0e209d39SAndroid Build Coastguard Worker     int32_t getCanonValue(UChar32 c) const;
708*0e209d39SAndroid Build Coastguard Worker     const UnicodeSet &getCanonStartSet(int32_t n) const;
709*0e209d39SAndroid Build Coastguard Worker 
710*0e209d39SAndroid Build Coastguard Worker     // UVersionInfo dataVersion;
711*0e209d39SAndroid Build Coastguard Worker 
712*0e209d39SAndroid Build Coastguard Worker     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
713*0e209d39SAndroid Build Coastguard Worker     char16_t minDecompNoCP;
714*0e209d39SAndroid Build Coastguard Worker     char16_t minCompNoMaybeCP;
715*0e209d39SAndroid Build Coastguard Worker     char16_t minLcccCP;
716*0e209d39SAndroid Build Coastguard Worker 
717*0e209d39SAndroid Build Coastguard Worker     // Norm16 value thresholds for quick check combinations and types of extra data.
718*0e209d39SAndroid Build Coastguard Worker     uint16_t minYesNo;
719*0e209d39SAndroid Build Coastguard Worker     uint16_t minYesNoMappingsOnly;
720*0e209d39SAndroid Build Coastguard Worker     uint16_t minNoNo;
721*0e209d39SAndroid Build Coastguard Worker     uint16_t minNoNoCompBoundaryBefore;
722*0e209d39SAndroid Build Coastguard Worker     uint16_t minNoNoCompNoMaybeCC;
723*0e209d39SAndroid Build Coastguard Worker     uint16_t minNoNoEmpty;
724*0e209d39SAndroid Build Coastguard Worker     uint16_t limitNoNo;
725*0e209d39SAndroid Build Coastguard Worker     uint16_t centerNoNoDelta;
726*0e209d39SAndroid Build Coastguard Worker     uint16_t minMaybeYes;
727*0e209d39SAndroid Build Coastguard Worker 
728*0e209d39SAndroid Build Coastguard Worker     const UCPTrie *normTrie;
729*0e209d39SAndroid Build Coastguard Worker     const uint16_t *maybeYesCompositions;
730*0e209d39SAndroid Build Coastguard Worker     const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
731*0e209d39SAndroid Build Coastguard Worker     const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
732*0e209d39SAndroid Build Coastguard Worker 
733*0e209d39SAndroid Build Coastguard Worker     UInitOnce       fCanonIterDataInitOnce {};
734*0e209d39SAndroid Build Coastguard Worker     CanonIterData  *fCanonIterData;
735*0e209d39SAndroid Build Coastguard Worker };
736*0e209d39SAndroid Build Coastguard Worker 
737*0e209d39SAndroid Build Coastguard Worker // bits in canonIterData
// Bit layout implied by the constants below (the top flag is bit 31, so a value needs 32 bits):
//   bit 31      CANON_NOT_SEGMENT_STARTER
//   bit 30      CANON_HAS_COMPOSITIONS
//   bit 21      CANON_HAS_SET
//   bits 20..0  CANON_VALUE_MASK
// Bits 29..22 are not named by any constant here.
// NOTE(review): the 21-bit masked value is wide enough for any code point (max U+10FFFF);
// presumably it holds a code point or, with CANON_HAS_SET, an index to a set -- confirm at use sites.
738*0e209d39SAndroid Build Coastguard Worker #define CANON_NOT_SEGMENT_STARTER 0x80000000
739*0e209d39SAndroid Build Coastguard Worker #define CANON_HAS_COMPOSITIONS 0x40000000
740*0e209d39SAndroid Build Coastguard Worker #define CANON_HAS_SET 0x200000
741*0e209d39SAndroid Build Coastguard Worker #define CANON_VALUE_MASK 0x1fffff
742*0e209d39SAndroid Build Coastguard Worker 
743*0e209d39SAndroid Build Coastguard Worker /**
744*0e209d39SAndroid Build Coastguard Worker  * ICU-internal shortcut for quick access to standard Unicode normalization.
 *
 * Every member function is static and the only constructor is deleted, so this class
 * serves purely as a scope for factory functions and is never instantiated.
 * All getters except getImpl() take a UErrorCode reference.
 * NOTE(review): presumably they follow the usual ICU error-code convention
 * (do nothing if errorCode already indicates a failure) -- confirm in the implementation.
 * NOTE(review): ownership and lifetime of the returned pointers are not visible in this
 * header; presumably they refer to shared cached instances that callers must not delete
 * -- confirm.
745*0e209d39SAndroid Build Coastguard Worker  */
746*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API Normalizer2Factory {
747*0e209d39SAndroid Build Coastguard Worker public:
    // Getters for specific Normalizer2 instances, named for FCD, FCC and a no-op normalizer.
748*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
749*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
750*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
751*0e209d39SAndroid Build Coastguard Worker 
    // Getter that picks the Normalizer2 according to the UNormalizationMode argument.
752*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
753*0e209d39SAndroid Build Coastguard Worker 
    // Getters that return the Normalizer2Impl rather than a Normalizer2;
    // presumably backed by the NFC, NFKC and NFKC_Casefold data respectively -- confirm.
754*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
755*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
756*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
757*0e209d39SAndroid Build Coastguard Worker 
758*0e209d39SAndroid Build Coastguard Worker     // Get the Impl instance of the Normalizer2.
759*0e209d39SAndroid Build Coastguard Worker     // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
    // There is no error-code parameter, so a violated precondition cannot be reported here.
760*0e209d39SAndroid Build Coastguard Worker     static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
761*0e209d39SAndroid Build Coastguard Worker private:
762*0e209d39SAndroid Build Coastguard Worker     Normalizer2Factory() = delete;  // No instantiation.
763*0e209d39SAndroid Build Coastguard Worker };
764*0e209d39SAndroid Build Coastguard Worker 
765*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
766*0e209d39SAndroid Build Coastguard Worker 
/**
 * Data-swapping entry point for Normalizer2 data, declared outside the ICU C++ namespace.
 * Takes a UDataSwapper, an input buffer with its length, an output buffer,
 * and a pointer to a UErrorCode; returns an int32_t.
 * NOTE(review): the exact parameter and return semantics are not visible in this header;
 * presumably this follows the generic ICU data-swapping function contract
 * (returns the size of the swapped data, reports failures via *pErrorCode)
 * -- confirm against udataswp.h.
 */
767*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2
768*0e209d39SAndroid Build Coastguard Worker unorm2_swap(const UDataSwapper *ds,
769*0e209d39SAndroid Build Coastguard Worker             const void *inData, int32_t length, void *outData,
770*0e209d39SAndroid Build Coastguard Worker             UErrorCode *pErrorCode);
771*0e209d39SAndroid Build Coastguard Worker 
772*0e209d39SAndroid Build Coastguard Worker /**
773*0e209d39SAndroid Build Coastguard Worker  * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
 * @param c    the code point to look up
 * @param mode the normalization mode whose quick-check property is requested
 * @return the quick-check property value, as a UNormalizationCheckResult
774*0e209d39SAndroid Build Coastguard Worker  * @internal
775*0e209d39SAndroid Build Coastguard Worker  */
776*0e209d39SAndroid Build Coastguard Worker U_CFUNC UNormalizationCheckResult
777*0e209d39SAndroid Build Coastguard Worker unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
778*0e209d39SAndroid Build Coastguard Worker 
779*0e209d39SAndroid Build Coastguard Worker /**
780*0e209d39SAndroid Build Coastguard Worker  * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
 * @param c the code point to look up
 * @return the 16-bit FCD value combining the lead and trail combining classes.
 *         NOTE(review): which byte holds which class is not stated in this header;
 *         presumably the lead class is in the upper byte -- confirm in the implementation.
781*0e209d39SAndroid Build Coastguard Worker  * @internal
782*0e209d39SAndroid Build Coastguard Worker  */
783*0e209d39SAndroid Build Coastguard Worker U_CFUNC uint16_t
784*0e209d39SAndroid Build Coastguard Worker unorm_getFCD16(UChar32 c);
785*0e209d39SAndroid Build Coastguard Worker 
786*0e209d39SAndroid Build Coastguard Worker /**
787*0e209d39SAndroid Build Coastguard Worker  * Format of Normalizer2 .nrm data files.
788*0e209d39SAndroid Build Coastguard Worker  * Format version 4.0.
789*0e209d39SAndroid Build Coastguard Worker  *
790*0e209d39SAndroid Build Coastguard Worker  * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
791*0e209d39SAndroid Build Coastguard Worker  * ICU ships with data files for standard Unicode Normalization Forms
792*0e209d39SAndroid Build Coastguard Worker  * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm),
793*0e209d39SAndroid Build Coastguard Worker  * NFKC_Casefold (nfkc_cf.nrm) and NFKC_Simple_Casefold (nfkc_scf.nrm).
794*0e209d39SAndroid Build Coastguard Worker  * Custom (application-specific) data can be built into additional .nrm files
795*0e209d39SAndroid Build Coastguard Worker  * with the gennorm2 build tool.
796*0e209d39SAndroid Build Coastguard Worker  * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
797*0e209d39SAndroid Build Coastguard Worker  *
798*0e209d39SAndroid Build Coastguard Worker  * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
799*0e209d39SAndroid Build Coastguard Worker  * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
800*0e209d39SAndroid Build Coastguard Worker  *
801*0e209d39SAndroid Build Coastguard Worker  * A .nrm file begins with a standard ICU data file header
802*0e209d39SAndroid Build Coastguard Worker  * (DataHeader, see ucmndata.h and unicode/udata.h).
803*0e209d39SAndroid Build Coastguard Worker  * The UDataInfo.dataVersion field usually contains the Unicode version
804*0e209d39SAndroid Build Coastguard Worker  * for which the data was generated.
805*0e209d39SAndroid Build Coastguard Worker  *
806*0e209d39SAndroid Build Coastguard Worker  * After the header, the file contains the following parts.
807*0e209d39SAndroid Build Coastguard Worker  * Constants are defined as enum values of the Normalizer2Impl class.
808*0e209d39SAndroid Build Coastguard Worker  *
809*0e209d39SAndroid Build Coastguard Worker  * Many details of the data structures are described in the design doc
810*0e209d39SAndroid Build Coastguard Worker  * which is at https://icu.unicode.org/design/normalization/custom
811*0e209d39SAndroid Build Coastguard Worker  *
812*0e209d39SAndroid Build Coastguard Worker  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
813*0e209d39SAndroid Build Coastguard Worker  *
814*0e209d39SAndroid Build Coastguard Worker  *      The first eight indexes are byte offsets in ascending order.
815*0e209d39SAndroid Build Coastguard Worker  *      Each byte offset marks the start of the next part in the data file,
816*0e209d39SAndroid Build Coastguard Worker  *      and the end of the previous one.
817*0e209d39SAndroid Build Coastguard Worker  *      When two consecutive byte offsets are the same, then the corresponding part is empty.
818*0e209d39SAndroid Build Coastguard Worker  *      Byte offsets are offsets from after the header,
819*0e209d39SAndroid Build Coastguard Worker  *      that is, from the beginning of the indexes[].
820*0e209d39SAndroid Build Coastguard Worker  *      Each part starts at an offset with proper alignment for its data.
821*0e209d39SAndroid Build Coastguard Worker  *      If necessary, the previous part may include padding bytes to achieve this alignment.
822*0e209d39SAndroid Build Coastguard Worker  *
823*0e209d39SAndroid Build Coastguard Worker  *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
824*0e209d39SAndroid Build Coastguard Worker  *      with a decomposition mapping, that is, with NF*D_QC=No.
825*0e209d39SAndroid Build Coastguard Worker  *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
826*0e209d39SAndroid Build Coastguard Worker  *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
827*0e209d39SAndroid Build Coastguard Worker  *      minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
828*0e209d39SAndroid Build Coastguard Worker  *      is the lowest code point with lccc!=0.
829*0e209d39SAndroid Build Coastguard Worker  *
830*0e209d39SAndroid Build Coastguard Worker  *      The next eight indexes are thresholds of 16-bit trie values for ranges of
831*0e209d39SAndroid Build Coastguard Worker  *      values indicating multiple normalization properties.
832*0e209d39SAndroid Build Coastguard Worker  *      They are listed here in threshold order, not in the order they are stored in the indexes.
833*0e209d39SAndroid Build Coastguard Worker  *          minYesNo=indexes[IX_MIN_YES_NO];
834*0e209d39SAndroid Build Coastguard Worker  *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
835*0e209d39SAndroid Build Coastguard Worker  *          minNoNo=indexes[IX_MIN_NO_NO];
836*0e209d39SAndroid Build Coastguard Worker  *          minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
837*0e209d39SAndroid Build Coastguard Worker  *          minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
838*0e209d39SAndroid Build Coastguard Worker  *          minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
839*0e209d39SAndroid Build Coastguard Worker  *          limitNoNo=indexes[IX_LIMIT_NO_NO];
840*0e209d39SAndroid Build Coastguard Worker  *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
841*0e209d39SAndroid Build Coastguard Worker  *      See the normTrie description below and the design doc for details.
842*0e209d39SAndroid Build Coastguard Worker  *
843*0e209d39SAndroid Build Coastguard Worker  * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
844*0e209d39SAndroid Build Coastguard Worker  *
845*0e209d39SAndroid Build Coastguard Worker  *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
846*0e209d39SAndroid Build Coastguard Worker  *      Rather than using independent bits in the value (which would require more than 16 bits),
847*0e209d39SAndroid Build Coastguard Worker  *      information is extracted primarily via range checks.
848*0e209d39SAndroid Build Coastguard Worker  *      Except that, since format version 3, bit 0 indicates hasCompBoundaryAfter().
849*0e209d39SAndroid Build Coastguard Worker  *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
850*0e209d39SAndroid Build Coastguard Worker  *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
851*0e209d39SAndroid Build Coastguard Worker  *      which means it has a two-way (round-trip) decomposition mapping.
852*0e209d39SAndroid Build Coastguard Worker  *      Values in the range 2<=norm16<limitNoNo are also direct indexes into the extraData
853*0e209d39SAndroid Build Coastguard Worker  *      pointing to mappings, compositions lists, or both.
854*0e209d39SAndroid Build Coastguard Worker  *      Value norm16==INERT (0 in versions 1 & 2, 1 since version 3)
855*0e209d39SAndroid Build Coastguard Worker  *      means that the character is normalization-inert, that is,
856*0e209d39SAndroid Build Coastguard Worker  *      it does not have a mapping, does not participate in composition, has a zero
857*0e209d39SAndroid Build Coastguard Worker  *      canonical combining class, and forms a boundary where text before it and after it
858*0e209d39SAndroid Build Coastguard Worker  *      can be normalized independently.
859*0e209d39SAndroid Build Coastguard Worker  *      For details about how multiple properties are encoded in 16-bit values
860*0e209d39SAndroid Build Coastguard Worker  *      see the design doc.
861*0e209d39SAndroid Build Coastguard Worker  *      Note that the encoding cannot express all combinations of the properties involved;
862*0e209d39SAndroid Build Coastguard Worker  *      it only supports those combinations that are allowed by
863*0e209d39SAndroid Build Coastguard Worker  *      the Unicode Normalization algorithms. Details are in the design doc as well.
864*0e209d39SAndroid Build Coastguard Worker  *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
865*0e209d39SAndroid Build Coastguard Worker  *
866*0e209d39SAndroid Build Coastguard Worker  *      The trie has a value for each lead surrogate code unit representing the "worst case"
867*0e209d39SAndroid Build Coastguard Worker  *      properties of the 1024 supplementary characters whose UTF-16 form starts with
868*0e209d39SAndroid Build Coastguard Worker  *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
869*0e209d39SAndroid Build Coastguard Worker  *      then their lead surrogate code unit has the trie value INERT.
870*0e209d39SAndroid Build Coastguard Worker  *      When the lead surrogate unit's value exceeds the quick check minimum during processing,
871*0e209d39SAndroid Build Coastguard Worker  *      the properties for the full supplementary code point need to be looked up.
872*0e209d39SAndroid Build Coastguard Worker  *
873*0e209d39SAndroid Build Coastguard Worker  * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
874*0e209d39SAndroid Build Coastguard Worker  * uint16_t extraData[];
875*0e209d39SAndroid Build Coastguard Worker  *
876*0e209d39SAndroid Build Coastguard Worker  *      There is only one byte offset for the end of these two arrays.
877*0e209d39SAndroid Build Coastguard Worker  *      The split between them is given by the constant and variable mentioned above.
878*0e209d39SAndroid Build Coastguard Worker  *      In version 3, the difference must be shifted right by OFFSET_SHIFT.
879*0e209d39SAndroid Build Coastguard Worker  *
880*0e209d39SAndroid Build Coastguard Worker  *      The maybeYesCompositions array contains compositions lists for characters that
881*0e209d39SAndroid Build Coastguard Worker  *      combine both forward (as starters in composition pairs)
882*0e209d39SAndroid Build Coastguard Worker  *      and backward (as trailing characters in composition pairs).
883*0e209d39SAndroid Build Coastguard Worker  *      Such characters do not occur in Unicode 5.2 but are allowed by
884*0e209d39SAndroid Build Coastguard Worker  *      the Unicode Normalization algorithms.
885*0e209d39SAndroid Build Coastguard Worker  *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
886*0e209d39SAndroid Build Coastguard Worker  *      and the maybeYesCompositions array is empty.
887*0e209d39SAndroid Build Coastguard Worker  *      If there are such characters, then minMaybeYes is subtracted from their norm16 values
888*0e209d39SAndroid Build Coastguard Worker  *      to get the index into this array.
889*0e209d39SAndroid Build Coastguard Worker  *
890*0e209d39SAndroid Build Coastguard Worker  *      The extraData array contains compositions lists for "YesYes" characters,
891*0e209d39SAndroid Build Coastguard Worker  *      followed by mappings and optional compositions lists for "YesNo" characters,
892*0e209d39SAndroid Build Coastguard Worker  *      followed by only mappings for "NoNo" characters.
893*0e209d39SAndroid Build Coastguard Worker  *      (Referring to pairs of NFC/NFD quick check values.)
894*0e209d39SAndroid Build Coastguard Worker  *      The norm16 values of those characters are direct indexes into the extraData array.
895*0e209d39SAndroid Build Coastguard Worker  *      In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
896*0e209d39SAndroid Build Coastguard Worker  *      for accessing extraData.
897*0e209d39SAndroid Build Coastguard Worker  *
898*0e209d39SAndroid Build Coastguard Worker  *      The data structures for compositions lists and mappings are described in the design doc.
899*0e209d39SAndroid Build Coastguard Worker  *
900*0e209d39SAndroid Build Coastguard Worker  * uint8_t smallFCD[0x100]; -- new in format version 2
901*0e209d39SAndroid Build Coastguard Worker  *
902*0e209d39SAndroid Build Coastguard Worker  *      This is a bit set to help speed up FCD value lookups in the absence of a full
903*0e209d39SAndroid Build Coastguard Worker  *      UTrie2 or other large data structure with the full FCD value mapping.
904*0e209d39SAndroid Build Coastguard Worker  *
905*0e209d39SAndroid Build Coastguard Worker  *      Each smallFCD bit is set if any of the corresponding 32 BMP code points
906*0e209d39SAndroid Build Coastguard Worker  *      has a non-zero FCD value (lccc!=0 or tccc!=0).
907*0e209d39SAndroid Build Coastguard Worker  *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
908*0e209d39SAndroid Build Coastguard Worker  *      A bit for 32 lead surrogates is set if any of the 32k corresponding
909*0e209d39SAndroid Build Coastguard Worker  *      _supplementary_ code points has a non-zero FCD value.
910*0e209d39SAndroid Build Coastguard Worker  *
911*0e209d39SAndroid Build Coastguard Worker  *      This bit set is most useful for the large blocks of CJK characters with FCD=0.
912*0e209d39SAndroid Build Coastguard Worker  *
913*0e209d39SAndroid Build Coastguard Worker  * Changes from format version 1 to format version 2 ---------------------------
914*0e209d39SAndroid Build Coastguard Worker  *
915*0e209d39SAndroid Build Coastguard Worker  * - Addition of data for raw (not recursively decomposed) mappings.
916*0e209d39SAndroid Build Coastguard Worker  *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
917*0e209d39SAndroid Build Coastguard Worker  *     the mapping is to an empty string or when the character combines-forward.
918*0e209d39SAndroid Build Coastguard Worker  *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
919*0e209d39SAndroid Build Coastguard Worker  *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
920*0e209d39SAndroid Build Coastguard Worker  *   + For details see the design doc.
921*0e209d39SAndroid Build Coastguard Worker  * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
922*0e209d39SAndroid Build Coastguard Worker  *   distinct ranges (combines-forward vs. not)
923*0e209d39SAndroid Build Coastguard Worker  *   so that a range check can be used to find out if there is a compositions list.
924*0e209d39SAndroid Build Coastguard Worker  *   This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
925*0e209d39SAndroid Build Coastguard Worker  *   It is needed for the new (in ICU 49) composePair(), not for other normalization.
926*0e209d39SAndroid Build Coastguard Worker  * - Addition of the smallFCD[] bit set.
927*0e209d39SAndroid Build Coastguard Worker  *
928*0e209d39SAndroid Build Coastguard Worker  * Changes from format version 2 to format version 3 (ICU 60) ------------------
929*0e209d39SAndroid Build Coastguard Worker  *
930*0e209d39SAndroid Build Coastguard Worker  * - norm16 bit 0 indicates hasCompBoundaryAfter(),
931*0e209d39SAndroid Build Coastguard Worker  *   except that for contiguous composition (FCC) the tccc must be checked as well.
932*0e209d39SAndroid Build Coastguard Worker  *   Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
933*0e209d39SAndroid Build Coastguard Worker  *   Thresholds like minNoNo are tested before shifting.
934*0e209d39SAndroid Build Coastguard Worker  *
935*0e209d39SAndroid Build Coastguard Worker  * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
936*0e209d39SAndroid Build Coastguard Worker  *   to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
937*0e209d39SAndroid Build Coastguard Worker  *   See DELTA_TCCC_MASK etc.
938*0e209d39SAndroid Build Coastguard Worker  *   This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
939*0e209d39SAndroid Build Coastguard Worker  *   minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
940*0e209d39SAndroid Build Coastguard Worker  *
941*0e209d39SAndroid Build Coastguard Worker  * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
942*0e209d39SAndroid Build Coastguard Worker  *   and ASCII characters are mapped algorithmically only to other ASCII characters.
943*0e209d39SAndroid Build Coastguard Worker  *   This helps with hasCompBoundaryBefore() and compose() fast paths.
944*0e209d39SAndroid Build Coastguard Worker  *   It is no longer necessary to loop for algorithmic mappings.
945*0e209d39SAndroid Build Coastguard Worker  *
946*0e209d39SAndroid Build Coastguard Worker  * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
947*0e209d39SAndroid Build Coastguard Worker  *   indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
948*0e209d39SAndroid Build Coastguard Worker  *   and separation of the noNo extraData into distinct ranges.
949*0e209d39SAndroid Build Coastguard Worker  *   With this, the noNo norm16 value indicates whether the mapping is
950*0e209d39SAndroid Build Coastguard Worker  *   compose-normalized, not normalized but hasCompBoundaryBefore(),
951*0e209d39SAndroid Build Coastguard Worker  *   not even that, or maps to an empty string.
952*0e209d39SAndroid Build Coastguard Worker  *   hasCompBoundaryBefore() can be determined solely from the norm16 value.
953*0e209d39SAndroid Build Coastguard Worker  *
954*0e209d39SAndroid Build Coastguard Worker  * - The norm16 value for Hangul LVT is now different from that for Hangul LV,
955*0e209d39SAndroid Build Coastguard Worker  *   so that hasCompBoundaryAfter() need not check for the syllable type.
956*0e209d39SAndroid Build Coastguard Worker  *   For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
957*0e209d39SAndroid Build Coastguard Worker  *   For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
958*0e209d39SAndroid Build Coastguard Worker  *   The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
959*0e209d39SAndroid Build Coastguard Worker  *   to simplify some code.
960*0e209d39SAndroid Build Coastguard Worker  *
961*0e209d39SAndroid Build Coastguard Worker  * - The extraData firstUnit bit 5 is no longer necessary
962*0e209d39SAndroid Build Coastguard Worker  *   (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
963*0e209d39SAndroid Build Coastguard Worker  *   is reserved again, and always set to 0.
964*0e209d39SAndroid Build Coastguard Worker  *
965*0e209d39SAndroid Build Coastguard Worker  * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
966*0e209d39SAndroid Build Coastguard Worker  *   This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
967*0e209d39SAndroid Build Coastguard Worker  *   U+00AD Soft Hyphen maps to an empty string,
968*0e209d39SAndroid Build Coastguard Worker  *   which is artificially assigned "worst case" values lccc=1 and tccc=255.
969*0e209d39SAndroid Build Coastguard Worker  *
970*0e209d39SAndroid Build Coastguard Worker  * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
971*0e209d39SAndroid Build Coastguard Worker  *
972*0e209d39SAndroid Build Coastguard Worker  * Changes from format version 3 to format version 4 (ICU 63) ------------------
973*0e209d39SAndroid Build Coastguard Worker  *
974*0e209d39SAndroid Build Coastguard Worker  * Switched from UTrie2 to UCPTrie/CodePointTrie.
975*0e209d39SAndroid Build Coastguard Worker  *
976*0e209d39SAndroid Build Coastguard Worker  * The new trie no longer stores different values for surrogate code *units* vs.
977*0e209d39SAndroid Build Coastguard Worker  * surrogate code *points*.
978*0e209d39SAndroid Build Coastguard Worker  * Lead surrogates still have values for optimized UTF-16 string processing.
979*0e209d39SAndroid Build Coastguard Worker  * When looking up code point properties, the code now checks for lead surrogates and
980*0e209d39SAndroid Build Coastguard Worker  * treats them as inert.
981*0e209d39SAndroid Build Coastguard Worker  *
982*0e209d39SAndroid Build Coastguard Worker  * gennorm2 now has to reject mappings for surrogate code points.
983*0e209d39SAndroid Build Coastguard Worker  * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
984*0e209d39SAndroid Build Coastguard Worker  * custom normalization data file.
985*0e209d39SAndroid Build Coastguard Worker  */
986*0e209d39SAndroid Build Coastguard Worker 
987*0e209d39SAndroid Build Coastguard Worker #endif  /* !UCONFIG_NO_NORMALIZATION */
988*0e209d39SAndroid Build Coastguard Worker #endif  /* __NORMALIZER2IMPL_H__ */
989