xref: /aosp_15_r20/external/icu/libicu/cts_headers/collationdata.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2010-2015, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker * Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
8*0e209d39SAndroid Build Coastguard Worker * collationdata.h
9*0e209d39SAndroid Build Coastguard Worker *
10*0e209d39SAndroid Build Coastguard Worker * created on: 2010oct27
11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer
12*0e209d39SAndroid Build Coastguard Worker */
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #ifndef __COLLATIONDATA_H__
15*0e209d39SAndroid Build Coastguard Worker #define __COLLATIONDATA_H__
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucol.h"
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
23*0e209d39SAndroid Build Coastguard Worker #include "collation.h"
24*0e209d39SAndroid Build Coastguard Worker #include "normalizer2impl.h"
25*0e209d39SAndroid Build Coastguard Worker #include "utrie2.h"
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker struct UDataMemory;
28*0e209d39SAndroid Build Coastguard Worker 
29*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
30*0e209d39SAndroid Build Coastguard Worker 
31*0e209d39SAndroid Build Coastguard Worker class UVector32;
32*0e209d39SAndroid Build Coastguard Worker 
33*0e209d39SAndroid Build Coastguard Worker /**
34*0e209d39SAndroid Build Coastguard Worker  * Collation data container.
35*0e209d39SAndroid Build Coastguard Worker  * Immutable data created by a CollationDataBuilder, or loaded from a file,
36*0e209d39SAndroid Build Coastguard Worker  * or deserialized from API-provided binary data.
37*0e209d39SAndroid Build Coastguard Worker  *
38*0e209d39SAndroid Build Coastguard Worker  * Includes data for the collation base (root/default), aliased if this is not the base.
39*0e209d39SAndroid Build Coastguard Worker  */
40*0e209d39SAndroid Build Coastguard Worker struct U_I18N_API CollationData : public UMemory {
41*0e209d39SAndroid Build Coastguard Worker     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42*0e209d39SAndroid Build Coastguard Worker     // parallel with the ranges, and resetting ranges that are indexed.
43*0e209d39SAndroid Build Coastguard Worker     // The reordering builder code could clone the resulting template array.
44*0e209d39SAndroid Build Coastguard Worker     static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14;
45*0e209d39SAndroid Build Coastguard Worker     static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1;
46*0e209d39SAndroid Build Coastguard Worker 
47*0e209d39SAndroid Build Coastguard Worker     static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8;
48*0e209d39SAndroid Build Coastguard Worker     /** C++ only, data reader check scriptStartsLength. */
49*0e209d39SAndroid Build Coastguard Worker     static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256;
50*0e209d39SAndroid Build Coastguard Worker 
CollationDataCollationData51*0e209d39SAndroid Build Coastguard Worker     CollationData(const Normalizer2Impl &nfc)
52*0e209d39SAndroid Build Coastguard Worker             : trie(nullptr),
53*0e209d39SAndroid Build Coastguard Worker               ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr),
54*0e209d39SAndroid Build Coastguard Worker               jamoCE32s(nullptr),
55*0e209d39SAndroid Build Coastguard Worker               nfcImpl(nfc),
56*0e209d39SAndroid Build Coastguard Worker               numericPrimary(0x12000000),
57*0e209d39SAndroid Build Coastguard Worker               ce32sLength(0), cesLength(0), contextsLength(0),
58*0e209d39SAndroid Build Coastguard Worker               compressibleBytes(nullptr),
59*0e209d39SAndroid Build Coastguard Worker               unsafeBackwardSet(nullptr),
60*0e209d39SAndroid Build Coastguard Worker               fastLatinTable(nullptr), fastLatinTableLength(0),
61*0e209d39SAndroid Build Coastguard Worker               numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0),
62*0e209d39SAndroid Build Coastguard Worker               rootElements(nullptr), rootElementsLength(0) {}
63*0e209d39SAndroid Build Coastguard Worker 
getCE32CollationData64*0e209d39SAndroid Build Coastguard Worker     uint32_t getCE32(UChar32 c) const {
65*0e209d39SAndroid Build Coastguard Worker         return UTRIE2_GET32(trie, c);
66*0e209d39SAndroid Build Coastguard Worker     }
67*0e209d39SAndroid Build Coastguard Worker 
getCE32FromSupplementaryCollationData68*0e209d39SAndroid Build Coastguard Worker     uint32_t getCE32FromSupplementary(UChar32 c) const {
69*0e209d39SAndroid Build Coastguard Worker         return UTRIE2_GET32_FROM_SUPP(trie, c);
70*0e209d39SAndroid Build Coastguard Worker     }
71*0e209d39SAndroid Build Coastguard Worker 
isDigitCollationData72*0e209d39SAndroid Build Coastguard Worker     UBool isDigit(UChar32 c) const {
73*0e209d39SAndroid Build Coastguard Worker         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
74*0e209d39SAndroid Build Coastguard Worker                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
75*0e209d39SAndroid Build Coastguard Worker     }
76*0e209d39SAndroid Build Coastguard Worker 
isUnsafeBackwardCollationData77*0e209d39SAndroid Build Coastguard Worker     UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
78*0e209d39SAndroid Build Coastguard Worker         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
79*0e209d39SAndroid Build Coastguard Worker     }
80*0e209d39SAndroid Build Coastguard Worker 
isCompressibleLeadByteCollationData81*0e209d39SAndroid Build Coastguard Worker     UBool isCompressibleLeadByte(uint32_t b) const {
82*0e209d39SAndroid Build Coastguard Worker         return compressibleBytes[b];
83*0e209d39SAndroid Build Coastguard Worker     }
84*0e209d39SAndroid Build Coastguard Worker 
isCompressiblePrimaryCollationData85*0e209d39SAndroid Build Coastguard Worker     inline UBool isCompressiblePrimary(uint32_t p) const {
86*0e209d39SAndroid Build Coastguard Worker         return isCompressibleLeadByte(p >> 24);
87*0e209d39SAndroid Build Coastguard Worker     }
88*0e209d39SAndroid Build Coastguard Worker 
89*0e209d39SAndroid Build Coastguard Worker     /**
90*0e209d39SAndroid Build Coastguard Worker      * Returns the CE32 from two contexts words.
91*0e209d39SAndroid Build Coastguard Worker      * Access to the defaultCE32 for contraction and prefix matching.
92*0e209d39SAndroid Build Coastguard Worker      */
readCE32CollationData93*0e209d39SAndroid Build Coastguard Worker     static uint32_t readCE32(const char16_t *p) {
94*0e209d39SAndroid Build Coastguard Worker         return ((uint32_t)p[0] << 16) | p[1];
95*0e209d39SAndroid Build Coastguard Worker     }
96*0e209d39SAndroid Build Coastguard Worker 
97*0e209d39SAndroid Build Coastguard Worker     /**
98*0e209d39SAndroid Build Coastguard Worker      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
99*0e209d39SAndroid Build Coastguard Worker      * Requires that ce32 is special.
100*0e209d39SAndroid Build Coastguard Worker      */
101*0e209d39SAndroid Build Coastguard Worker     uint32_t getIndirectCE32(uint32_t ce32) const;
102*0e209d39SAndroid Build Coastguard Worker     /**
103*0e209d39SAndroid Build Coastguard Worker      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
104*0e209d39SAndroid Build Coastguard Worker      * if ce32 is special.
105*0e209d39SAndroid Build Coastguard Worker      */
106*0e209d39SAndroid Build Coastguard Worker     uint32_t getFinalCE32(uint32_t ce32) const;
107*0e209d39SAndroid Build Coastguard Worker 
108*0e209d39SAndroid Build Coastguard Worker     /**
109*0e209d39SAndroid Build Coastguard Worker      * Computes a CE from c's ce32 which has the OFFSET_TAG.
110*0e209d39SAndroid Build Coastguard Worker      */
getCEFromOffsetCE32CollationData111*0e209d39SAndroid Build Coastguard Worker     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
112*0e209d39SAndroid Build Coastguard Worker         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
113*0e209d39SAndroid Build Coastguard Worker         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
114*0e209d39SAndroid Build Coastguard Worker     }
115*0e209d39SAndroid Build Coastguard Worker 
116*0e209d39SAndroid Build Coastguard Worker     /**
117*0e209d39SAndroid Build Coastguard Worker      * Returns the single CE that c maps to.
118*0e209d39SAndroid Build Coastguard Worker      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
119*0e209d39SAndroid Build Coastguard Worker      */
120*0e209d39SAndroid Build Coastguard Worker     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
121*0e209d39SAndroid Build Coastguard Worker 
122*0e209d39SAndroid Build Coastguard Worker     /**
123*0e209d39SAndroid Build Coastguard Worker      * Returns the FCD16 value for code point c. c must be >= 0.
124*0e209d39SAndroid Build Coastguard Worker      */
getFCD16CollationData125*0e209d39SAndroid Build Coastguard Worker     uint16_t getFCD16(UChar32 c) const {
126*0e209d39SAndroid Build Coastguard Worker         return nfcImpl.getFCD16(c);
127*0e209d39SAndroid Build Coastguard Worker     }
128*0e209d39SAndroid Build Coastguard Worker 
129*0e209d39SAndroid Build Coastguard Worker     /**
130*0e209d39SAndroid Build Coastguard Worker      * Returns the first primary for the script's reordering group.
131*0e209d39SAndroid Build Coastguard Worker      * @return the primary with only the first primary lead byte of the group
132*0e209d39SAndroid Build Coastguard Worker      *         (not necessarily an actual root collator primary weight),
133*0e209d39SAndroid Build Coastguard Worker      *         or 0 if the script is unknown
134*0e209d39SAndroid Build Coastguard Worker      */
135*0e209d39SAndroid Build Coastguard Worker     uint32_t getFirstPrimaryForGroup(int32_t script) const;
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker     /**
138*0e209d39SAndroid Build Coastguard Worker      * Returns the last primary for the script's reordering group.
139*0e209d39SAndroid Build Coastguard Worker      * @return the last primary of the group
140*0e209d39SAndroid Build Coastguard Worker      *         (not an actual root collator primary weight),
141*0e209d39SAndroid Build Coastguard Worker      *         or 0 if the script is unknown
142*0e209d39SAndroid Build Coastguard Worker      */
143*0e209d39SAndroid Build Coastguard Worker     uint32_t getLastPrimaryForGroup(int32_t script) const;
144*0e209d39SAndroid Build Coastguard Worker 
145*0e209d39SAndroid Build Coastguard Worker     /**
146*0e209d39SAndroid Build Coastguard Worker      * Finds the reordering group which contains the primary weight.
147*0e209d39SAndroid Build Coastguard Worker      * @return the first script of the group, or -1 if the weight is beyond the last group
148*0e209d39SAndroid Build Coastguard Worker      */
149*0e209d39SAndroid Build Coastguard Worker     int32_t getGroupForPrimary(uint32_t p) const;
150*0e209d39SAndroid Build Coastguard Worker 
151*0e209d39SAndroid Build Coastguard Worker     int32_t getEquivalentScripts(int32_t script,
152*0e209d39SAndroid Build Coastguard Worker                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
153*0e209d39SAndroid Build Coastguard Worker 
154*0e209d39SAndroid Build Coastguard Worker     /**
155*0e209d39SAndroid Build Coastguard Worker      * Writes the permutation of primary-weight ranges
156*0e209d39SAndroid Build Coastguard Worker      * for the given reordering of scripts and groups.
157*0e209d39SAndroid Build Coastguard Worker      * The caller checks for illegal arguments and
158*0e209d39SAndroid Build Coastguard Worker      * takes care of [DEFAULT] and memory allocation.
159*0e209d39SAndroid Build Coastguard Worker      *
160*0e209d39SAndroid Build Coastguard Worker      * Each list element will be a (limit, offset) pair as described
161*0e209d39SAndroid Build Coastguard Worker      * for the CollationSettings::reorderRanges.
162*0e209d39SAndroid Build Coastguard Worker      * The list will be empty if no ranges are reordered.
163*0e209d39SAndroid Build Coastguard Worker      */
164*0e209d39SAndroid Build Coastguard Worker     void makeReorderRanges(const int32_t *reorder, int32_t length,
165*0e209d39SAndroid Build Coastguard Worker                            UVector32 &ranges, UErrorCode &errorCode) const;
166*0e209d39SAndroid Build Coastguard Worker 
167*0e209d39SAndroid Build Coastguard Worker     /** @see jamoCE32s */
168*0e209d39SAndroid Build Coastguard Worker     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
169*0e209d39SAndroid Build Coastguard Worker 
170*0e209d39SAndroid Build Coastguard Worker     /** Main lookup trie. */
171*0e209d39SAndroid Build Coastguard Worker     const UTrie2 *trie;
172*0e209d39SAndroid Build Coastguard Worker     /**
173*0e209d39SAndroid Build Coastguard Worker      * Array of CE32 values.
174*0e209d39SAndroid Build Coastguard Worker      * At index 0 there must be CE32(U+0000)
175*0e209d39SAndroid Build Coastguard Worker      * to support U+0000's special-tag for NUL-termination handling.
176*0e209d39SAndroid Build Coastguard Worker      */
177*0e209d39SAndroid Build Coastguard Worker     const uint32_t *ce32s;
178*0e209d39SAndroid Build Coastguard Worker     /** Array of CE values for expansions and OFFSET_TAG. */
179*0e209d39SAndroid Build Coastguard Worker     const int64_t *ces;
180*0e209d39SAndroid Build Coastguard Worker     /** Array of prefix and contraction-suffix matching data. */
181*0e209d39SAndroid Build Coastguard Worker     const char16_t *contexts;
182*0e209d39SAndroid Build Coastguard Worker     /** Base collation data, or nullptr if this data itself is a base. */
183*0e209d39SAndroid Build Coastguard Worker     const CollationData *base;
184*0e209d39SAndroid Build Coastguard Worker     /**
185*0e209d39SAndroid Build Coastguard Worker      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
186*0e209d39SAndroid Build Coastguard Worker      * They are normally simple CE32s, rarely expansions.
187*0e209d39SAndroid Build Coastguard Worker      * For fast handling of HANGUL_TAG.
188*0e209d39SAndroid Build Coastguard Worker      */
189*0e209d39SAndroid Build Coastguard Worker     const uint32_t *jamoCE32s;
190*0e209d39SAndroid Build Coastguard Worker     const Normalizer2Impl &nfcImpl;
191*0e209d39SAndroid Build Coastguard Worker     /** The single-byte primary weight (xx000000) for numeric collation. */
192*0e209d39SAndroid Build Coastguard Worker     uint32_t numericPrimary;
193*0e209d39SAndroid Build Coastguard Worker 
194*0e209d39SAndroid Build Coastguard Worker     int32_t ce32sLength;
195*0e209d39SAndroid Build Coastguard Worker     int32_t cesLength;
196*0e209d39SAndroid Build Coastguard Worker     int32_t contextsLength;
197*0e209d39SAndroid Build Coastguard Worker 
198*0e209d39SAndroid Build Coastguard Worker     /** 256 flags for which primary-weight lead bytes are compressible. */
199*0e209d39SAndroid Build Coastguard Worker     const UBool *compressibleBytes;
200*0e209d39SAndroid Build Coastguard Worker     /**
201*0e209d39SAndroid Build Coastguard Worker      * Set of code points that are unsafe for starting string comparison after an identical prefix,
202*0e209d39SAndroid Build Coastguard Worker      * or in backwards CE iteration.
203*0e209d39SAndroid Build Coastguard Worker      */
204*0e209d39SAndroid Build Coastguard Worker     const UnicodeSet *unsafeBackwardSet;
205*0e209d39SAndroid Build Coastguard Worker 
206*0e209d39SAndroid Build Coastguard Worker     /**
207*0e209d39SAndroid Build Coastguard Worker      * Fast Latin table for common-Latin-text string comparisons.
208*0e209d39SAndroid Build Coastguard Worker      * Data structure see class CollationFastLatin.
209*0e209d39SAndroid Build Coastguard Worker      */
210*0e209d39SAndroid Build Coastguard Worker     const uint16_t *fastLatinTable;
211*0e209d39SAndroid Build Coastguard Worker     int32_t fastLatinTableLength;
212*0e209d39SAndroid Build Coastguard Worker 
213*0e209d39SAndroid Build Coastguard Worker     /**
214*0e209d39SAndroid Build Coastguard Worker      * Data for scripts and reordering groups.
215*0e209d39SAndroid Build Coastguard Worker      * Uses include building a reordering permutation table and
216*0e209d39SAndroid Build Coastguard Worker      * providing script boundaries to AlphabeticIndex.
217*0e209d39SAndroid Build Coastguard Worker      */
218*0e209d39SAndroid Build Coastguard Worker     int32_t numScripts;
219*0e209d39SAndroid Build Coastguard Worker     /**
220*0e209d39SAndroid Build Coastguard Worker      * The length of scriptsIndex is numScripts+16.
221*0e209d39SAndroid Build Coastguard Worker      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
222*0e209d39SAndroid Build Coastguard Worker      * 16 special reorder codes (not all used) are mapped starting at numScripts.
223*0e209d39SAndroid Build Coastguard Worker      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
224*0e209d39SAndroid Build Coastguard Worker      * There are special codes at the end for reorder-reserved primary ranges.
225*0e209d39SAndroid Build Coastguard Worker      *
226*0e209d39SAndroid Build Coastguard Worker      * Multiple scripts may share a range and index, for example Hira & Kana.
227*0e209d39SAndroid Build Coastguard Worker      */
228*0e209d39SAndroid Build Coastguard Worker     const uint16_t *scriptsIndex;
229*0e209d39SAndroid Build Coastguard Worker     /**
230*0e209d39SAndroid Build Coastguard Worker      * Start primary weight (top 16 bits only) for a group/script/reserved range
231*0e209d39SAndroid Build Coastguard Worker      * indexed by scriptsIndex.
232*0e209d39SAndroid Build Coastguard Worker      * The first range (separators & terminators) and the last range (trailing weights)
233*0e209d39SAndroid Build Coastguard Worker      * are not reorderable, and no scriptsIndex entry points to them.
234*0e209d39SAndroid Build Coastguard Worker      */
235*0e209d39SAndroid Build Coastguard Worker     const uint16_t *scriptStarts;
236*0e209d39SAndroid Build Coastguard Worker     int32_t scriptStartsLength;
237*0e209d39SAndroid Build Coastguard Worker 
238*0e209d39SAndroid Build Coastguard Worker     /**
239*0e209d39SAndroid Build Coastguard Worker      * Collation elements in the root collator.
240*0e209d39SAndroid Build Coastguard Worker      * Used by the CollationRootElements class. The data structure is described there.
241*0e209d39SAndroid Build Coastguard Worker      * nullptr in a tailoring.
242*0e209d39SAndroid Build Coastguard Worker      */
243*0e209d39SAndroid Build Coastguard Worker     const uint32_t *rootElements;
244*0e209d39SAndroid Build Coastguard Worker     int32_t rootElementsLength;
245*0e209d39SAndroid Build Coastguard Worker 
246*0e209d39SAndroid Build Coastguard Worker private:
247*0e209d39SAndroid Build Coastguard Worker     int32_t getScriptIndex(int32_t script) const;
248*0e209d39SAndroid Build Coastguard Worker     void makeReorderRanges(const int32_t *reorder, int32_t length,
249*0e209d39SAndroid Build Coastguard Worker                            UBool latinMustMove,
250*0e209d39SAndroid Build Coastguard Worker                            UVector32 &ranges, UErrorCode &errorCode) const;
251*0e209d39SAndroid Build Coastguard Worker     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
252*0e209d39SAndroid Build Coastguard Worker     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
253*0e209d39SAndroid Build Coastguard Worker };
254*0e209d39SAndroid Build Coastguard Worker 
255*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
256*0e209d39SAndroid Build Coastguard Worker 
257*0e209d39SAndroid Build Coastguard Worker #endif  // !UCONFIG_NO_COLLATION
258*0e209d39SAndroid Build Coastguard Worker #endif  // __COLLATIONDATA_H__
259