xref: /aosp_15_r20/external/icu/libicu/cts_headers/collationdata.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdata.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/
13 
14 #ifndef __COLLATIONDATA_H__
15 #define __COLLATIONDATA_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/ucol.h"
22 #include "unicode/uniset.h"
23 #include "collation.h"
24 #include "normalizer2impl.h"
25 #include "utrie2.h"
26 
27 struct UDataMemory;
28 
29 U_NAMESPACE_BEGIN
30 
31 class UVector32;
32 
33 /**
34  * Collation data container.
35  * Immutable data created by a CollationDataBuilder, or loaded from a file,
36  * or deserialized from API-provided binary data.
37  *
38  * Includes data for the collation base (root/default), aliased if this is not the base.
39  */
40 struct U_I18N_API CollationData : public UMemory {
41     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42     // parallel with the ranges, and resetting ranges that are indexed.
43     // The reordering builder code could clone the resulting template array.
44     static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14;
45     static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1;
46 
47     static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8;
48     /** C++ only, data reader check scriptStartsLength. */
49     static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256;
50 
CollationDataCollationData51     CollationData(const Normalizer2Impl &nfc)
52             : trie(nullptr),
53               ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr),
54               jamoCE32s(nullptr),
55               nfcImpl(nfc),
56               numericPrimary(0x12000000),
57               ce32sLength(0), cesLength(0), contextsLength(0),
58               compressibleBytes(nullptr),
59               unsafeBackwardSet(nullptr),
60               fastLatinTable(nullptr), fastLatinTableLength(0),
61               numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0),
62               rootElements(nullptr), rootElementsLength(0) {}
63 
getCE32CollationData64     uint32_t getCE32(UChar32 c) const {
65         return UTRIE2_GET32(trie, c);
66     }
67 
getCE32FromSupplementaryCollationData68     uint32_t getCE32FromSupplementary(UChar32 c) const {
69         return UTRIE2_GET32_FROM_SUPP(trie, c);
70     }
71 
isDigitCollationData72     UBool isDigit(UChar32 c) const {
73         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
74                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
75     }
76 
isUnsafeBackwardCollationData77     UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
78         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
79     }
80 
isCompressibleLeadByteCollationData81     UBool isCompressibleLeadByte(uint32_t b) const {
82         return compressibleBytes[b];
83     }
84 
isCompressiblePrimaryCollationData85     inline UBool isCompressiblePrimary(uint32_t p) const {
86         return isCompressibleLeadByte(p >> 24);
87     }
88 
89     /**
90      * Returns the CE32 from two contexts words.
91      * Access to the defaultCE32 for contraction and prefix matching.
92      */
readCE32CollationData93     static uint32_t readCE32(const char16_t *p) {
94         return ((uint32_t)p[0] << 16) | p[1];
95     }
96 
97     /**
98      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
99      * Requires that ce32 is special.
100      */
101     uint32_t getIndirectCE32(uint32_t ce32) const;
102     /**
103      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
104      * if ce32 is special.
105      */
106     uint32_t getFinalCE32(uint32_t ce32) const;
107 
108     /**
109      * Computes a CE from c's ce32 which has the OFFSET_TAG.
110      */
getCEFromOffsetCE32CollationData111     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
112         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
113         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
114     }
115 
116     /**
117      * Returns the single CE that c maps to.
118      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
119      */
120     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
121 
122     /**
123      * Returns the FCD16 value for code point c. c must be >= 0.
124      */
getFCD16CollationData125     uint16_t getFCD16(UChar32 c) const {
126         return nfcImpl.getFCD16(c);
127     }
128 
129     /**
130      * Returns the first primary for the script's reordering group.
131      * @return the primary with only the first primary lead byte of the group
132      *         (not necessarily an actual root collator primary weight),
133      *         or 0 if the script is unknown
134      */
135     uint32_t getFirstPrimaryForGroup(int32_t script) const;
136 
137     /**
138      * Returns the last primary for the script's reordering group.
139      * @return the last primary of the group
140      *         (not an actual root collator primary weight),
141      *         or 0 if the script is unknown
142      */
143     uint32_t getLastPrimaryForGroup(int32_t script) const;
144 
145     /**
146      * Finds the reordering group which contains the primary weight.
147      * @return the first script of the group, or -1 if the weight is beyond the last group
148      */
149     int32_t getGroupForPrimary(uint32_t p) const;
150 
151     int32_t getEquivalentScripts(int32_t script,
152                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
153 
154     /**
155      * Writes the permutation of primary-weight ranges
156      * for the given reordering of scripts and groups.
157      * The caller checks for illegal arguments and
158      * takes care of [DEFAULT] and memory allocation.
159      *
160      * Each list element will be a (limit, offset) pair as described
161      * for the CollationSettings::reorderRanges.
162      * The list will be empty if no ranges are reordered.
163      */
164     void makeReorderRanges(const int32_t *reorder, int32_t length,
165                            UVector32 &ranges, UErrorCode &errorCode) const;
166 
167     /** @see jamoCE32s */
168     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
169 
170     /** Main lookup trie. */
171     const UTrie2 *trie;
172     /**
173      * Array of CE32 values.
174      * At index 0 there must be CE32(U+0000)
175      * to support U+0000's special-tag for NUL-termination handling.
176      */
177     const uint32_t *ce32s;
178     /** Array of CE values for expansions and OFFSET_TAG. */
179     const int64_t *ces;
180     /** Array of prefix and contraction-suffix matching data. */
181     const char16_t *contexts;
182     /** Base collation data, or nullptr if this data itself is a base. */
183     const CollationData *base;
184     /**
185      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
186      * They are normally simple CE32s, rarely expansions.
187      * For fast handling of HANGUL_TAG.
188      */
189     const uint32_t *jamoCE32s;
190     const Normalizer2Impl &nfcImpl;
191     /** The single-byte primary weight (xx000000) for numeric collation. */
192     uint32_t numericPrimary;
193 
194     int32_t ce32sLength;
195     int32_t cesLength;
196     int32_t contextsLength;
197 
198     /** 256 flags for which primary-weight lead bytes are compressible. */
199     const UBool *compressibleBytes;
200     /**
201      * Set of code points that are unsafe for starting string comparison after an identical prefix,
202      * or in backwards CE iteration.
203      */
204     const UnicodeSet *unsafeBackwardSet;
205 
206     /**
207      * Fast Latin table for common-Latin-text string comparisons.
208      * Data structure see class CollationFastLatin.
209      */
210     const uint16_t *fastLatinTable;
211     int32_t fastLatinTableLength;
212 
213     /**
214      * Data for scripts and reordering groups.
215      * Uses include building a reordering permutation table and
216      * providing script boundaries to AlphabeticIndex.
217      */
218     int32_t numScripts;
219     /**
220      * The length of scriptsIndex is numScripts+16.
221      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
222      * 16 special reorder codes (not all used) are mapped starting at numScripts.
223      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
224      * There are special codes at the end for reorder-reserved primary ranges.
225      *
226      * Multiple scripts may share a range and index, for example Hira & Kana.
227      */
228     const uint16_t *scriptsIndex;
229     /**
230      * Start primary weight (top 16 bits only) for a group/script/reserved range
231      * indexed by scriptsIndex.
232      * The first range (separators & terminators) and the last range (trailing weights)
233      * are not reorderable, and no scriptsIndex entry points to them.
234      */
235     const uint16_t *scriptStarts;
236     int32_t scriptStartsLength;
237 
238     /**
239      * Collation elements in the root collator.
240      * Used by the CollationRootElements class. The data structure is described there.
241      * nullptr in a tailoring.
242      */
243     const uint32_t *rootElements;
244     int32_t rootElementsLength;
245 
246 private:
247     int32_t getScriptIndex(int32_t script) const;
248     void makeReorderRanges(const int32_t *reorder, int32_t length,
249                            UBool latinMustMove,
250                            UVector32 &ranges, UErrorCode &errorCode) const;
251     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
252     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
253 };
254 
255 U_NAMESPACE_END
256 
257 #endif  // !UCONFIG_NO_COLLATION
258 #endif  // __COLLATIONDATA_H__
259