xref: /aosp_15_r20/external/icu/libicu/cts_headers/collationdatareader.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2013-2015, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker * Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
8*0e209d39SAndroid Build Coastguard Worker * collationdatareader.h
9*0e209d39SAndroid Build Coastguard Worker *
10*0e209d39SAndroid Build Coastguard Worker * created on: 2013feb07
11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer
12*0e209d39SAndroid Build Coastguard Worker */
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #ifndef __COLLATIONDATAREADER_H__
15*0e209d39SAndroid Build Coastguard Worker #define __COLLATIONDATAREADER_H__
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h"
22*0e209d39SAndroid Build Coastguard Worker 
23*0e209d39SAndroid Build Coastguard Worker struct UDataMemory;
24*0e209d39SAndroid Build Coastguard Worker 
25*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker struct CollationTailoring;
28*0e209d39SAndroid Build Coastguard Worker 
29*0e209d39SAndroid Build Coastguard Worker /**
30*0e209d39SAndroid Build Coastguard Worker  * Collation binary data reader: a static-only helper (not instantiable) whose enum names the slots of the serialized int32_t indexes[] array.
31*0e209d39SAndroid Build Coastguard Worker  */
32*0e209d39SAndroid Build Coastguard Worker struct U_I18N_API CollationDataReader /* all static */ {
33*0e209d39SAndroid Build Coastguard Worker     // The following constants are also copied into source/common/ucol_swp.cpp.
34*0e209d39SAndroid Build Coastguard Worker     // Keep them in sync! Enumerator order defines the index positions noted in the trailing comments.
35*0e209d39SAndroid Build Coastguard Worker     enum {
36*0e209d39SAndroid Build Coastguard Worker         /**
37*0e209d39SAndroid Build Coastguard Worker          * Number of int32_t indexes.
38*0e209d39SAndroid Build Coastguard Worker          *
39*0e209d39SAndroid Build Coastguard Worker          * Can be 2 if there are only options.
40*0e209d39SAndroid Build Coastguard Worker          * Can be 7 or 8 if there are only options and a script reordering.
41*0e209d39SAndroid Build Coastguard Worker          * The loader treats indexes[i] as 0 for any i>=indexes[IX_INDEXES_LENGTH].
42*0e209d39SAndroid Build Coastguard Worker          */
43*0e209d39SAndroid Build Coastguard Worker         IX_INDEXES_LENGTH,  // 0
44*0e209d39SAndroid Build Coastguard Worker         /**
45*0e209d39SAndroid Build Coastguard Worker          * Bits 31..24: numericPrimary, for numeric collation
46*0e209d39SAndroid Build Coastguard Worker          *      23..16: fast Latin format version (0 = no fast Latin table)
47*0e209d39SAndroid Build Coastguard Worker          *      15.. 0: options bit set
48*0e209d39SAndroid Build Coastguard Worker          */
49*0e209d39SAndroid Build Coastguard Worker         IX_OPTIONS,  // 1
50*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED2,  // 2
51*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED3,  // 3
52*0e209d39SAndroid Build Coastguard Worker 
53*0e209d39SAndroid Build Coastguard Worker         /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
54*0e209d39SAndroid Build Coastguard Worker         IX_JAMO_CE32S_START,  // 4
55*0e209d39SAndroid Build Coastguard Worker 
56*0e209d39SAndroid Build Coastguard Worker         // Byte offsets from the start of the data, after the generic header.
57*0e209d39SAndroid Build Coastguard Worker         // The indexes[] are at byte offset 0, other data follows.
58*0e209d39SAndroid Build Coastguard Worker         // Each data item is aligned properly.
59*0e209d39SAndroid Build Coastguard Worker         // The data items should be in descending order of unit size,
60*0e209d39SAndroid Build Coastguard Worker         // to minimize the need for padding.
61*0e209d39SAndroid Build Coastguard Worker         // Each item's byte length is given by the difference between its offset and
62*0e209d39SAndroid Build Coastguard Worker         // the next index/offset value.
63*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to int32_t reorderCodes[]. */
64*0e209d39SAndroid Build Coastguard Worker         IX_REORDER_CODES_OFFSET,  // 5
65*0e209d39SAndroid Build Coastguard Worker         /**
66*0e209d39SAndroid Build Coastguard Worker          * Byte offset to uint8_t reorderTable[].
67*0e209d39SAndroid Build Coastguard Worker          * Empty table if <256 bytes (padding only).
68*0e209d39SAndroid Build Coastguard Worker          * Otherwise 256 bytes or more (with padding).
69*0e209d39SAndroid Build Coastguard Worker          */
70*0e209d39SAndroid Build Coastguard Worker         IX_REORDER_TABLE_OFFSET,  // 6
71*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
72*0e209d39SAndroid Build Coastguard Worker         IX_TRIE_OFFSET,  // 7
73*0e209d39SAndroid Build Coastguard Worker 
74*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED8_OFFSET,  // 8
75*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to int64_t ces[]. */
76*0e209d39SAndroid Build Coastguard Worker         IX_CES_OFFSET,  // 9
77*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED10_OFFSET,  // 10
78*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to uint32_t ce32s[]. */
79*0e209d39SAndroid Build Coastguard Worker         IX_CE32S_OFFSET,  // 11
80*0e209d39SAndroid Build Coastguard Worker 
81*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to uint32_t rootElements[]. */
82*0e209d39SAndroid Build Coastguard Worker         IX_ROOT_ELEMENTS_OFFSET,  // 12
83*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to char16_t *contexts[]. */
84*0e209d39SAndroid Build Coastguard Worker         IX_CONTEXTS_OFFSET,  // 13
85*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
86*0e209d39SAndroid Build Coastguard Worker         IX_UNSAFE_BWD_OFFSET,  // 14
87*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to uint16_t fastLatinTable[]. */
88*0e209d39SAndroid Build Coastguard Worker         IX_FAST_LATIN_TABLE_OFFSET,  // 15
89*0e209d39SAndroid Build Coastguard Worker 
90*0e209d39SAndroid Build Coastguard Worker         /** Byte offset to uint16_t scripts[]. */
91*0e209d39SAndroid Build Coastguard Worker         IX_SCRIPTS_OFFSET,  // 16
92*0e209d39SAndroid Build Coastguard Worker         /**
93*0e209d39SAndroid Build Coastguard Worker          * Byte offset to UBool compressibleBytes[].
94*0e209d39SAndroid Build Coastguard Worker          * Empty table if <256 bytes (padding only).
95*0e209d39SAndroid Build Coastguard Worker          * Otherwise 256 bytes or more (with padding).
96*0e209d39SAndroid Build Coastguard Worker          */
97*0e209d39SAndroid Build Coastguard Worker         IX_COMPRESSIBLE_BYTES_OFFSET,  // 17
98*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED18_OFFSET,  // 18
99*0e209d39SAndroid Build Coastguard Worker         IX_TOTAL_SIZE  // 19
100*0e209d39SAndroid Build Coastguard Worker     };
101*0e209d39SAndroid Build Coastguard Worker     // Reads binary collation data from inBytes (length inLength) into tailoring; status is reported via errorCode. NOTE(review): base is presumably the root tailoring -- confirm.
102*0e209d39SAndroid Build Coastguard Worker     static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
103*0e209d39SAndroid Build Coastguard Worker                      CollationTailoring &tailoring, UErrorCode &errorCode);
104*0e209d39SAndroid Build Coastguard Worker     // Signature matches a udata "is acceptable" callback (see unicode/udata.h). NOTE(review): presumably validates pInfo for collation data -- confirm against the implementation.
105*0e209d39SAndroid Build Coastguard Worker     static UBool U_CALLCONV
106*0e209d39SAndroid Build Coastguard Worker     isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
107*0e209d39SAndroid Build Coastguard Worker 
108*0e209d39SAndroid Build Coastguard Worker private:
109*0e209d39SAndroid Build Coastguard Worker     CollationDataReader() = delete;  // no constructor: the struct only groups constants and static functions
110*0e209d39SAndroid Build Coastguard Worker };
111*0e209d39SAndroid Build Coastguard Worker 
112*0e209d39SAndroid Build Coastguard Worker /*
113*0e209d39SAndroid Build Coastguard Worker  * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
114*0e209d39SAndroid Build Coastguard Worker  * Format version 5.
115*0e209d39SAndroid Build Coastguard Worker  *
116*0e209d39SAndroid Build Coastguard Worker  * The root collation data is stored in the ucadata.icu file.
117*0e209d39SAndroid Build Coastguard Worker  * Tailorings are stored inside .res resource bundle files, with a complete file header.
118*0e209d39SAndroid Build Coastguard Worker  *
119*0e209d39SAndroid Build Coastguard Worker  * Collation data begins with a standard ICU data file header
120*0e209d39SAndroid Build Coastguard Worker  * (DataHeader, see ucmndata.h and unicode/udata.h).
121*0e209d39SAndroid Build Coastguard Worker  * The UDataInfo.dataVersion field contains the UCA and other version numbers,
122*0e209d39SAndroid Build Coastguard Worker  * see the comments for CollationTailoring.version.
123*0e209d39SAndroid Build Coastguard Worker  *
124*0e209d39SAndroid Build Coastguard Worker  * After the header, the file contains the following parts.
125*0e209d39SAndroid Build Coastguard Worker  * Constants are defined as enum values of the CollationDataReader class.
126*0e209d39SAndroid Build Coastguard Worker  * See also the Collation class.
127*0e209d39SAndroid Build Coastguard Worker  *
128*0e209d39SAndroid Build Coastguard Worker  * int32_t indexes[indexesLength];
129*0e209d39SAndroid Build Coastguard Worker  *      The indexes array has variable length.
130*0e209d39SAndroid Build Coastguard Worker  *      Some tailorings only need the length and the options,
131*0e209d39SAndroid Build Coastguard Worker  *      others only add reorderCodes and the reorderTable,
132*0e209d39SAndroid Build Coastguard Worker  *      some need to store mappings.
133*0e209d39SAndroid Build Coastguard Worker  *      Only as many indexes are stored as needed to read all of the data.
134*0e209d39SAndroid Build Coastguard Worker  *
135*0e209d39SAndroid Build Coastguard Worker  *      Index 0: indexesLength
136*0e209d39SAndroid Build Coastguard Worker  *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
137*0e209d39SAndroid Build Coastguard Worker  *      Index 2..3: Unused/reserved/0.
138*0e209d39SAndroid Build Coastguard Worker  *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
139*0e209d39SAndroid Build Coastguard Worker  *               are stored in a short, contiguous part of the ce32s array.
140*0e209d39SAndroid Build Coastguard Worker  *
141*0e209d39SAndroid Build Coastguard Worker  *      Indexes 5..19 are byte offsets in ascending order.
142*0e209d39SAndroid Build Coastguard Worker  *      Each byte offset marks the start of the next part in the data file,
143*0e209d39SAndroid Build Coastguard Worker  *      and the end of the previous one.
144*0e209d39SAndroid Build Coastguard Worker  *      When two consecutive byte offsets are the same (or too short),
145*0e209d39SAndroid Build Coastguard Worker  *      then the corresponding part is empty.
146*0e209d39SAndroid Build Coastguard Worker  *      Byte offsets are offsets from after the header,
147*0e209d39SAndroid Build Coastguard Worker  *      that is, from the beginning of the indexes[].
148*0e209d39SAndroid Build Coastguard Worker  *      Each part starts at an offset with proper alignment for its data.
149*0e209d39SAndroid Build Coastguard Worker  *      If necessary, the previous part may include padding bytes to achieve this alignment.
150*0e209d39SAndroid Build Coastguard Worker  *      The last byte offset that is stored in the indexes indicates the total size of the data
151*0e209d39SAndroid Build Coastguard Worker  *      (starting with the indexes).
152*0e209d39SAndroid Build Coastguard Worker  *
153*0e209d39SAndroid Build Coastguard Worker  * int32_t reorderCodes[]; -- empty in root
154*0e209d39SAndroid Build Coastguard Worker  *      The list of script and reordering codes.
155*0e209d39SAndroid Build Coastguard Worker  *
156*0e209d39SAndroid Build Coastguard Worker  *      Beginning with format version 5, this array may optionally
157*0e209d39SAndroid Build Coastguard Worker  *      have trailing entries with a full list of reorder ranges
158*0e209d39SAndroid Build Coastguard Worker  *      as described for CollationSettings::reorderRanges.
159*0e209d39SAndroid Build Coastguard Worker  *
160*0e209d39SAndroid Build Coastguard Worker  *      Script or reorder codes are first and do not exceed 16-bit values.
161*0e209d39SAndroid Build Coastguard Worker  *      Range limits are stored in the upper 16 bits, and are never 0.
162*0e209d39SAndroid Build Coastguard Worker  *      Split this array into reorder codes and ranges at the first entry
163*0e209d39SAndroid Build Coastguard Worker  *      with non-zero upper 16 bits.
164*0e209d39SAndroid Build Coastguard Worker  *
165*0e209d39SAndroid Build Coastguard Worker  *      If the ranges are missing but needed for split-reordered primary lead bytes,
166*0e209d39SAndroid Build Coastguard Worker  *      then they are regenerated at load time.
167*0e209d39SAndroid Build Coastguard Worker  *
168*0e209d39SAndroid Build Coastguard Worker  * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
169*0e209d39SAndroid Build Coastguard Worker  *      Primary-weight lead byte permutation table.
170*0e209d39SAndroid Build Coastguard Worker  *      Normally present when the reorderCodes are, but can be built at load time.
171*0e209d39SAndroid Build Coastguard Worker  *
172*0e209d39SAndroid Build Coastguard Worker  *      Beginning with format version 5, a 0 entry at a non-zero index
173*0e209d39SAndroid Build Coastguard Worker  *      (which is otherwise an illegal value)
174*0e209d39SAndroid Build Coastguard Worker  *      means that the primary lead byte is "split"
175*0e209d39SAndroid Build Coastguard Worker  *      (there are different offsets for primaries that share that lead byte)
176*0e209d39SAndroid Build Coastguard Worker  *      and the reordering offset must be determined via the reorder ranges
177*0e209d39SAndroid Build Coastguard Worker  *      that are either stored as part of the reorderCodes array
178*0e209d39SAndroid Build Coastguard Worker  *      or regenerated at load time.
179*0e209d39SAndroid Build Coastguard Worker  *
180*0e209d39SAndroid Build Coastguard Worker  * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
181*0e209d39SAndroid Build Coastguard Worker  *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
182*0e209d39SAndroid Build Coastguard Worker  *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
183*0e209d39SAndroid Build Coastguard Worker  *      in which case it is a special CE32 and contains a 4-bit tag and further data.
184*0e209d39SAndroid Build Coastguard Worker  *      See the Collation class for details.
185*0e209d39SAndroid Build Coastguard Worker  *
186*0e209d39SAndroid Build Coastguard Worker  *      The trie has a value for each lead surrogate code unit with some bits encoding
187*0e209d39SAndroid Build Coastguard Worker  *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
188*0e209d39SAndroid Build Coastguard Worker  *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
189*0e209d39SAndroid Build Coastguard Worker  *
190*0e209d39SAndroid Build Coastguard Worker  * int64_t ces[];
191*0e209d39SAndroid Build Coastguard Worker  *      64-bit CEs and expansions that cannot be stored in a more compact form.
192*0e209d39SAndroid Build Coastguard Worker  *
193*0e209d39SAndroid Build Coastguard Worker  * uint32_t ce32s[];
194*0e209d39SAndroid Build Coastguard Worker  *      CE32s for expansions in compact form, and for characters whose trie values
195*0e209d39SAndroid Build Coastguard Worker  *      contain special data.
196*0e209d39SAndroid Build Coastguard Worker  *
197*0e209d39SAndroid Build Coastguard Worker  * uint32_t rootElements[]; -- empty in all tailorings
198*0e209d39SAndroid Build Coastguard Worker  *      Compact storage for all of the CEs that occur in the root collation.
199*0e209d39SAndroid Build Coastguard Worker  *      See the CollationRootElements class.
200*0e209d39SAndroid Build Coastguard Worker  *
201*0e209d39SAndroid Build Coastguard Worker  * char16_t *contexts[];
202*0e209d39SAndroid Build Coastguard Worker  *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
203*0e209d39SAndroid Build Coastguard Worker  *
204*0e209d39SAndroid Build Coastguard Worker  * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
205*0e209d39SAndroid Build Coastguard Worker  *      Serialized form of characters that are unsafe when iterating backwards,
206*0e209d39SAndroid Build Coastguard Worker  *      and at the end of an identical string prefix.
207*0e209d39SAndroid Build Coastguard Worker  *      Back up to a safe character.
208*0e209d39SAndroid Build Coastguard Worker  *      Lead surrogates are "unsafe" when any of their corresponding supplementary
209*0e209d39SAndroid Build Coastguard Worker  *      code points are unsafe.
210*0e209d39SAndroid Build Coastguard Worker  *      Does not include [:^lccc=0:][:^tccc=0:].
211*0e209d39SAndroid Build Coastguard Worker  *      For each tailoring, the root unsafeBackwardSet is subtracted.
212*0e209d39SAndroid Build Coastguard Worker  *      (As a result, in many tailorings no set needs to be stored.)
213*0e209d39SAndroid Build Coastguard Worker  *
214*0e209d39SAndroid Build Coastguard Worker  * uint16_t fastLatinTable[];
215*0e209d39SAndroid Build Coastguard Worker  *      Optional optimization for Latin text.
216*0e209d39SAndroid Build Coastguard Worker  *      See the CollationFastLatin class.
217*0e209d39SAndroid Build Coastguard Worker  *
218*0e209d39SAndroid Build Coastguard Worker  * uint16_t scripts[]; -- empty in all tailorings
219*0e209d39SAndroid Build Coastguard Worker  *      Format version 5:
220*0e209d39SAndroid Build Coastguard Worker  *      uint16_t numScripts;
221*0e209d39SAndroid Build Coastguard Worker  *      uint16_t scriptsIndex[numScripts+16];
222*0e209d39SAndroid Build Coastguard Worker  *      uint16_t scriptStarts[];
223*0e209d39SAndroid Build Coastguard Worker  *      See CollationData::numScripts etc.
224*0e209d39SAndroid Build Coastguard Worker  *
225*0e209d39SAndroid Build Coastguard Worker  *      Format version 4:
226*0e209d39SAndroid Build Coastguard Worker  *      Table of the reordering groups with their first and last lead bytes,
227*0e209d39SAndroid Build Coastguard Worker  *      and their script and reordering codes.
228*0e209d39SAndroid Build Coastguard Worker  *      See CollationData::scripts.
229*0e209d39SAndroid Build Coastguard Worker  *
230*0e209d39SAndroid Build Coastguard Worker  * UBool compressibleBytes[]; -- empty in all tailorings
231*0e209d39SAndroid Build Coastguard Worker  *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
232*0e209d39SAndroid Build Coastguard Worker  *
233*0e209d39SAndroid Build Coastguard Worker  * -----------------
234*0e209d39SAndroid Build Coastguard Worker  * Changes for formatVersion 5 (ICU 55)
235*0e209d39SAndroid Build Coastguard Worker  *
236*0e209d39SAndroid Build Coastguard Worker  * Reordering moves single scripts, not groups of scripts.
237*0e209d39SAndroid Build Coastguard Worker  * Reorder ranges are optionally appended to the reorderCodes,
238*0e209d39SAndroid Build Coastguard Worker  * and a 0 entry in the reorderTable indicates a split lead byte.
239*0e209d39SAndroid Build Coastguard Worker  * The scripts data has a new format.
240*0e209d39SAndroid Build Coastguard Worker  *
241*0e209d39SAndroid Build Coastguard Worker  * The rootElements may contain secondary and tertiary weights below common=05.
242*0e209d39SAndroid Build Coastguard Worker  * (Used for small Hiragana letters.)
243*0e209d39SAndroid Build Coastguard Worker  * Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
244*0e209d39SAndroid Build Coastguard Worker  * There are no other data structure changes, but builder code needs to be able to handle such data.
245*0e209d39SAndroid Build Coastguard Worker  *
246*0e209d39SAndroid Build Coastguard Worker  * The collation element for the merge separator code point U+FFFE
247*0e209d39SAndroid Build Coastguard Worker  * does not necessarily have special, unique secondary/tertiary weights any more.
248*0e209d39SAndroid Build Coastguard Worker  */
249*0e209d39SAndroid Build Coastguard Worker 
250*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
251*0e209d39SAndroid Build Coastguard Worker 
252*0e209d39SAndroid Build Coastguard Worker #endif  // !UCONFIG_NO_COLLATION
253*0e209d39SAndroid Build Coastguard Worker #endif  // __COLLATIONDATAREADER_H__
254