1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2013-2015, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ******************************************************************************* 8*0e209d39SAndroid Build Coastguard Worker * collationdatareader.h 9*0e209d39SAndroid Build Coastguard Worker * 10*0e209d39SAndroid Build Coastguard Worker * created on: 2013feb07 11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer 12*0e209d39SAndroid Build Coastguard Worker */ 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #ifndef __COLLATIONDATAREADER_H__ 15*0e209d39SAndroid Build Coastguard Worker #define __COLLATIONDATAREADER_H__ 16*0e209d39SAndroid Build Coastguard Worker 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 18*0e209d39SAndroid Build Coastguard Worker 19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h" 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker struct UDataMemory; 24*0e209d39SAndroid Build Coastguard Worker 25*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 26*0e209d39SAndroid Build Coastguard Worker 27*0e209d39SAndroid Build Coastguard Worker struct CollationTailoring; 28*0e209d39SAndroid Build Coastguard Worker 29*0e209d39SAndroid Build Coastguard Worker /** 30*0e209d39SAndroid Build Coastguard Worker * Collation binary data reader. 31*0e209d39SAndroid Build Coastguard Worker */ 32*0e209d39SAndroid Build Coastguard Worker struct U_I18N_API CollationDataReader /* all static */ { 33*0e209d39SAndroid Build Coastguard Worker // The following constants are also copied into source/common/ucol_swp.cpp. 34*0e209d39SAndroid Build Coastguard Worker // Keep them in sync! 35*0e209d39SAndroid Build Coastguard Worker enum { 36*0e209d39SAndroid Build Coastguard Worker /** 37*0e209d39SAndroid Build Coastguard Worker * Number of int32_t indexes. 38*0e209d39SAndroid Build Coastguard Worker * 39*0e209d39SAndroid Build Coastguard Worker * Can be 2 if there are only options. 40*0e209d39SAndroid Build Coastguard Worker * Can be 7 or 8 if there are only options and a script reordering. 41*0e209d39SAndroid Build Coastguard Worker * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 42*0e209d39SAndroid Build Coastguard Worker */ 43*0e209d39SAndroid Build Coastguard Worker IX_INDEXES_LENGTH, // 0 44*0e209d39SAndroid Build Coastguard Worker /** 45*0e209d39SAndroid Build Coastguard Worker * Bits 31..24: numericPrimary, for numeric collation 46*0e209d39SAndroid Build Coastguard Worker * 23..16: fast Latin format version (0 = no fast Latin table) 47*0e209d39SAndroid Build Coastguard Worker * 15.. 0: options bit set 48*0e209d39SAndroid Build Coastguard Worker */ 49*0e209d39SAndroid Build Coastguard Worker IX_OPTIONS, 50*0e209d39SAndroid Build Coastguard Worker IX_RESERVED2, 51*0e209d39SAndroid Build Coastguard Worker IX_RESERVED3, 52*0e209d39SAndroid Build Coastguard Worker 53*0e209d39SAndroid Build Coastguard Worker /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 54*0e209d39SAndroid Build Coastguard Worker IX_JAMO_CE32S_START, // 4 55*0e209d39SAndroid Build Coastguard Worker 56*0e209d39SAndroid Build Coastguard Worker // Byte offsets from the start of the data, after the generic header. 57*0e209d39SAndroid Build Coastguard Worker // The indexes[] are at byte offset 0, other data follows. 58*0e209d39SAndroid Build Coastguard Worker // Each data item is aligned properly. 59*0e209d39SAndroid Build Coastguard Worker // The data items should be in descending order of unit size, 60*0e209d39SAndroid Build Coastguard Worker // to minimize the need for padding. 61*0e209d39SAndroid Build Coastguard Worker // Each item's byte length is given by the difference between its offset and 62*0e209d39SAndroid Build Coastguard Worker // the next index/offset value. 63*0e209d39SAndroid Build Coastguard Worker /** Byte offset to int32_t reorderCodes[]. */ 64*0e209d39SAndroid Build Coastguard Worker IX_REORDER_CODES_OFFSET, 65*0e209d39SAndroid Build Coastguard Worker /** 66*0e209d39SAndroid Build Coastguard Worker * Byte offset to uint8_t reorderTable[]. 67*0e209d39SAndroid Build Coastguard Worker * Empty table if <256 bytes (padding only). 68*0e209d39SAndroid Build Coastguard Worker * Otherwise 256 bytes or more (with padding). 69*0e209d39SAndroid Build Coastguard Worker */ 70*0e209d39SAndroid Build Coastguard Worker IX_REORDER_TABLE_OFFSET, 71*0e209d39SAndroid Build Coastguard Worker /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 72*0e209d39SAndroid Build Coastguard Worker IX_TRIE_OFFSET, 73*0e209d39SAndroid Build Coastguard Worker 74*0e209d39SAndroid Build Coastguard Worker IX_RESERVED8_OFFSET, // 8 75*0e209d39SAndroid Build Coastguard Worker /** Byte offset to int64_t ces[]. */ 76*0e209d39SAndroid Build Coastguard Worker IX_CES_OFFSET, 77*0e209d39SAndroid Build Coastguard Worker IX_RESERVED10_OFFSET, 78*0e209d39SAndroid Build Coastguard Worker /** Byte offset to uint32_t ce32s[]. */ 79*0e209d39SAndroid Build Coastguard Worker IX_CE32S_OFFSET, 80*0e209d39SAndroid Build Coastguard Worker 81*0e209d39SAndroid Build Coastguard Worker /** Byte offset to uint32_t rootElements[]. */ 82*0e209d39SAndroid Build Coastguard Worker IX_ROOT_ELEMENTS_OFFSET, // 12 83*0e209d39SAndroid Build Coastguard Worker /** Byte offset to char16_t *contexts[]. */ 84*0e209d39SAndroid Build Coastguard Worker IX_CONTEXTS_OFFSET, 85*0e209d39SAndroid Build Coastguard Worker /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ 86*0e209d39SAndroid Build Coastguard Worker IX_UNSAFE_BWD_OFFSET, 87*0e209d39SAndroid Build Coastguard Worker /** Byte offset to uint16_t fastLatinTable[]. */ 88*0e209d39SAndroid Build Coastguard Worker IX_FAST_LATIN_TABLE_OFFSET, 89*0e209d39SAndroid Build Coastguard Worker 90*0e209d39SAndroid Build Coastguard Worker /** Byte offset to uint16_t scripts[]. */ 91*0e209d39SAndroid Build Coastguard Worker IX_SCRIPTS_OFFSET, // 16 92*0e209d39SAndroid Build Coastguard Worker /** 93*0e209d39SAndroid Build Coastguard Worker * Byte offset to UBool compressibleBytes[]. 94*0e209d39SAndroid Build Coastguard Worker * Empty table if <256 bytes (padding only). 95*0e209d39SAndroid Build Coastguard Worker * Otherwise 256 bytes or more (with padding). 96*0e209d39SAndroid Build Coastguard Worker */ 97*0e209d39SAndroid Build Coastguard Worker IX_COMPRESSIBLE_BYTES_OFFSET, 98*0e209d39SAndroid Build Coastguard Worker IX_RESERVED18_OFFSET, 99*0e209d39SAndroid Build Coastguard Worker IX_TOTAL_SIZE 100*0e209d39SAndroid Build Coastguard Worker }; 101*0e209d39SAndroid Build Coastguard Worker 102*0e209d39SAndroid Build Coastguard Worker static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, 103*0e209d39SAndroid Build Coastguard Worker CollationTailoring &tailoring, UErrorCode &errorCode); 104*0e209d39SAndroid Build Coastguard Worker 105*0e209d39SAndroid Build Coastguard Worker static UBool U_CALLCONV 106*0e209d39SAndroid Build Coastguard Worker isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); 107*0e209d39SAndroid Build Coastguard Worker 108*0e209d39SAndroid Build Coastguard Worker private: 109*0e209d39SAndroid Build Coastguard Worker CollationDataReader() = delete; // no constructor 110*0e209d39SAndroid Build Coastguard Worker }; 111*0e209d39SAndroid Build Coastguard Worker 112*0e209d39SAndroid Build Coastguard Worker /* 113*0e209d39SAndroid Build Coastguard Worker * Format of collation data (ucadata.icu, binary data in coll/ *.res files). 114*0e209d39SAndroid Build Coastguard Worker * Format version 5. 115*0e209d39SAndroid Build Coastguard Worker * 116*0e209d39SAndroid Build Coastguard Worker * The root collation data is stored in the ucadata.icu file. 117*0e209d39SAndroid Build Coastguard Worker * Tailorings are stored inside .res resource bundle files, with a complete file header. 118*0e209d39SAndroid Build Coastguard Worker * 119*0e209d39SAndroid Build Coastguard Worker * Collation data begins with a standard ICU data file header 120*0e209d39SAndroid Build Coastguard Worker * (DataHeader, see ucmndata.h and unicode/udata.h). 121*0e209d39SAndroid Build Coastguard Worker * The UDataInfo.dataVersion field contains the UCA and other version numbers, 122*0e209d39SAndroid Build Coastguard Worker * see the comments for CollationTailoring.version. 123*0e209d39SAndroid Build Coastguard Worker * 124*0e209d39SAndroid Build Coastguard Worker * After the header, the file contains the following parts. 125*0e209d39SAndroid Build Coastguard Worker * Constants are defined as enum values of the CollationDataReader class. 126*0e209d39SAndroid Build Coastguard Worker * See also the Collation class. 127*0e209d39SAndroid Build Coastguard Worker * 128*0e209d39SAndroid Build Coastguard Worker * int32_t indexes[indexesLength]; 129*0e209d39SAndroid Build Coastguard Worker * The indexes array has variable length. 130*0e209d39SAndroid Build Coastguard Worker * Some tailorings only need the length and the options, 131*0e209d39SAndroid Build Coastguard Worker * others only add reorderCodes and the reorderTable, 132*0e209d39SAndroid Build Coastguard Worker * some need to store mappings. 133*0e209d39SAndroid Build Coastguard Worker * Only as many indexes are stored as needed to read all of the data. 134*0e209d39SAndroid Build Coastguard Worker * 135*0e209d39SAndroid Build Coastguard Worker * Index 0: indexesLength 136*0e209d39SAndroid Build Coastguard Worker * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS 137*0e209d39SAndroid Build Coastguard Worker * Index 2..3: Unused/reserved/0. 138*0e209d39SAndroid Build Coastguard Worker * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo 139*0e209d39SAndroid Build Coastguard Worker * are stored in a short, contiguous part of the ce32s array. 140*0e209d39SAndroid Build Coastguard Worker * 141*0e209d39SAndroid Build Coastguard Worker * Indexes 5..19 are byte offsets in ascending order. 142*0e209d39SAndroid Build Coastguard Worker * Each byte offset marks the start of the next part in the data file, 143*0e209d39SAndroid Build Coastguard Worker * and the end of the previous one. 144*0e209d39SAndroid Build Coastguard Worker * When two consecutive byte offsets are the same (or too short), 145*0e209d39SAndroid Build Coastguard Worker * then the corresponding part is empty. 146*0e209d39SAndroid Build Coastguard Worker * Byte offsets are offsets from after the header, 147*0e209d39SAndroid Build Coastguard Worker * that is, from the beginning of the indexes[]. 148*0e209d39SAndroid Build Coastguard Worker * Each part starts at an offset with proper alignment for its data. 149*0e209d39SAndroid Build Coastguard Worker * If necessary, the previous part may include padding bytes to achieve this alignment. 150*0e209d39SAndroid Build Coastguard Worker * The last byte offset that is stored in the indexes indicates the total size of the data 151*0e209d39SAndroid Build Coastguard Worker * (starting with the indexes). 152*0e209d39SAndroid Build Coastguard Worker * 153*0e209d39SAndroid Build Coastguard Worker * int32_t reorderCodes[]; -- empty in root 154*0e209d39SAndroid Build Coastguard Worker * The list of script and reordering codes. 155*0e209d39SAndroid Build Coastguard Worker * 156*0e209d39SAndroid Build Coastguard Worker * Beginning with format version 5, this array may optionally 157*0e209d39SAndroid Build Coastguard Worker * have trailing entries with a full list of reorder ranges 158*0e209d39SAndroid Build Coastguard Worker * as described for CollationSettings::reorderRanges. 159*0e209d39SAndroid Build Coastguard Worker * 160*0e209d39SAndroid Build Coastguard Worker * Script or reorder codes are first and do not exceed 16-bit values. 161*0e209d39SAndroid Build Coastguard Worker * Range limits are stored in the upper 16 bits, and are never 0. 162*0e209d39SAndroid Build Coastguard Worker * Split this array into reorder codes and ranges at the first entry 163*0e209d39SAndroid Build Coastguard Worker * with non-zero upper 16 bits. 164*0e209d39SAndroid Build Coastguard Worker * 165*0e209d39SAndroid Build Coastguard Worker * If the ranges are missing but needed for split-reordered primary lead bytes, 166*0e209d39SAndroid Build Coastguard Worker * then they are regenerated at load time. 167*0e209d39SAndroid Build Coastguard Worker * 168*0e209d39SAndroid Build Coastguard Worker * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes 169*0e209d39SAndroid Build Coastguard Worker * Primary-weight lead byte permutation table. 170*0e209d39SAndroid Build Coastguard Worker * Normally present when the reorderCodes are, but can be built at load time. 171*0e209d39SAndroid Build Coastguard Worker * 172*0e209d39SAndroid Build Coastguard Worker * Beginning with format version 5, a 0 entry at a non-zero index 173*0e209d39SAndroid Build Coastguard Worker * (which is otherwise an illegal value) 174*0e209d39SAndroid Build Coastguard Worker * means that the primary lead byte is "split" 175*0e209d39SAndroid Build Coastguard Worker * (there are different offsets for primaries that share that lead byte) 176*0e209d39SAndroid Build Coastguard Worker * and the reordering offset must be determined via the reorder ranges 177*0e209d39SAndroid Build Coastguard Worker * that are either stored as part of the reorderCodes array 178*0e209d39SAndroid Build Coastguard Worker * or regenerated at load time. 179*0e209d39SAndroid Build Coastguard Worker * 180*0e209d39SAndroid Build Coastguard Worker * UTrie2 trie; -- see utrie2_impl.h and utrie2.h 181*0e209d39SAndroid Build Coastguard Worker * The trie holds the main collation data. Each code point is mapped to a 32-bit value. 182*0e209d39SAndroid Build Coastguard Worker * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, 183*0e209d39SAndroid Build Coastguard Worker * in which case it is a special CE32 and contains a 4-bit tag and further data. 184*0e209d39SAndroid Build Coastguard Worker * See the Collation class for details. 185*0e209d39SAndroid Build Coastguard Worker * 186*0e209d39SAndroid Build Coastguard Worker * The trie has a value for each lead surrogate code unit with some bits encoding 187*0e209d39SAndroid Build Coastguard Worker * collective properties of the 1024 supplementary characters whose UTF-16 form starts with 188*0e209d39SAndroid Build Coastguard Worker * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. 189*0e209d39SAndroid Build Coastguard Worker * 190*0e209d39SAndroid Build Coastguard Worker * int64_t ces[]; 191*0e209d39SAndroid Build Coastguard Worker * 64-bit CEs and expansions that cannot be stored in a more compact form. 192*0e209d39SAndroid Build Coastguard Worker * 193*0e209d39SAndroid Build Coastguard Worker * uint32_t ce32s[]; 194*0e209d39SAndroid Build Coastguard Worker * CE32s for expansions in compact form, and for characters whose trie values 195*0e209d39SAndroid Build Coastguard Worker * contain special data. 196*0e209d39SAndroid Build Coastguard Worker * 197*0e209d39SAndroid Build Coastguard Worker * uint32_t rootElements[]; -- empty in all tailorings 198*0e209d39SAndroid Build Coastguard Worker * Compact storage for all of the CEs that occur in the root collation. 199*0e209d39SAndroid Build Coastguard Worker * See the CollationRootElements class. 200*0e209d39SAndroid Build Coastguard Worker * 201*0e209d39SAndroid Build Coastguard Worker * char16_t *contexts[]; 202*0e209d39SAndroid Build Coastguard Worker * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. 203*0e209d39SAndroid Build Coastguard Worker * 204*0e209d39SAndroid Build Coastguard Worker * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() 205*0e209d39SAndroid Build Coastguard Worker * Serialized form of characters that are unsafe when iterating backwards, 206*0e209d39SAndroid Build Coastguard Worker * and at the end of an identical string prefix. 207*0e209d39SAndroid Build Coastguard Worker * Back up to a safe character. 208*0e209d39SAndroid Build Coastguard Worker * Lead surrogates are "unsafe" when any of their corresponding supplementary 209*0e209d39SAndroid Build Coastguard Worker * code points are unsafe. 210*0e209d39SAndroid Build Coastguard Worker * Does not include [:^lccc=0:][:^tccc=0:]. 211*0e209d39SAndroid Build Coastguard Worker * For each tailoring, the root unsafeBackwardSet is subtracted. 212*0e209d39SAndroid Build Coastguard Worker * (As a result, in many tailorings no set needs to be stored.) 213*0e209d39SAndroid Build Coastguard Worker * 214*0e209d39SAndroid Build Coastguard Worker * uint16_t fastLatinTable[]; 215*0e209d39SAndroid Build Coastguard Worker * Optional optimization for Latin text. 216*0e209d39SAndroid Build Coastguard Worker * See the CollationFastLatin class. 217*0e209d39SAndroid Build Coastguard Worker * 218*0e209d39SAndroid Build Coastguard Worker * uint16_t scripts[]; -- empty in all tailorings 219*0e209d39SAndroid Build Coastguard Worker * Format version 5: 220*0e209d39SAndroid Build Coastguard Worker * uint16_t numScripts; 221*0e209d39SAndroid Build Coastguard Worker * uint16_t scriptsIndex[numScripts+16]; 222*0e209d39SAndroid Build Coastguard Worker * uint16_t scriptStarts[]; 223*0e209d39SAndroid Build Coastguard Worker * See CollationData::numScripts etc. 224*0e209d39SAndroid Build Coastguard Worker * 225*0e209d39SAndroid Build Coastguard Worker * Format version 4: 226*0e209d39SAndroid Build Coastguard Worker * Table of the reordering groups with their first and last lead bytes, 227*0e209d39SAndroid Build Coastguard Worker * and their script and reordering codes. 228*0e209d39SAndroid Build Coastguard Worker * See CollationData::scripts. 229*0e209d39SAndroid Build Coastguard Worker * 230*0e209d39SAndroid Build Coastguard Worker * UBool compressibleBytes[]; -- empty in all tailorings 231*0e209d39SAndroid Build Coastguard Worker * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. 232*0e209d39SAndroid Build Coastguard Worker * 233*0e209d39SAndroid Build Coastguard Worker * ----------------- 234*0e209d39SAndroid Build Coastguard Worker * Changes for formatVersion 5 (ICU 55) 235*0e209d39SAndroid Build Coastguard Worker * 236*0e209d39SAndroid Build Coastguard Worker * Reordering moves single scripts, not groups of scripts. 237*0e209d39SAndroid Build Coastguard Worker * Reorder ranges are optionally appended to the reorderCodes, 238*0e209d39SAndroid Build Coastguard Worker * and a 0 entry in the reorderTable indicates a split lead byte. 239*0e209d39SAndroid Build Coastguard Worker * The scripts data has a new format. 240*0e209d39SAndroid Build Coastguard Worker * 241*0e209d39SAndroid Build Coastguard Worker * The rootElements may contain secondary and tertiary weights below common=05. 242*0e209d39SAndroid Build Coastguard Worker * (Used for small Hiragana letters.) 243*0e209d39SAndroid Build Coastguard Worker * Where is occurs, there is also an explicit unit with common secondary & tertiary weights. 244*0e209d39SAndroid Build Coastguard Worker * There are no other data structure changes, but builder code needs to be able to handle such data. 245*0e209d39SAndroid Build Coastguard Worker * 246*0e209d39SAndroid Build Coastguard Worker * The collation element for the merge separator code point U+FFFE 247*0e209d39SAndroid Build Coastguard Worker * does not necessarily have special, unique secondary/tertiary weights any more. 248*0e209d39SAndroid Build Coastguard Worker */ 249*0e209d39SAndroid Build Coastguard Worker 250*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 251*0e209d39SAndroid Build Coastguard Worker 252*0e209d39SAndroid Build Coastguard Worker #endif // !UCONFIG_NO_COLLATION 253*0e209d39SAndroid Build Coastguard Worker #endif // __COLLATIONDATAREADER_H__ 254