xref: /aosp_15_r20/external/icu/libicu/cts_headers/dictionarydata.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2014, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker * Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker *******************************************************************************
8*0e209d39SAndroid Build Coastguard Worker * dictionarydata.h
9*0e209d39SAndroid Build Coastguard Worker *
10*0e209d39SAndroid Build Coastguard Worker * created on: 2012may31
11*0e209d39SAndroid Build Coastguard Worker * created by: Markus W. Scherer & Maxime Serrano
12*0e209d39SAndroid Build Coastguard Worker */
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #ifndef __DICTIONARYDATA_H__
15*0e209d39SAndroid Build Coastguard Worker #define __DICTIONARYDATA_H__
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/utext.h"
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/udata.h"
23*0e209d39SAndroid Build Coastguard Worker #include "udataswp.h"
24*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h"
25*0e209d39SAndroid Build Coastguard Worker #include "unicode/ustringtrie.h"
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
28*0e209d39SAndroid Build Coastguard Worker 
29*0e209d39SAndroid Build Coastguard Worker class UCharsTrie;
30*0e209d39SAndroid Build Coastguard Worker class BytesTrie;
31*0e209d39SAndroid Build Coastguard Worker 
32*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API DictionaryData : public UMemory {
33*0e209d39SAndroid Build Coastguard Worker public:
34*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRIE_TYPE_BYTES; // = 0;
35*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRIE_TYPE_UCHARS; // = 1;
36*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRIE_TYPE_MASK; // = 7;
37*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRIE_HAS_VALUES; // = 8;
38*0e209d39SAndroid Build Coastguard Worker 
39*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRANSFORM_NONE; // = 0;
40*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;
41*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;
42*0e209d39SAndroid Build Coastguard Worker     static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;
43*0e209d39SAndroid Build Coastguard Worker 
44*0e209d39SAndroid Build Coastguard Worker     enum {
45*0e209d39SAndroid Build Coastguard Worker         // Byte offsets from the start of the data, after the generic header.
46*0e209d39SAndroid Build Coastguard Worker         IX_STRING_TRIE_OFFSET,
47*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED1_OFFSET,
48*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED2_OFFSET,
49*0e209d39SAndroid Build Coastguard Worker         IX_TOTAL_SIZE,
50*0e209d39SAndroid Build Coastguard Worker 
51*0e209d39SAndroid Build Coastguard Worker         // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
52*0e209d39SAndroid Build Coastguard Worker         IX_TRIE_TYPE,
53*0e209d39SAndroid Build Coastguard Worker         // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
54*0e209d39SAndroid Build Coastguard Worker         IX_TRANSFORM,
55*0e209d39SAndroid Build Coastguard Worker 
56*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED6,
57*0e209d39SAndroid Build Coastguard Worker         IX_RESERVED7,
58*0e209d39SAndroid Build Coastguard Worker         IX_COUNT
59*0e209d39SAndroid Build Coastguard Worker     };
60*0e209d39SAndroid Build Coastguard Worker };
61*0e209d39SAndroid Build Coastguard Worker 
62*0e209d39SAndroid Build Coastguard Worker /**
63*0e209d39SAndroid Build Coastguard Worker  * Wrapper class around generic dictionaries, implementing matches().
64*0e209d39SAndroid Build Coastguard Worker  * getType() should return a TRIE_TYPE_??? constant from DictionaryData.
65*0e209d39SAndroid Build Coastguard Worker  *
66*0e209d39SAndroid Build Coastguard Worker  * All implementations of this interface must be thread-safe if they are to be used inside of the
67*0e209d39SAndroid Build Coastguard Worker  * dictionary-based break iteration code.
68*0e209d39SAndroid Build Coastguard Worker  */
69*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API DictionaryMatcher : public UMemory {
70*0e209d39SAndroid Build Coastguard Worker public:
DictionaryMatcher()71*0e209d39SAndroid Build Coastguard Worker     DictionaryMatcher() {}
72*0e209d39SAndroid Build Coastguard Worker     virtual ~DictionaryMatcher();
73*0e209d39SAndroid Build Coastguard Worker     // this should emulate CompactTrieDictionary::matches()
74*0e209d39SAndroid Build Coastguard Worker     /*  @param text      The text in which to look for matching words. Matching begins
75*0e209d39SAndroid Build Coastguard Worker      *                   at the current position of the UText.
76*0e209d39SAndroid Build Coastguard Worker      *  @param maxLength The max length of match to consider. Units are the native indexing
77*0e209d39SAndroid Build Coastguard Worker      *                   units of the UText.
78*0e209d39SAndroid Build Coastguard Worker      *  @param limit     Capacity of output arrays, which is also the maximum number of
79*0e209d39SAndroid Build Coastguard Worker      *                   matching words to be found.
80*0e209d39SAndroid Build Coastguard Worker      *  @param lengths   output array, filled with the lengths of the matches, in order,
81*0e209d39SAndroid Build Coastguard Worker      *                   from shortest to longest. Lengths are in native indexing units
82*0e209d39SAndroid Build Coastguard Worker      *                   of the UText. May be nullptr.
83*0e209d39SAndroid Build Coastguard Worker      *  @param cpLengths output array, filled with the lengths of the matches, in order,
84*0e209d39SAndroid Build Coastguard Worker      *                   from shortest to longest. Lengths are the number of Unicode code points.
85*0e209d39SAndroid Build Coastguard Worker      *                   May be nullptr.
86*0e209d39SAndroid Build Coastguard Worker      *  @param values    Output array, filled with the values associated with the words found.
87*0e209d39SAndroid Build Coastguard Worker      *                   May be nullptr.
88*0e209d39SAndroid Build Coastguard Worker      *  @param prefix    Output parameter, the code point length of the prefix match, even if that
89*0e209d39SAndroid Build Coastguard Worker      *                   prefix didn't lead to a complete word. Will always be >= the cpLength
90*0e209d39SAndroid Build Coastguard Worker      *                   of the longest complete word matched. May be nullptr.
91*0e209d39SAndroid Build Coastguard Worker      *  @return          Number of matching words found.
92*0e209d39SAndroid Build Coastguard Worker      */
93*0e209d39SAndroid Build Coastguard Worker     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
94*0e209d39SAndroid Build Coastguard Worker                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
95*0e209d39SAndroid Build Coastguard Worker                             int32_t *prefix) const = 0;
96*0e209d39SAndroid Build Coastguard Worker 
97*0e209d39SAndroid Build Coastguard Worker     /** @return DictionaryData::TRIE_TYPE_XYZ */
98*0e209d39SAndroid Build Coastguard Worker     virtual int32_t getType() const = 0;
99*0e209d39SAndroid Build Coastguard Worker };
100*0e209d39SAndroid Build Coastguard Worker 
101*0e209d39SAndroid Build Coastguard Worker // Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
102*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
103*0e209d39SAndroid Build Coastguard Worker public:
104*0e209d39SAndroid Build Coastguard Worker     // constructs a new UCharsDictionaryMatcher.
105*0e209d39SAndroid Build Coastguard Worker     // The UDataMemory * will be closed on this object's destruction.
UCharsDictionaryMatcher(const char16_t * c,UDataMemory * f)106*0e209d39SAndroid Build Coastguard Worker     UCharsDictionaryMatcher(const char16_t *c, UDataMemory *f) : characters(c), file(f) { }
107*0e209d39SAndroid Build Coastguard Worker     virtual ~UCharsDictionaryMatcher();
108*0e209d39SAndroid Build Coastguard Worker     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
109*0e209d39SAndroid Build Coastguard Worker                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
110*0e209d39SAndroid Build Coastguard Worker                             int32_t *prefix) const override;
111*0e209d39SAndroid Build Coastguard Worker     virtual int32_t getType() const override;
112*0e209d39SAndroid Build Coastguard Worker private:
113*0e209d39SAndroid Build Coastguard Worker     const char16_t *characters;
114*0e209d39SAndroid Build Coastguard Worker     UDataMemory *file;
115*0e209d39SAndroid Build Coastguard Worker };
116*0e209d39SAndroid Build Coastguard Worker 
117*0e209d39SAndroid Build Coastguard Worker // Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
118*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
119*0e209d39SAndroid Build Coastguard Worker public:
120*0e209d39SAndroid Build Coastguard Worker     // constructs a new BytesTrieDictionaryMatcher
121*0e209d39SAndroid Build Coastguard Worker     // the transform constant should be the constant read from the file, not a masked version!
122*0e209d39SAndroid Build Coastguard Worker     // the UDataMemory * fed in here will be closed on this object's destruction
BytesDictionaryMatcher(const char * c,int32_t t,UDataMemory * f)123*0e209d39SAndroid Build Coastguard Worker     BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
124*0e209d39SAndroid Build Coastguard Worker             : characters(c), transformConstant(t), file(f) { }
125*0e209d39SAndroid Build Coastguard Worker     virtual ~BytesDictionaryMatcher();
126*0e209d39SAndroid Build Coastguard Worker     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
127*0e209d39SAndroid Build Coastguard Worker                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
128*0e209d39SAndroid Build Coastguard Worker                             int32_t *prefix) const override;
129*0e209d39SAndroid Build Coastguard Worker     virtual int32_t getType() const override;
130*0e209d39SAndroid Build Coastguard Worker private:
131*0e209d39SAndroid Build Coastguard Worker     UChar32 transform(UChar32 c) const;
132*0e209d39SAndroid Build Coastguard Worker 
133*0e209d39SAndroid Build Coastguard Worker     const char *characters;
134*0e209d39SAndroid Build Coastguard Worker     int32_t transformConstant;
135*0e209d39SAndroid Build Coastguard Worker     UDataMemory *file;
136*0e209d39SAndroid Build Coastguard Worker };
137*0e209d39SAndroid Build Coastguard Worker 
138*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
139*0e209d39SAndroid Build Coastguard Worker 
140*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2
141*0e209d39SAndroid Build Coastguard Worker udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
142*0e209d39SAndroid Build Coastguard Worker 
143*0e209d39SAndroid Build Coastguard Worker /**
144*0e209d39SAndroid Build Coastguard Worker  * Format of dictionary .dict data files.
145*0e209d39SAndroid Build Coastguard Worker  * Format version 1.0.
146*0e209d39SAndroid Build Coastguard Worker  *
147*0e209d39SAndroid Build Coastguard Worker  * A dictionary .dict data file contains a byte-serialized BytesTrie or
148*0e209d39SAndroid Build Coastguard Worker  * a UChars-serialized UCharsTrie.
149*0e209d39SAndroid Build Coastguard Worker  * Such files are used in dictionary-based break iteration (DBBI).
150*0e209d39SAndroid Build Coastguard Worker  *
151*0e209d39SAndroid Build Coastguard Worker  * For a BytesTrie, a transformation type is specified for
152*0e209d39SAndroid Build Coastguard Worker  * transforming Unicode strings into byte sequences.
153*0e209d39SAndroid Build Coastguard Worker  *
154*0e209d39SAndroid Build Coastguard Worker  * A .dict file begins with a standard ICU data file header
155*0e209d39SAndroid Build Coastguard Worker  * (DataHeader, see ucmndata.h and unicode/udata.h).
156*0e209d39SAndroid Build Coastguard Worker  * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
157*0e209d39SAndroid Build Coastguard Worker  *
158*0e209d39SAndroid Build Coastguard Worker  * After the header, the file contains the following parts.
159*0e209d39SAndroid Build Coastguard Worker  * Constants are defined in the DictionaryData class.
160*0e209d39SAndroid Build Coastguard Worker  *
161*0e209d39SAndroid Build Coastguard Worker  * For the data structure of BytesTrie & UCharsTrie see
162*0e209d39SAndroid Build Coastguard Worker  * https://icu.unicode.org/design/struct/tries
163*0e209d39SAndroid Build Coastguard Worker  * and the bytestrie.h and ucharstrie.h header files.
164*0e209d39SAndroid Build Coastguard Worker  *
165*0e209d39SAndroid Build Coastguard Worker  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
166*0e209d39SAndroid Build Coastguard Worker  *
167*0e209d39SAndroid Build Coastguard Worker  *      The first four indexes are byte offsets in ascending order.
168*0e209d39SAndroid Build Coastguard Worker  *      Each byte offset marks the start of the next part in the data file,
169*0e209d39SAndroid Build Coastguard Worker  *      and the end of the previous one.
170*0e209d39SAndroid Build Coastguard Worker  *      When two consecutive byte offsets are the same, then the corresponding part is empty.
171*0e209d39SAndroid Build Coastguard Worker  *      Byte offsets are offsets from after the header,
172*0e209d39SAndroid Build Coastguard Worker  *      that is, from the beginning of the indexes[].
173*0e209d39SAndroid Build Coastguard Worker  *      Each part starts at an offset with proper alignment for its data.
174*0e209d39SAndroid Build Coastguard Worker  *      If necessary, the previous part may include padding bytes to achieve this alignment.
175*0e209d39SAndroid Build Coastguard Worker  *
176*0e209d39SAndroid Build Coastguard Worker  *      trieType=indexes[IX_TRIE_TYPE] defines the trie type.
177*0e209d39SAndroid Build Coastguard Worker  *      transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
178*0e209d39SAndroid Build Coastguard Worker  *          If the transformation type is TRANSFORM_TYPE_OFFSET,
179*0e209d39SAndroid Build Coastguard Worker  *          then the lower 21 bits contain the offset code point.
180*0e209d39SAndroid Build Coastguard Worker  *          Each code point c is mapped to byte b = (c - offset).
181*0e209d39SAndroid Build Coastguard Worker  *          Code points outside the range offset..(offset+0xff) cannot be mapped
182*0e209d39SAndroid Build Coastguard Worker  *          and do not occur in the dictionary.
183*0e209d39SAndroid Build Coastguard Worker  *
184*0e209d39SAndroid Build Coastguard Worker  * stringTrie; -- a serialized BytesTrie or UCharsTrie
185*0e209d39SAndroid Build Coastguard Worker  *
186*0e209d39SAndroid Build Coastguard Worker  *      The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
187*0e209d39SAndroid Build Coastguard Worker  *      or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
188*0e209d39SAndroid Build Coastguard Worker  */
189*0e209d39SAndroid Build Coastguard Worker 
190*0e209d39SAndroid Build Coastguard Worker #endif  /* !UCONFIG_NO_BREAK_ITERATION */
191*0e209d39SAndroid Build Coastguard Worker #endif  /* __DICTIONARYDATA_H__ */
192