// xref: /aosp_15_r20/external/icu/libicu/cts_headers/usrchimp.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
11*0e209d39SAndroid Build Coastguard Worker #ifndef USRCHIMP_H
12*0e209d39SAndroid Build Coastguard Worker #define USRCHIMP_H
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
15*0e209d39SAndroid Build Coastguard Worker 
16*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION
17*0e209d39SAndroid Build Coastguard Worker 
18*0e209d39SAndroid Build Coastguard Worker #include "unicode/normalizer2.h"
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucol.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucoleitr.h"
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/ubrk.h"
22*0e209d39SAndroid Build Coastguard Worker 
/*
 * Bit layout of a 32-bit collation element (CE) as implied by the masks below:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

#define UCOL_IGNORABLE 0

/* get weights from a CE; each macro yields the weight shifted down to bit 0 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* both bits of this marker must be set for a CE to count as a continuation */
#define UCOL_CONTINUATION_MARKER 0xC0

/* nonzero when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)

51*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
52*0e209d39SAndroid Build Coastguard Worker 
53*0e209d39SAndroid Build Coastguard Worker class CollationElementIterator;
54*0e209d39SAndroid Build Coastguard Worker class Collator;
55*0e209d39SAndroid Build Coastguard Worker 
56*0e209d39SAndroid Build Coastguard Worker struct PCEI
57*0e209d39SAndroid Build Coastguard Worker {
58*0e209d39SAndroid Build Coastguard Worker     uint64_t ce;
59*0e209d39SAndroid Build Coastguard Worker     int32_t  low;
60*0e209d39SAndroid Build Coastguard Worker     int32_t  high;
61*0e209d39SAndroid Build Coastguard Worker };
62*0e209d39SAndroid Build Coastguard Worker 
63*0e209d39SAndroid Build Coastguard Worker struct PCEBuffer
64*0e209d39SAndroid Build Coastguard Worker {
65*0e209d39SAndroid Build Coastguard Worker     PCEI    defaultBuffer[16];
66*0e209d39SAndroid Build Coastguard Worker     PCEI   *buffer;
67*0e209d39SAndroid Build Coastguard Worker     int32_t bufferIndex;
68*0e209d39SAndroid Build Coastguard Worker     int32_t bufferSize;
69*0e209d39SAndroid Build Coastguard Worker 
70*0e209d39SAndroid Build Coastguard Worker     PCEBuffer();
71*0e209d39SAndroid Build Coastguard Worker     ~PCEBuffer();
72*0e209d39SAndroid Build Coastguard Worker 
73*0e209d39SAndroid Build Coastguard Worker     void  reset();
74*0e209d39SAndroid Build Coastguard Worker     UBool isEmpty() const;
75*0e209d39SAndroid Build Coastguard Worker     void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
76*0e209d39SAndroid Build Coastguard Worker     const PCEI *get();
77*0e209d39SAndroid Build Coastguard Worker };
78*0e209d39SAndroid Build Coastguard Worker 
79*0e209d39SAndroid Build Coastguard Worker class UCollationPCE : public UMemory {
80*0e209d39SAndroid Build Coastguard Worker private:
81*0e209d39SAndroid Build Coastguard Worker     PCEBuffer          pceBuffer;
82*0e209d39SAndroid Build Coastguard Worker     CollationElementIterator *cei;
83*0e209d39SAndroid Build Coastguard Worker     UCollationStrength strength;
84*0e209d39SAndroid Build Coastguard Worker     UBool              toShift;
85*0e209d39SAndroid Build Coastguard Worker     UBool              isShifted;
86*0e209d39SAndroid Build Coastguard Worker     uint32_t           variableTop;
87*0e209d39SAndroid Build Coastguard Worker 
88*0e209d39SAndroid Build Coastguard Worker public:
89*0e209d39SAndroid Build Coastguard Worker     UCollationPCE(UCollationElements *elems);
90*0e209d39SAndroid Build Coastguard Worker     UCollationPCE(CollationElementIterator *iter);
91*0e209d39SAndroid Build Coastguard Worker     ~UCollationPCE();
92*0e209d39SAndroid Build Coastguard Worker 
93*0e209d39SAndroid Build Coastguard Worker     void init(UCollationElements *elems);
94*0e209d39SAndroid Build Coastguard Worker     void init(CollationElementIterator *iter);
95*0e209d39SAndroid Build Coastguard Worker 
96*0e209d39SAndroid Build Coastguard Worker     /**
97*0e209d39SAndroid Build Coastguard Worker      * Get the processed ordering priority of the next collation element in the text.
98*0e209d39SAndroid Build Coastguard Worker      * A single character may contain more than one collation element.
99*0e209d39SAndroid Build Coastguard Worker      *
100*0e209d39SAndroid Build Coastguard Worker      * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
101*0e209d39SAndroid Build Coastguard Worker      * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
102*0e209d39SAndroid Build Coastguard Worker      * @param status A pointer to an UErrorCode to receive any errors.
103*0e209d39SAndroid Build Coastguard Worker      * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
104*0e209d39SAndroid Build Coastguard Worker      *         if an error has occurred or if the end of string has been reached
105*0e209d39SAndroid Build Coastguard Worker      */
106*0e209d39SAndroid Build Coastguard Worker     int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
107*0e209d39SAndroid Build Coastguard Worker     /**
108*0e209d39SAndroid Build Coastguard Worker      * Get the processed ordering priority of the previous collation element in the text.
109*0e209d39SAndroid Build Coastguard Worker      * A single character may contain more than one collation element.
110*0e209d39SAndroid Build Coastguard Worker      *
111*0e209d39SAndroid Build Coastguard Worker      * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
112*0e209d39SAndroid Build Coastguard Worker      * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
113*0e209d39SAndroid Build Coastguard Worker      * @param status A pointer to an UErrorCode to receive any errors. Notably
114*0e209d39SAndroid Build Coastguard Worker      *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
115*0e209d39SAndroid Build Coastguard Worker      *               buffer has been exhausted.
116*0e209d39SAndroid Build Coastguard Worker      * @return The previous collation elements ordering, otherwise returns
117*0e209d39SAndroid Build Coastguard Worker      *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
118*0e209d39SAndroid Build Coastguard Worker      *         string has been reached.
119*0e209d39SAndroid Build Coastguard Worker      */
120*0e209d39SAndroid Build Coastguard Worker     int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
121*0e209d39SAndroid Build Coastguard Worker 
122*0e209d39SAndroid Build Coastguard Worker private:
123*0e209d39SAndroid Build Coastguard Worker     void init(const Collator &coll);
124*0e209d39SAndroid Build Coastguard Worker     uint64_t processCE(uint32_t ce);
125*0e209d39SAndroid Build Coastguard Worker };
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
128*0e209d39SAndroid Build Coastguard Worker 
/* Element count of the fixed-size cesBuffer and pcesBuffer arrays in UPattern. */
#define INITIAL_ARRAY_SIZE_       256

131*0e209d39SAndroid Build Coastguard Worker struct USearch {
132*0e209d39SAndroid Build Coastguard Worker     // required since collation element iterator does not have a getText API
133*0e209d39SAndroid Build Coastguard Worker     const UChar              *text;
134*0e209d39SAndroid Build Coastguard Worker           int32_t             textLength; // exact length
135*0e209d39SAndroid Build Coastguard Worker           UBool               isOverlap;
136*0e209d39SAndroid Build Coastguard Worker           UBool               isCanonicalMatch;
137*0e209d39SAndroid Build Coastguard Worker           int16_t             elementComparisonType;
138*0e209d39SAndroid Build Coastguard Worker           UBreakIterator     *internalBreakIter;  // internal character breakiterator, lazily created.
139*0e209d39SAndroid Build Coastguard Worker           UBreakIterator     *breakIter;          // caller provided character breakiterator
140*0e209d39SAndroid Build Coastguard Worker     // value USEARCH_DONE is the default value
141*0e209d39SAndroid Build Coastguard Worker     // if we are not at the start of the text or the end of the text,
142*0e209d39SAndroid Build Coastguard Worker     // depending on the iteration direction and matchedIndex is USEARCH_DONE
143*0e209d39SAndroid Build Coastguard Worker     // it means that we can't find any more matches in that particular direction
144*0e209d39SAndroid Build Coastguard Worker           int32_t             matchedIndex;
145*0e209d39SAndroid Build Coastguard Worker           int32_t             matchedLength;
146*0e209d39SAndroid Build Coastguard Worker           UBool               isForwardSearching;
147*0e209d39SAndroid Build Coastguard Worker           UBool               reset;
148*0e209d39SAndroid Build Coastguard Worker };
149*0e209d39SAndroid Build Coastguard Worker 
150*0e209d39SAndroid Build Coastguard Worker struct UPattern {
151*0e209d39SAndroid Build Coastguard Worker     const UChar              *text;
152*0e209d39SAndroid Build Coastguard Worker           int32_t             textLength; // exact length
153*0e209d39SAndroid Build Coastguard Worker           // length required for backwards ce comparison
154*0e209d39SAndroid Build Coastguard Worker           int32_t             cesLength;
155*0e209d39SAndroid Build Coastguard Worker           int32_t            *ces;
156*0e209d39SAndroid Build Coastguard Worker           int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
157*0e209d39SAndroid Build Coastguard Worker           int32_t             pcesLength;
158*0e209d39SAndroid Build Coastguard Worker           int64_t            *pces;
159*0e209d39SAndroid Build Coastguard Worker           int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
160*0e209d39SAndroid Build Coastguard Worker           UBool               hasPrefixAccents;
161*0e209d39SAndroid Build Coastguard Worker           UBool               hasSuffixAccents;
162*0e209d39SAndroid Build Coastguard Worker };
163*0e209d39SAndroid Build Coastguard Worker 
164*0e209d39SAndroid Build Coastguard Worker struct UStringSearch {
165*0e209d39SAndroid Build Coastguard Worker     struct USearch            *search;
166*0e209d39SAndroid Build Coastguard Worker     struct UPattern            pattern;
167*0e209d39SAndroid Build Coastguard Worker     const  UCollator          *collator;
168*0e209d39SAndroid Build Coastguard Worker     const  icu::Normalizer2   *nfd;
169*0e209d39SAndroid Build Coastguard Worker     // positions within the collation element iterator is used to determine
170*0e209d39SAndroid Build Coastguard Worker     // if we are at the start of the text.
171*0e209d39SAndroid Build Coastguard Worker            UCollationElements *textIter;
172*0e209d39SAndroid Build Coastguard Worker            icu::UCollationPCE *textProcessedIter;
173*0e209d39SAndroid Build Coastguard Worker     // utility collation element, used throughout program for temporary
174*0e209d39SAndroid Build Coastguard Worker     // iteration.
175*0e209d39SAndroid Build Coastguard Worker            UCollationElements *utilIter;
176*0e209d39SAndroid Build Coastguard Worker            UBool               ownCollator;
177*0e209d39SAndroid Build Coastguard Worker            UCollationStrength  strength;
178*0e209d39SAndroid Build Coastguard Worker            uint32_t            ceMask;
179*0e209d39SAndroid Build Coastguard Worker            uint32_t            variableTop;
180*0e209d39SAndroid Build Coastguard Worker            UBool               toShift;
181*0e209d39SAndroid Build Coastguard Worker };
182*0e209d39SAndroid Build Coastguard Worker 
183*0e209d39SAndroid Build Coastguard Worker /**
184*0e209d39SAndroid Build Coastguard Worker * Exact matches without checking for the ends for extra accents.
185*0e209d39SAndroid Build Coastguard Worker * The match after the position within the collation element iterator is to be
186*0e209d39SAndroid Build Coastguard Worker * found.
187*0e209d39SAndroid Build Coastguard Worker * After a match is found the offset in the collation element iterator will be
188*0e209d39SAndroid Build Coastguard Worker * shifted to the start of the match.
189*0e209d39SAndroid Build Coastguard Worker * Implementation note:
190*0e209d39SAndroid Build Coastguard Worker * For tertiary we can't use the collator->tertiaryMask, that is a
191*0e209d39SAndroid Build Coastguard Worker * preprocessed mask that takes into account case options. since we are only
192*0e209d39SAndroid Build Coastguard Worker * concerned with exact matches, we don't need that.
193*0e209d39SAndroid Build Coastguard Worker * Alternate handling - since only the 16 most significant digits is only used,
194*0e209d39SAndroid Build Coastguard Worker * we can safely do a compare without masking if the ce is a variable, we mask
195*0e209d39SAndroid Build Coastguard Worker * and get only the primary values no shifting to quartenary is required since
196*0e209d39SAndroid Build Coastguard Worker * all primary values less than variabletop will need to be masked off anyway.
197*0e209d39SAndroid Build Coastguard Worker * If the end character is composite and the pattern ce does not match the text
198*0e209d39SAndroid Build Coastguard Worker * ce, we skip it until we find a match in the end composite character or when
199*0e209d39SAndroid Build Coastguard Worker * it has passed the character. This is so that we can match pattern "a" with
200*0e209d39SAndroid Build Coastguard Worker * the text "\u00e6"
201*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data
202*0e209d39SAndroid Build Coastguard Worker * @param status error status if any
203*0e209d39SAndroid Build Coastguard Worker * @return true if an exact match is found, false otherwise
204*0e209d39SAndroid Build Coastguard Worker */
205*0e209d39SAndroid Build Coastguard Worker U_CFUNC
206*0e209d39SAndroid Build Coastguard Worker UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
207*0e209d39SAndroid Build Coastguard Worker 
208*0e209d39SAndroid Build Coastguard Worker /**
209*0e209d39SAndroid Build Coastguard Worker * Canonical matches.
210*0e209d39SAndroid Build Coastguard Worker * According to the definition, matches found here will include the whole span
211*0e209d39SAndroid Build Coastguard Worker * of beginning and ending accents if it overlaps that region.
212*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data
213*0e209d39SAndroid Build Coastguard Worker * @param status error status if any
214*0e209d39SAndroid Build Coastguard Worker * @return true if a canonical match is found, false otherwise
215*0e209d39SAndroid Build Coastguard Worker */
216*0e209d39SAndroid Build Coastguard Worker U_CFUNC
217*0e209d39SAndroid Build Coastguard Worker UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
218*0e209d39SAndroid Build Coastguard Worker 
219*0e209d39SAndroid Build Coastguard Worker /**
220*0e209d39SAndroid Build Coastguard Worker * Gets the previous match.
221*0e209d39SAndroid Build Coastguard Worker * Comments follows from handleNextExact
222*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data
223*0e209d39SAndroid Build Coastguard Worker * @param status error status if any
224*0e209d39SAndroid Build Coastguard Worker * @return True if a exact math is found, false otherwise.
225*0e209d39SAndroid Build Coastguard Worker */
226*0e209d39SAndroid Build Coastguard Worker U_CFUNC
227*0e209d39SAndroid Build Coastguard Worker UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
228*0e209d39SAndroid Build Coastguard Worker 
229*0e209d39SAndroid Build Coastguard Worker /**
230*0e209d39SAndroid Build Coastguard Worker * Canonical matches.
231*0e209d39SAndroid Build Coastguard Worker * According to the definition, matches found here will include the whole span
232*0e209d39SAndroid Build Coastguard Worker * of beginning and ending accents if it overlaps that region.
233*0e209d39SAndroid Build Coastguard Worker * @param strsrch string search data
234*0e209d39SAndroid Build Coastguard Worker * @param status error status if any
235*0e209d39SAndroid Build Coastguard Worker * @return true if a canonical match is found, false otherwise
236*0e209d39SAndroid Build Coastguard Worker */
237*0e209d39SAndroid Build Coastguard Worker U_CFUNC
238*0e209d39SAndroid Build Coastguard Worker UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
239*0e209d39SAndroid Build Coastguard Worker                                       UErrorCode    *status);
240*0e209d39SAndroid Build Coastguard Worker 
241*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_COLLATION */
242*0e209d39SAndroid Build Coastguard Worker 
243*0e209d39SAndroid Build Coastguard Worker #endif
244