xref: /aosp_15_r20/external/icu/libicu/cts_headers/utf8collationiterator.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2016, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf8collationiterator.h
*
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
* created by: Markus W. Scherer
*/
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #ifndef __UTF8COLLATIONITERATOR_H__
15*0e209d39SAndroid Build Coastguard Worker #define __UTF8COLLATIONITERATOR_H__
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_COLLATION
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #include "cmemory.h"
22*0e209d39SAndroid Build Coastguard Worker #include "collation.h"
23*0e209d39SAndroid Build Coastguard Worker #include "collationdata.h"
24*0e209d39SAndroid Build Coastguard Worker #include "collationiterator.h"
25*0e209d39SAndroid Build Coastguard Worker #include "normalizer2impl.h"
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
28*0e209d39SAndroid Build Coastguard Worker 
29*0e209d39SAndroid Build Coastguard Worker /**
30*0e209d39SAndroid Build Coastguard Worker  * UTF-8 collation element and character iterator.
31*0e209d39SAndroid Build Coastguard Worker  * Handles normalized UTF-8 text inline, with length or NUL-terminated.
32*0e209d39SAndroid Build Coastguard Worker  * Unnormalized text is handled by a subclass.
33*0e209d39SAndroid Build Coastguard Worker  */
34*0e209d39SAndroid Build Coastguard Worker class U_I18N_API UTF8CollationIterator : public CollationIterator {
35*0e209d39SAndroid Build Coastguard Worker public:
UTF8CollationIterator(const CollationData * d,UBool numeric,const uint8_t * s,int32_t p,int32_t len)36*0e209d39SAndroid Build Coastguard Worker     UTF8CollationIterator(const CollationData *d, UBool numeric,
37*0e209d39SAndroid Build Coastguard Worker                           const uint8_t *s, int32_t p, int32_t len)
38*0e209d39SAndroid Build Coastguard Worker             : CollationIterator(d, numeric),
39*0e209d39SAndroid Build Coastguard Worker               u8(s), pos(p), length(len) {}
40*0e209d39SAndroid Build Coastguard Worker 
41*0e209d39SAndroid Build Coastguard Worker     virtual ~UTF8CollationIterator();
42*0e209d39SAndroid Build Coastguard Worker 
43*0e209d39SAndroid Build Coastguard Worker     virtual void resetToOffset(int32_t newOffset) override;
44*0e209d39SAndroid Build Coastguard Worker 
45*0e209d39SAndroid Build Coastguard Worker     virtual int32_t getOffset() const override;
46*0e209d39SAndroid Build Coastguard Worker 
47*0e209d39SAndroid Build Coastguard Worker     virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
48*0e209d39SAndroid Build Coastguard Worker 
49*0e209d39SAndroid Build Coastguard Worker     virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
50*0e209d39SAndroid Build Coastguard Worker 
51*0e209d39SAndroid Build Coastguard Worker protected:
52*0e209d39SAndroid Build Coastguard Worker     /**
53*0e209d39SAndroid Build Coastguard Worker      * For byte sequences that are illegal in UTF-8, an error value may be returned
54*0e209d39SAndroid Build Coastguard Worker      * together with a bogus code point. The caller will ignore that code point.
55*0e209d39SAndroid Build Coastguard Worker      *
56*0e209d39SAndroid Build Coastguard Worker      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57*0e209d39SAndroid Build Coastguard Worker      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
58*0e209d39SAndroid Build Coastguard Worker      *
59*0e209d39SAndroid Build Coastguard Worker      * Valid lead surrogates are returned from inside a normalized text segment,
60*0e209d39SAndroid Build Coastguard Worker      * where handleGetTrailSurrogate() will return the matching trail surrogate.
61*0e209d39SAndroid Build Coastguard Worker      */
62*0e209d39SAndroid Build Coastguard Worker     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
63*0e209d39SAndroid Build Coastguard Worker 
64*0e209d39SAndroid Build Coastguard Worker     virtual UBool foundNULTerminator() override;
65*0e209d39SAndroid Build Coastguard Worker 
66*0e209d39SAndroid Build Coastguard Worker     virtual UBool forbidSurrogateCodePoints() const override;
67*0e209d39SAndroid Build Coastguard Worker 
68*0e209d39SAndroid Build Coastguard Worker     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
69*0e209d39SAndroid Build Coastguard Worker 
70*0e209d39SAndroid Build Coastguard Worker     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
71*0e209d39SAndroid Build Coastguard Worker 
72*0e209d39SAndroid Build Coastguard Worker     const uint8_t *u8;
73*0e209d39SAndroid Build Coastguard Worker     int32_t pos;
74*0e209d39SAndroid Build Coastguard Worker     int32_t length;  // <0 for NUL-terminated strings
75*0e209d39SAndroid Build Coastguard Worker };
76*0e209d39SAndroid Build Coastguard Worker 
77*0e209d39SAndroid Build Coastguard Worker /**
78*0e209d39SAndroid Build Coastguard Worker  * Incrementally checks the input text for FCD and normalizes where necessary.
79*0e209d39SAndroid Build Coastguard Worker  */
80*0e209d39SAndroid Build Coastguard Worker class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81*0e209d39SAndroid Build Coastguard Worker public:
FCDUTF8CollationIterator(const CollationData * data,UBool numeric,const uint8_t * s,int32_t p,int32_t len)82*0e209d39SAndroid Build Coastguard Worker     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83*0e209d39SAndroid Build Coastguard Worker                              const uint8_t *s, int32_t p, int32_t len)
84*0e209d39SAndroid Build Coastguard Worker             : UTF8CollationIterator(data, numeric, s, p, len),
85*0e209d39SAndroid Build Coastguard Worker               state(CHECK_FWD), start(p),
86*0e209d39SAndroid Build Coastguard Worker               nfcImpl(data->nfcImpl) {}
87*0e209d39SAndroid Build Coastguard Worker 
88*0e209d39SAndroid Build Coastguard Worker     virtual ~FCDUTF8CollationIterator();
89*0e209d39SAndroid Build Coastguard Worker 
90*0e209d39SAndroid Build Coastguard Worker     virtual void resetToOffset(int32_t newOffset) override;
91*0e209d39SAndroid Build Coastguard Worker 
92*0e209d39SAndroid Build Coastguard Worker     virtual int32_t getOffset() const override;
93*0e209d39SAndroid Build Coastguard Worker 
94*0e209d39SAndroid Build Coastguard Worker     virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
95*0e209d39SAndroid Build Coastguard Worker 
96*0e209d39SAndroid Build Coastguard Worker     virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
97*0e209d39SAndroid Build Coastguard Worker 
98*0e209d39SAndroid Build Coastguard Worker protected:
99*0e209d39SAndroid Build Coastguard Worker     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
100*0e209d39SAndroid Build Coastguard Worker 
101*0e209d39SAndroid Build Coastguard Worker     virtual char16_t handleGetTrailSurrogate() override;
102*0e209d39SAndroid Build Coastguard Worker 
103*0e209d39SAndroid Build Coastguard Worker     virtual UBool foundNULTerminator() override;
104*0e209d39SAndroid Build Coastguard Worker 
105*0e209d39SAndroid Build Coastguard Worker     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
106*0e209d39SAndroid Build Coastguard Worker 
107*0e209d39SAndroid Build Coastguard Worker     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
108*0e209d39SAndroid Build Coastguard Worker 
109*0e209d39SAndroid Build Coastguard Worker private:
110*0e209d39SAndroid Build Coastguard Worker     UBool nextHasLccc() const;
111*0e209d39SAndroid Build Coastguard Worker     UBool previousHasTccc() const;
112*0e209d39SAndroid Build Coastguard Worker 
113*0e209d39SAndroid Build Coastguard Worker     /**
114*0e209d39SAndroid Build Coastguard Worker      * Switches to forward checking if possible.
115*0e209d39SAndroid Build Coastguard Worker      */
116*0e209d39SAndroid Build Coastguard Worker     void switchToForward();
117*0e209d39SAndroid Build Coastguard Worker 
118*0e209d39SAndroid Build Coastguard Worker     /**
119*0e209d39SAndroid Build Coastguard Worker      * Extends the FCD text segment forward or normalizes around pos.
120*0e209d39SAndroid Build Coastguard Worker      * @return true if success
121*0e209d39SAndroid Build Coastguard Worker      */
122*0e209d39SAndroid Build Coastguard Worker     UBool nextSegment(UErrorCode &errorCode);
123*0e209d39SAndroid Build Coastguard Worker 
124*0e209d39SAndroid Build Coastguard Worker     /**
125*0e209d39SAndroid Build Coastguard Worker      * Switches to backward checking.
126*0e209d39SAndroid Build Coastguard Worker      */
127*0e209d39SAndroid Build Coastguard Worker     void switchToBackward();
128*0e209d39SAndroid Build Coastguard Worker 
129*0e209d39SAndroid Build Coastguard Worker     /**
130*0e209d39SAndroid Build Coastguard Worker      * Extends the FCD text segment backward or normalizes around pos.
131*0e209d39SAndroid Build Coastguard Worker      * @return true if success
132*0e209d39SAndroid Build Coastguard Worker      */
133*0e209d39SAndroid Build Coastguard Worker     UBool previousSegment(UErrorCode &errorCode);
134*0e209d39SAndroid Build Coastguard Worker 
135*0e209d39SAndroid Build Coastguard Worker     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker     enum State {
138*0e209d39SAndroid Build Coastguard Worker         /**
139*0e209d39SAndroid Build Coastguard Worker          * The input text [start..pos[ passes the FCD check.
140*0e209d39SAndroid Build Coastguard Worker          * Moving forward checks incrementally.
141*0e209d39SAndroid Build Coastguard Worker          * limit is undefined.
142*0e209d39SAndroid Build Coastguard Worker          */
143*0e209d39SAndroid Build Coastguard Worker         CHECK_FWD,
144*0e209d39SAndroid Build Coastguard Worker         /**
145*0e209d39SAndroid Build Coastguard Worker          * The input text [pos..limit[ passes the FCD check.
146*0e209d39SAndroid Build Coastguard Worker          * Moving backward checks incrementally.
147*0e209d39SAndroid Build Coastguard Worker          * start is undefined.
148*0e209d39SAndroid Build Coastguard Worker          */
149*0e209d39SAndroid Build Coastguard Worker         CHECK_BWD,
150*0e209d39SAndroid Build Coastguard Worker         /**
151*0e209d39SAndroid Build Coastguard Worker          * The input text [start..limit[ passes the FCD check.
152*0e209d39SAndroid Build Coastguard Worker          * pos tracks the current text index.
153*0e209d39SAndroid Build Coastguard Worker          */
154*0e209d39SAndroid Build Coastguard Worker         IN_FCD_SEGMENT,
155*0e209d39SAndroid Build Coastguard Worker         /**
156*0e209d39SAndroid Build Coastguard Worker          * The input text [start..limit[ failed the FCD check and was normalized.
157*0e209d39SAndroid Build Coastguard Worker          * pos tracks the current index in the normalized string.
158*0e209d39SAndroid Build Coastguard Worker          */
159*0e209d39SAndroid Build Coastguard Worker         IN_NORMALIZED
160*0e209d39SAndroid Build Coastguard Worker     };
161*0e209d39SAndroid Build Coastguard Worker 
162*0e209d39SAndroid Build Coastguard Worker     State state;
163*0e209d39SAndroid Build Coastguard Worker 
164*0e209d39SAndroid Build Coastguard Worker     int32_t start;
165*0e209d39SAndroid Build Coastguard Worker     int32_t limit;
166*0e209d39SAndroid Build Coastguard Worker 
167*0e209d39SAndroid Build Coastguard Worker     const Normalizer2Impl &nfcImpl;
168*0e209d39SAndroid Build Coastguard Worker     UnicodeString normalized;
169*0e209d39SAndroid Build Coastguard Worker };
170*0e209d39SAndroid Build Coastguard Worker 
171*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
172*0e209d39SAndroid Build Coastguard Worker 
173*0e209d39SAndroid Build Coastguard Worker #endif  // !UCONFIG_NO_COLLATION
174*0e209d39SAndroid Build Coastguard Worker #endif  // __UTF8COLLATIONITERATOR_H__
175