xref: /aosp_15_r20/external/icu/libicu/cts_headers/bmpset.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
5*0e209d39SAndroid Build Coastguard Worker *
6*0e209d39SAndroid Build Coastguard Worker *   Copyright (C) 2007, International Business Machines
7*0e209d39SAndroid Build Coastguard Worker *   Corporation and others.  All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker *
9*0e209d39SAndroid Build Coastguard Worker ******************************************************************************
10*0e209d39SAndroid Build Coastguard Worker *   file name:  bmpset.h
11*0e209d39SAndroid Build Coastguard Worker *   encoding:   UTF-8
12*0e209d39SAndroid Build Coastguard Worker *   tab size:   8 (not used)
13*0e209d39SAndroid Build Coastguard Worker *   indentation:4
14*0e209d39SAndroid Build Coastguard Worker *
15*0e209d39SAndroid Build Coastguard Worker *   created on: 2007jan29
16*0e209d39SAndroid Build Coastguard Worker *   created by: Markus W. Scherer
17*0e209d39SAndroid Build Coastguard Worker */
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #ifndef __BMPSET_H__
20*0e209d39SAndroid Build Coastguard Worker #define __BMPSET_H__
21*0e209d39SAndroid Build Coastguard Worker 
22*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
23*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
24*0e209d39SAndroid Build Coastguard Worker 
25*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker /*
28*0e209d39SAndroid Build Coastguard Worker  * Helper class for frozen UnicodeSets, implements contains() and span()
29*0e209d39SAndroid Build Coastguard Worker  * optimized for BMP code points. Structured to be UTF-8-friendly.
30*0e209d39SAndroid Build Coastguard Worker  *
31*0e209d39SAndroid Build Coastguard Worker  * Latin-1: Look up bytes.
32*0e209d39SAndroid Build Coastguard Worker  * 2-byte characters: Bits organized vertically.
33*0e209d39SAndroid Build Coastguard Worker  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
34*0e209d39SAndroid Build Coastguard Worker  *                    with mixed for illegal ranges.
35*0e209d39SAndroid Build Coastguard Worker  * Supplementary characters: Binary search over
36*0e209d39SAndroid Build Coastguard Worker  * the supplementary part of the parent set's inversion list.
37*0e209d39SAndroid Build Coastguard Worker  */
38*0e209d39SAndroid Build Coastguard Worker class BMPSet : public UMemory {
39*0e209d39SAndroid Build Coastguard Worker public:
40*0e209d39SAndroid Build Coastguard Worker     BMPSet(const int32_t *parentList, int32_t parentListLength);
41*0e209d39SAndroid Build Coastguard Worker     BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
42*0e209d39SAndroid Build Coastguard Worker     virtual ~BMPSet();
43*0e209d39SAndroid Build Coastguard Worker 
44*0e209d39SAndroid Build Coastguard Worker     virtual UBool contains(UChar32 c) const;
45*0e209d39SAndroid Build Coastguard Worker 
46*0e209d39SAndroid Build Coastguard Worker     /*
47*0e209d39SAndroid Build Coastguard Worker      * Span the initial substring for which each character c has spanCondition==contains(c).
48*0e209d39SAndroid Build Coastguard Worker      * It must be s<limit and spanCondition==0 or 1.
49*0e209d39SAndroid Build Coastguard Worker      * @return The string pointer which limits the span.
50*0e209d39SAndroid Build Coastguard Worker      */
51*0e209d39SAndroid Build Coastguard Worker     const char16_t *span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
52*0e209d39SAndroid Build Coastguard Worker 
53*0e209d39SAndroid Build Coastguard Worker     /*
54*0e209d39SAndroid Build Coastguard Worker      * Span the trailing substring for which each character c has spanCondition==contains(c).
55*0e209d39SAndroid Build Coastguard Worker      * It must be s<limit and spanCondition==0 or 1.
56*0e209d39SAndroid Build Coastguard Worker      * @return The string pointer which starts the span.
57*0e209d39SAndroid Build Coastguard Worker      */
58*0e209d39SAndroid Build Coastguard Worker     const char16_t *spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const;
59*0e209d39SAndroid Build Coastguard Worker 
60*0e209d39SAndroid Build Coastguard Worker     /*
61*0e209d39SAndroid Build Coastguard Worker      * Span the initial substring for which each character c has spanCondition==contains(c).
62*0e209d39SAndroid Build Coastguard Worker      * It must be length>0 and spanCondition==0 or 1.
63*0e209d39SAndroid Build Coastguard Worker      * @return The string pointer which limits the span.
64*0e209d39SAndroid Build Coastguard Worker      */
65*0e209d39SAndroid Build Coastguard Worker     const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
66*0e209d39SAndroid Build Coastguard Worker 
67*0e209d39SAndroid Build Coastguard Worker     /*
68*0e209d39SAndroid Build Coastguard Worker      * Span the trailing substring for which each character c has spanCondition==contains(c).
69*0e209d39SAndroid Build Coastguard Worker      * It must be length>0 and spanCondition==0 or 1.
70*0e209d39SAndroid Build Coastguard Worker      * @return The start of the span.
71*0e209d39SAndroid Build Coastguard Worker      */
72*0e209d39SAndroid Build Coastguard Worker     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
73*0e209d39SAndroid Build Coastguard Worker 
74*0e209d39SAndroid Build Coastguard Worker private:
75*0e209d39SAndroid Build Coastguard Worker     void initBits();
76*0e209d39SAndroid Build Coastguard Worker     void overrideIllegal();
77*0e209d39SAndroid Build Coastguard Worker 
78*0e209d39SAndroid Build Coastguard Worker     /**
79*0e209d39SAndroid Build Coastguard Worker      * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
80*0e209d39SAndroid Build Coastguard Worker      * binary search is restricted for finding code points in a certain range.
81*0e209d39SAndroid Build Coastguard Worker      *
82*0e209d39SAndroid Build Coastguard Worker      * For restricting the search for finding in the range start..end,
83*0e209d39SAndroid Build Coastguard Worker      * pass in
84*0e209d39SAndroid Build Coastguard Worker      *   lo=findCodePoint(start) and
85*0e209d39SAndroid Build Coastguard Worker      *   hi=findCodePoint(end)
86*0e209d39SAndroid Build Coastguard Worker      * with 0<=lo<=hi<len.
87*0e209d39SAndroid Build Coastguard Worker      * findCodePoint(c) defaults to lo=0 and hi=len-1.
88*0e209d39SAndroid Build Coastguard Worker      *
89*0e209d39SAndroid Build Coastguard Worker      * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
90*0e209d39SAndroid Build Coastguard Worker      * @param lo The lowest index to be returned.
91*0e209d39SAndroid Build Coastguard Worker      * @param hi The highest index to be returned.
92*0e209d39SAndroid Build Coastguard Worker      * @return the smallest integer i in the range lo..hi,
93*0e209d39SAndroid Build Coastguard Worker      *         inclusive, such that c < list[i]
94*0e209d39SAndroid Build Coastguard Worker      */
95*0e209d39SAndroid Build Coastguard Worker     int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
96*0e209d39SAndroid Build Coastguard Worker 
97*0e209d39SAndroid Build Coastguard Worker     inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
98*0e209d39SAndroid Build Coastguard Worker 
99*0e209d39SAndroid Build Coastguard Worker     /*
100*0e209d39SAndroid Build Coastguard Worker      * One byte 0 or 1 per Latin-1 character.
101*0e209d39SAndroid Build Coastguard Worker      */
102*0e209d39SAndroid Build Coastguard Worker     UBool latin1Contains[0x100];
103*0e209d39SAndroid Build Coastguard Worker 
104*0e209d39SAndroid Build Coastguard Worker     /* true if contains(U+FFFD). */
105*0e209d39SAndroid Build Coastguard Worker     UBool containsFFFD;
106*0e209d39SAndroid Build Coastguard Worker 
107*0e209d39SAndroid Build Coastguard Worker     /*
108*0e209d39SAndroid Build Coastguard Worker      * One bit per code point from U+0000..U+07FF.
109*0e209d39SAndroid Build Coastguard Worker      * The bits are organized vertically; consecutive code points
110*0e209d39SAndroid Build Coastguard Worker      * correspond to the same bit positions in consecutive table words.
111*0e209d39SAndroid Build Coastguard Worker      * With code point parts
112*0e209d39SAndroid Build Coastguard Worker      *   lead=c{10..6}
113*0e209d39SAndroid Build Coastguard Worker      *   trail=c{5..0}
114*0e209d39SAndroid Build Coastguard Worker      * it is set.contains(c)==(table7FF[trail] bit lead)
115*0e209d39SAndroid Build Coastguard Worker      *
116*0e209d39SAndroid Build Coastguard Worker      * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
117*0e209d39SAndroid Build Coastguard Worker      * for faster validity checking at runtime.
118*0e209d39SAndroid Build Coastguard Worker      */
119*0e209d39SAndroid Build Coastguard Worker     uint32_t table7FF[64];
120*0e209d39SAndroid Build Coastguard Worker 
121*0e209d39SAndroid Build Coastguard Worker     /*
122*0e209d39SAndroid Build Coastguard Worker      * One bit per 64 BMP code points.
123*0e209d39SAndroid Build Coastguard Worker      * The bits are organized vertically; consecutive 64-code point blocks
124*0e209d39SAndroid Build Coastguard Worker      * correspond to the same bit position in consecutive table words.
125*0e209d39SAndroid Build Coastguard Worker      * With code point parts
126*0e209d39SAndroid Build Coastguard Worker      *   lead=c{15..12}
127*0e209d39SAndroid Build Coastguard Worker      *   t1=c{11..6}
128*0e209d39SAndroid Build Coastguard Worker      * test bits (lead+16) and lead in bmpBlockBits[t1].
129*0e209d39SAndroid Build Coastguard Worker      * If the upper bit is 0, then the lower bit indicates if contains(c)
130*0e209d39SAndroid Build Coastguard Worker      * for all code points in the 64-block.
131*0e209d39SAndroid Build Coastguard Worker      * If the upper bit is 1, then the block is mixed and set.contains(c)
132*0e209d39SAndroid Build Coastguard Worker      * must be called.
133*0e209d39SAndroid Build Coastguard Worker      *
134*0e209d39SAndroid Build Coastguard Worker      * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
135*0e209d39SAndroid Build Coastguard Worker      * the result of contains(FFFD) for faster validity checking at runtime.
136*0e209d39SAndroid Build Coastguard Worker      */
137*0e209d39SAndroid Build Coastguard Worker     uint32_t bmpBlockBits[64];
138*0e209d39SAndroid Build Coastguard Worker 
139*0e209d39SAndroid Build Coastguard Worker     /*
140*0e209d39SAndroid Build Coastguard Worker      * Inversion list indexes for restricted binary searches in
141*0e209d39SAndroid Build Coastguard Worker      * findCodePoint(), from
142*0e209d39SAndroid Build Coastguard Worker      * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
143*0e209d39SAndroid Build Coastguard Worker      * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
144*0e209d39SAndroid Build Coastguard Worker      * always looked up in the bit tables.
145*0e209d39SAndroid Build Coastguard Worker      * The last pair of indexes is for finding supplementary code points.
146*0e209d39SAndroid Build Coastguard Worker      */
147*0e209d39SAndroid Build Coastguard Worker     int32_t list4kStarts[18];
148*0e209d39SAndroid Build Coastguard Worker 
149*0e209d39SAndroid Build Coastguard Worker     /*
150*0e209d39SAndroid Build Coastguard Worker      * The inversion list of the parent set, for the slower contains() implementation
151*0e209d39SAndroid Build Coastguard Worker      * for mixed BMP blocks and for supplementary code points.
152*0e209d39SAndroid Build Coastguard Worker      * The list is terminated with list[listLength-1]=0x110000.
153*0e209d39SAndroid Build Coastguard Worker      */
154*0e209d39SAndroid Build Coastguard Worker     const int32_t *list;
155*0e209d39SAndroid Build Coastguard Worker     int32_t listLength;
156*0e209d39SAndroid Build Coastguard Worker };
157*0e209d39SAndroid Build Coastguard Worker 
containsSlow(UChar32 c,int32_t lo,int32_t hi)158*0e209d39SAndroid Build Coastguard Worker inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
159*0e209d39SAndroid Build Coastguard Worker     return (UBool)(findCodePoint(c, lo, hi) & 1);
160*0e209d39SAndroid Build Coastguard Worker }
161*0e209d39SAndroid Build Coastguard Worker 
162*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
163*0e209d39SAndroid Build Coastguard Worker 
164*0e209d39SAndroid Build Coastguard Worker #endif
165