xref: /aosp_15_r20/external/icu/libicu/cts_headers/rbbisetb.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker //
4*0e209d39SAndroid Build Coastguard Worker //  rbbisetb.h
5*0e209d39SAndroid Build Coastguard Worker /*
6*0e209d39SAndroid Build Coastguard Worker **********************************************************************
7*0e209d39SAndroid Build Coastguard Worker *   Copyright (c) 2001-2005, International Business Machines
8*0e209d39SAndroid Build Coastguard Worker *   Corporation and others.  All Rights Reserved.
9*0e209d39SAndroid Build Coastguard Worker **********************************************************************
10*0e209d39SAndroid Build Coastguard Worker */
11*0e209d39SAndroid Build Coastguard Worker 
12*0e209d39SAndroid Build Coastguard Worker #ifndef RBBISETB_H
13*0e209d39SAndroid Build Coastguard Worker #define RBBISETB_H
14*0e209d39SAndroid Build Coastguard Worker 
15*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/ucptrie.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/umutablecptrie.h"
21*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h"
22*0e209d39SAndroid Build Coastguard Worker #include "rbbirb.h"
23*0e209d39SAndroid Build Coastguard Worker #include "uvector.h"
24*0e209d39SAndroid Build Coastguard Worker 
25*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
26*0e209d39SAndroid Build Coastguard Worker 
27*0e209d39SAndroid Build Coastguard Worker //
28*0e209d39SAndroid Build Coastguard Worker //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
29*0e209d39SAndroid Build Coastguard Worker //                   from the Unicode Sets appearing in the source  RBBI rules, and
30*0e209d39SAndroid Build Coastguard Worker //                   creates the TRIE table used to map from Unicode to the
31*0e209d39SAndroid Build Coastguard Worker //                   character categories.
32*0e209d39SAndroid Build Coastguard Worker //
33*0e209d39SAndroid Build Coastguard Worker 
34*0e209d39SAndroid Build Coastguard Worker 
35*0e209d39SAndroid Build Coastguard Worker //
36*0e209d39SAndroid Build Coastguard Worker //  RangeDescriptor
37*0e209d39SAndroid Build Coastguard Worker //
38*0e209d39SAndroid Build Coastguard Worker //     Each of the non-overlapping character ranges gets one of these descriptors.
39*0e209d39SAndroid Build Coastguard Worker //     All of them are strung together in a linked list, which is kept in order
40*0e209d39SAndroid Build Coastguard Worker //     (by character)
41*0e209d39SAndroid Build Coastguard Worker //
42*0e209d39SAndroid Build Coastguard Worker class RangeDescriptor : public UMemory {
43*0e209d39SAndroid Build Coastguard Worker public:
44*0e209d39SAndroid Build Coastguard Worker     UChar32            fStartChar {};            // Start of range, unicode 32 bit value.
45*0e209d39SAndroid Build Coastguard Worker     UChar32            fEndChar {};              // End of range, unicode 32 bit value.
46*0e209d39SAndroid Build Coastguard Worker     int32_t            fNum {0};                 // runtime-mapped input value for this range.
47*0e209d39SAndroid Build Coastguard Worker     bool               fIncludesDict {false};    // True if the range includes $dictionary.
48*0e209d39SAndroid Build Coastguard Worker     bool               fFirstInGroup {false};    // True if first range in a group with the same fNum.
49*0e209d39SAndroid Build Coastguard Worker     UVector           *fIncludesSets {nullptr};  // vector of the original
50*0e209d39SAndroid Build Coastguard Worker                                                  //   Unicode sets that include this range.
51*0e209d39SAndroid Build Coastguard Worker                                                  //    (Contains ptrs to uset nodes)
52*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor   *fNext {nullptr};          // Next RangeDescriptor in the linked list.
53*0e209d39SAndroid Build Coastguard Worker 
54*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor(UErrorCode &status);   // status: ICU in/out error code. NOTE(review): presumably allocates fIncludesSets - confirm in the .cpp.
55*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // Copy-style constructor taking an error code; the plain copy constructor is deleted below.
56*0e209d39SAndroid Build Coastguard Worker     ~RangeDescriptor();                 // NOTE(review): ownership of fIncludesSets / fNext is not visible in this header - confirm.
57*0e209d39SAndroid Build Coastguard Worker     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
58*0e209d39SAndroid Build Coastguard Worker                                         //   where appearing in the second (higher) part.
59*0e209d39SAndroid Build Coastguard Worker     bool isDictionaryRange();           // Check whether this range appears as part of
60*0e209d39SAndroid Build Coastguard Worker                                         //   the Unicode set named "dictionary"
61*0e209d39SAndroid Build Coastguard Worker 
62*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
63*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
64*0e209d39SAndroid Build Coastguard Worker };
65*0e209d39SAndroid Build Coastguard Worker 
66*0e209d39SAndroid Build Coastguard Worker 
67*0e209d39SAndroid Build Coastguard Worker //
68*0e209d39SAndroid Build Coastguard Worker //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
69*0e209d39SAndroid Build Coastguard Worker //
70*0e209d39SAndroid Build Coastguard Worker //      Starting with the rules parse tree from the scanner,
71*0e209d39SAndroid Build Coastguard Worker //
72*0e209d39SAndroid Build Coastguard Worker //                   -  Enumerate the set of UnicodeSets that are referenced
73*0e209d39SAndroid Build Coastguard Worker //                      by the RBBI rules.
74*0e209d39SAndroid Build Coastguard Worker //                   -  compute a derived set of non-overlapping UnicodeSets
75*0e209d39SAndroid Build Coastguard Worker //                      that will correspond to columns in the state table for
76*0e209d39SAndroid Build Coastguard Worker //                      the RBBI execution engine.
77*0e209d39SAndroid Build Coastguard Worker //                   -  construct the trie table that maps input characters
78*0e209d39SAndroid Build Coastguard Worker //                      to set numbers in the non-overlapping set of sets.
79*0e209d39SAndroid Build Coastguard Worker //
80*0e209d39SAndroid Build Coastguard Worker 
81*0e209d39SAndroid Build Coastguard Worker 
82*0e209d39SAndroid Build Coastguard Worker class RBBISetBuilder : public UMemory {
83*0e209d39SAndroid Build Coastguard Worker public:
84*0e209d39SAndroid Build Coastguard Worker     RBBISetBuilder(RBBIRuleBuilder *rb);     // NOTE(review): rb is presumably stored in fRB - confirm in the .cpp.
85*0e209d39SAndroid Build Coastguard Worker     ~RBBISetBuilder();
86*0e209d39SAndroid Build Coastguard Worker 
87*0e209d39SAndroid Build Coastguard Worker     void     buildRanges();                  // NOTE(review): presumably computes the non-overlapping ranges kept in fRangeList - confirm.
88*0e209d39SAndroid Build Coastguard Worker     void     buildTrie();                    // NOTE(review): presumably builds the trie mapping input characters to category numbers - confirm.
89*0e209d39SAndroid Build Coastguard Worker     void     addValToSets(UVector *sets,      uint32_t val);   // NOTE(review): semantics not visible in this header; see the .cpp.
90*0e209d39SAndroid Build Coastguard Worker     void     addValToSet (RBBINode *usetNode, uint32_t val);   // NOTE(review): semantics not visible in this header; see the .cpp.
91*0e209d39SAndroid Build Coastguard Worker     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
92*0e209d39SAndroid Build Coastguard Worker                                              //    runtime state machine, which are the same as
93*0e209d39SAndroid Build Coastguard Worker                                              //    columns in the DFA state table
94*0e209d39SAndroid Build Coastguard Worker     int32_t  getDictCategoriesStart() const; // First char category that includes $dictionary, or
95*0e209d39SAndroid Build Coastguard Worker                                              // last category + 1 if there are no dictionary categories.
96*0e209d39SAndroid Build Coastguard Worker     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie. Deliberately declared non-const.
97*0e209d39SAndroid Build Coastguard Worker     void     serializeTrie(uint8_t *where);  // write out the serialized Trie. NOTE(review): presumably needs getTrieSize() bytes at where - confirm.
98*0e209d39SAndroid Build Coastguard Worker     UChar32  getFirstChar(int32_t  val) const;   // NOTE(review): presumably the first code point whose category is val - confirm.
99*0e209d39SAndroid Build Coastguard Worker     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
100*0e209d39SAndroid Build Coastguard Worker                                              //   character were encountered.
101*0e209d39SAndroid Build Coastguard Worker     /**
102*0e209d39SAndroid Build Coastguard Worker      * Merge two character categories that have been identified as having equivalent behavior.
103*0e209d39SAndroid Build Coastguard Worker      * The ranges belonging to the second category (table column) will be added to the first.
104*0e209d39SAndroid Build Coastguard Worker      * @param categories the pair of categories to be merged.
105*0e209d39SAndroid Build Coastguard Worker      */
106*0e209d39SAndroid Build Coastguard Worker     void     mergeCategories(IntPair categories);
107*0e209d39SAndroid Build Coastguard Worker 
108*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG
109*0e209d39SAndroid Build Coastguard Worker     void     printSets();
110*0e209d39SAndroid Build Coastguard Worker     void     printRanges();
111*0e209d39SAndroid Build Coastguard Worker     void     printRangeGroups();
112*0e209d39SAndroid Build Coastguard Worker #else  // RBBI_DEBUG not defined: the print calls below expand to nothing.
113*0e209d39SAndroid Build Coastguard Worker     #define printSets()
114*0e209d39SAndroid Build Coastguard Worker     #define printRanges()
115*0e209d39SAndroid Build Coastguard Worker     #define printRangeGroups()
116*0e209d39SAndroid Build Coastguard Worker #endif  // RBBI_DEBUG
117*0e209d39SAndroid Build Coastguard Worker 
118*0e209d39SAndroid Build Coastguard Worker private:
119*0e209d39SAndroid Build Coastguard Worker     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
120*0e209d39SAndroid Build Coastguard Worker     UErrorCode            *fStatus;         // NOTE(review): presumably points at the rule builder's error status - confirm.
121*0e209d39SAndroid Build Coastguard Worker 
122*0e209d39SAndroid Build Coastguard Worker     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
123*0e209d39SAndroid Build Coastguard Worker 
124*0e209d39SAndroid Build Coastguard Worker     UMutableCPTrie        *fMutableTrie;    // The mapping TRIE that is the end result of processing
125*0e209d39SAndroid Build Coastguard Worker     UCPTrie               *fTrie;           //  the Unicode Sets.
126*0e209d39SAndroid Build Coastguard Worker     uint32_t               fTrieSize;       // NOTE(review): presumably the serialized trie size reported by getTrieSize() - confirm.
127*0e209d39SAndroid Build Coastguard Worker 
128*0e209d39SAndroid Build Coastguard Worker     // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
129*0e209d39SAndroid Build Coastguard Worker     int32_t               fGroupCount;
130*0e209d39SAndroid Build Coastguard Worker 
131*0e209d39SAndroid Build Coastguard Worker     // The number of the first dictionary char category.
132*0e209d39SAndroid Build Coastguard Worker     // If there are no Dictionary categories, set to the last category + 1.
133*0e209d39SAndroid Build Coastguard Worker     int32_t               fDictCategoriesStart;
134*0e209d39SAndroid Build Coastguard Worker 
135*0e209d39SAndroid Build Coastguard Worker     UBool                 fSawBOF;          // NOTE(review): presumably the flag returned by sawBOF() - confirm.
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker     RBBISetBuilder(const RBBISetBuilder &other) = delete; // forbid copying of this class
138*0e209d39SAndroid Build Coastguard Worker     RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // forbid copying of this class
139*0e209d39SAndroid Build Coastguard Worker };
140*0e209d39SAndroid Build Coastguard Worker 
141*0e209d39SAndroid Build Coastguard Worker 
142*0e209d39SAndroid Build Coastguard Worker 
143*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
144*0e209d39SAndroid Build Coastguard Worker 
145*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
146*0e209d39SAndroid Build Coastguard Worker 
147*0e209d39SAndroid Build Coastguard Worker #endif
148