xref: /aosp_15_r20/external/icu/libicu/cts_headers/rbbirb.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker //
4*0e209d39SAndroid Build Coastguard Worker //  rbbirb.h
5*0e209d39SAndroid Build Coastguard Worker //
6*0e209d39SAndroid Build Coastguard Worker //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
7*0e209d39SAndroid Build Coastguard Worker //  All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker //
9*0e209d39SAndroid Build Coastguard Worker //  This file contains declarations for several classes from the
10*0e209d39SAndroid Build Coastguard Worker //    Rule Based Break Iterator rule builder.
11*0e209d39SAndroid Build Coastguard Worker //
12*0e209d39SAndroid Build Coastguard Worker 
13*0e209d39SAndroid Build Coastguard Worker 
14*0e209d39SAndroid Build Coastguard Worker #ifndef RBBIRB_H
15*0e209d39SAndroid Build Coastguard Worker #define RBBIRB_H
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_BREAK_ITERATION
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #include <utility>
22*0e209d39SAndroid Build Coastguard Worker 
23*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h"
24*0e209d39SAndroid Build Coastguard Worker #include "unicode/rbbi.h"
25*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
26*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h"
27*0e209d39SAndroid Build Coastguard Worker #include "uhash.h"
28*0e209d39SAndroid Build Coastguard Worker #include "uvector.h"
29*0e209d39SAndroid Build Coastguard Worker #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
30*0e209d39SAndroid Build Coastguard Worker                              //    looks up references to $variables within a set.
31*0e209d39SAndroid Build Coastguard Worker 
32*0e209d39SAndroid Build Coastguard Worker 
33*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
34*0e209d39SAndroid Build Coastguard Worker 
// Forward declarations.  This header only needs the names of these types;
// where they are used below they appear as pointer members.
class  RBBIRuleScanner;
struct RBBIRuleTableEl;
class  RBBISetBuilder;
class  RBBINode;
class  RBBITableBuilder;
40*0e209d39SAndroid Build Coastguard Worker 
41*0e209d39SAndroid Build Coastguard Worker 
42*0e209d39SAndroid Build Coastguard Worker 
43*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
44*0e209d39SAndroid Build Coastguard Worker //
45*0e209d39SAndroid Build Coastguard Worker //   RBBISymbolTable.    Implements SymbolTable interface that is used by the
46*0e209d39SAndroid Build Coastguard Worker //                       UnicodeSet parser to resolve references to $variables.
47*0e209d39SAndroid Build Coastguard Worker //
48*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
49*0e209d39SAndroid Build Coastguard Worker class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
50*0e209d39SAndroid Build Coastguard Worker public:                                       //   of these structs for each entry.
51*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTableEntry();
52*0e209d39SAndroid Build Coastguard Worker     UnicodeString          key;
53*0e209d39SAndroid Build Coastguard Worker     RBBINode               *val;
54*0e209d39SAndroid Build Coastguard Worker     ~RBBISymbolTableEntry();
55*0e209d39SAndroid Build Coastguard Worker 
56*0e209d39SAndroid Build Coastguard Worker private:
57*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class
58*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class
59*0e209d39SAndroid Build Coastguard Worker };
60*0e209d39SAndroid Build Coastguard Worker 
61*0e209d39SAndroid Build Coastguard Worker 
62*0e209d39SAndroid Build Coastguard Worker class RBBISymbolTable : public UMemory, public SymbolTable {
63*0e209d39SAndroid Build Coastguard Worker private:
64*0e209d39SAndroid Build Coastguard Worker     const UnicodeString      &fRules;
65*0e209d39SAndroid Build Coastguard Worker     UHashtable               *fHashTable;
66*0e209d39SAndroid Build Coastguard Worker     RBBIRuleScanner          *fRuleScanner;
67*0e209d39SAndroid Build Coastguard Worker 
68*0e209d39SAndroid Build Coastguard Worker     // These next two fields are part of the mechanism for passing references to
69*0e209d39SAndroid Build Coastguard Worker     //   already-constructed UnicodeSets back to the UnicodeSet constructor
70*0e209d39SAndroid Build Coastguard Worker     //   when the pattern includes $variable references.
71*0e209d39SAndroid Build Coastguard Worker     const UnicodeString      ffffString;      // = "/uffff"
72*0e209d39SAndroid Build Coastguard Worker     UnicodeSet              *fCachedSetLookup;
73*0e209d39SAndroid Build Coastguard Worker 
74*0e209d39SAndroid Build Coastguard Worker public:
75*0e209d39SAndroid Build Coastguard Worker     //  API inherited from class SymbolTable
76*0e209d39SAndroid Build Coastguard Worker     virtual const UnicodeString*  lookup(const UnicodeString& s) const override;
77*0e209d39SAndroid Build Coastguard Worker     virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override;
78*0e209d39SAndroid Build Coastguard Worker     virtual UnicodeString parseReference(const UnicodeString& text,
79*0e209d39SAndroid Build Coastguard Worker                                          ParsePosition& pos, int32_t limit) const override;
80*0e209d39SAndroid Build Coastguard Worker 
81*0e209d39SAndroid Build Coastguard Worker     //  Additional Functions
82*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
83*0e209d39SAndroid Build Coastguard Worker     virtual ~RBBISymbolTable();
84*0e209d39SAndroid Build Coastguard Worker 
85*0e209d39SAndroid Build Coastguard Worker     virtual RBBINode *lookupNode(const UnicodeString &key) const;
86*0e209d39SAndroid Build Coastguard Worker     virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
87*0e209d39SAndroid Build Coastguard Worker 
88*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG
89*0e209d39SAndroid Build Coastguard Worker     virtual void      rbbiSymtablePrint() const;
90*0e209d39SAndroid Build Coastguard Worker #else
91*0e209d39SAndroid Build Coastguard Worker     // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
92*0e209d39SAndroid Build Coastguard Worker     //  or the call sites won't compile.
93*0e209d39SAndroid Build Coastguard Worker     int32_t fFakeField;
94*0e209d39SAndroid Build Coastguard Worker     #define rbbiSymtablePrint() fFakeField=0;
95*0e209d39SAndroid Build Coastguard Worker #endif
96*0e209d39SAndroid Build Coastguard Worker 
97*0e209d39SAndroid Build Coastguard Worker private:
98*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
99*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
100*0e209d39SAndroid Build Coastguard Worker };
101*0e209d39SAndroid Build Coastguard Worker 
102*0e209d39SAndroid Build Coastguard Worker 
103*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
104*0e209d39SAndroid Build Coastguard Worker //
105*0e209d39SAndroid Build Coastguard Worker //  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
106*0e209d39SAndroid Build Coastguard Worker //
107*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
108*0e209d39SAndroid Build Coastguard Worker class RBBIRuleBuilder : public UMemory {
109*0e209d39SAndroid Build Coastguard Worker public:
110*0e209d39SAndroid Build Coastguard Worker 
111*0e209d39SAndroid Build Coastguard Worker     //  Create a rule based break iterator from a set of rules.
112*0e209d39SAndroid Build Coastguard Worker     //  This function is the main entry point into the rule builder.  The
113*0e209d39SAndroid Build Coastguard Worker     //   public ICU API for creating RBBIs uses this function to do the actual work.
114*0e209d39SAndroid Build Coastguard Worker     //
115*0e209d39SAndroid Build Coastguard Worker     static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
116*0e209d39SAndroid Build Coastguard Worker                                     UParseError      *parseError,
117*0e209d39SAndroid Build Coastguard Worker                                     UErrorCode       &status);
118*0e209d39SAndroid Build Coastguard Worker 
119*0e209d39SAndroid Build Coastguard Worker public:
120*0e209d39SAndroid Build Coastguard Worker     // The "public" functions and data members that appear below are accessed
121*0e209d39SAndroid Build Coastguard Worker     //  (and shared) by the various parts that make up the rule builder.  They
122*0e209d39SAndroid Build Coastguard Worker     //  are NOT intended to be accessed by anything outside of the
123*0e209d39SAndroid Build Coastguard Worker     //  rule builder implementation.
124*0e209d39SAndroid Build Coastguard Worker     RBBIRuleBuilder(const UnicodeString  &rules,
125*0e209d39SAndroid Build Coastguard Worker                     UParseError          *parseErr,
126*0e209d39SAndroid Build Coastguard Worker                     UErrorCode           &status
127*0e209d39SAndroid Build Coastguard Worker     );
128*0e209d39SAndroid Build Coastguard Worker 
129*0e209d39SAndroid Build Coastguard Worker     virtual    ~RBBIRuleBuilder();
130*0e209d39SAndroid Build Coastguard Worker 
131*0e209d39SAndroid Build Coastguard Worker     /**
132*0e209d39SAndroid Build Coastguard Worker      *  Build the state tables and char class Trie from the source rules.
133*0e209d39SAndroid Build Coastguard Worker      */
134*0e209d39SAndroid Build Coastguard Worker     RBBIDataHeader  *build(UErrorCode &status);
135*0e209d39SAndroid Build Coastguard Worker 
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker     /**
138*0e209d39SAndroid Build Coastguard Worker      * Fold together redundant character classes (table columns) and
139*0e209d39SAndroid Build Coastguard Worker      * redundant states (table rows). Done after initial table generation,
140*0e209d39SAndroid Build Coastguard Worker      * before serializing the result.
141*0e209d39SAndroid Build Coastguard Worker      */
142*0e209d39SAndroid Build Coastguard Worker     void optimizeTables();
143*0e209d39SAndroid Build Coastguard Worker 
144*0e209d39SAndroid Build Coastguard Worker     char                          *fDebugEnv;        // controls debug trace output
145*0e209d39SAndroid Build Coastguard Worker     UErrorCode                    *fStatus;          // Error reporting.  Keeping status
146*0e209d39SAndroid Build Coastguard Worker     UParseError                   *fParseError;      //   here avoids passing it everywhere.
147*0e209d39SAndroid Build Coastguard Worker     const UnicodeString           &fRules;           // The rule string that we are compiling
148*0e209d39SAndroid Build Coastguard Worker     UnicodeString                 fStrippedRules;    // The rule string, with comments stripped.
149*0e209d39SAndroid Build Coastguard Worker 
150*0e209d39SAndroid Build Coastguard Worker     RBBIRuleScanner               *fScanner;         // The scanner.
151*0e209d39SAndroid Build Coastguard Worker     RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
152*0e209d39SAndroid Build Coastguard Worker     RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
153*0e209d39SAndroid Build Coastguard Worker     RBBINode                      *fSafeFwdTree;
154*0e209d39SAndroid Build Coastguard Worker     RBBINode                      *fSafeRevTree;
155*0e209d39SAndroid Build Coastguard Worker 
156*0e209d39SAndroid Build Coastguard Worker     RBBINode                      **fDefaultTree;    // For rules not qualified with a !
157*0e209d39SAndroid Build Coastguard Worker                                                      //   the tree to which they belong to.
158*0e209d39SAndroid Build Coastguard Worker 
159*0e209d39SAndroid Build Coastguard Worker     UBool                         fChainRules;       // True for chained Unicode TR style rules.
160*0e209d39SAndroid Build Coastguard Worker                                                      // False for traditional regexp rules.
161*0e209d39SAndroid Build Coastguard Worker 
162*0e209d39SAndroid Build Coastguard Worker     UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
163*0e209d39SAndroid Build Coastguard Worker                                                      // immediate break, no continuing for the
164*0e209d39SAndroid Build Coastguard Worker                                                      // longest match.
165*0e209d39SAndroid Build Coastguard Worker 
166*0e209d39SAndroid Build Coastguard Worker     RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
167*0e209d39SAndroid Build Coastguard Worker     UVector                       *fUSetNodes;       // Vector of all uset nodes.
168*0e209d39SAndroid Build Coastguard Worker 
169*0e209d39SAndroid Build Coastguard Worker     RBBITableBuilder              *fForwardTable;    // State transition table, build time form.
170*0e209d39SAndroid Build Coastguard Worker 
171*0e209d39SAndroid Build Coastguard Worker     UVector                       *fRuleStatusVals;  // The values that can be returned
172*0e209d39SAndroid Build Coastguard Worker                                                      //   from getRuleStatus().
173*0e209d39SAndroid Build Coastguard Worker 
174*0e209d39SAndroid Build Coastguard Worker     RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
175*0e209d39SAndroid Build Coastguard Worker                                                      // data tables..
176*0e209d39SAndroid Build Coastguard Worker private:
177*0e209d39SAndroid Build Coastguard Worker     RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
178*0e209d39SAndroid Build Coastguard Worker     RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
179*0e209d39SAndroid Build Coastguard Worker };
180*0e209d39SAndroid Build Coastguard Worker 
181*0e209d39SAndroid Build Coastguard Worker 
182*0e209d39SAndroid Build Coastguard Worker 
183*0e209d39SAndroid Build Coastguard Worker 
184*0e209d39SAndroid Build Coastguard Worker //----------------------------------------------------------------------------
185*0e209d39SAndroid Build Coastguard Worker //
186*0e209d39SAndroid Build Coastguard Worker //   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
187*0e209d39SAndroid Build Coastguard Worker //                    been encountered.  The val Node will be of nodetype uset
188*0e209d39SAndroid Build Coastguard Worker //                    and contain pointers to the actual UnicodeSets.
189*0e209d39SAndroid Build Coastguard Worker //                    The Key is the source string for initializing the set.
190*0e209d39SAndroid Build Coastguard Worker //
191*0e209d39SAndroid Build Coastguard Worker //                    The hash table is used to avoid creating duplicate
192*0e209d39SAndroid Build Coastguard Worker //                    unnamed (not $var references) UnicodeSets.
193*0e209d39SAndroid Build Coastguard Worker //
194*0e209d39SAndroid Build Coastguard Worker //                    Memory Management:
195*0e209d39SAndroid Build Coastguard Worker //                       The Hash Table owns these RBBISetTableEl structs and
196*0e209d39SAndroid Build Coastguard Worker //                            the key strings.  It does NOT own the val nodes.
197*0e209d39SAndroid Build Coastguard Worker //
198*0e209d39SAndroid Build Coastguard Worker //----------------------------------------------------------------------------
199*0e209d39SAndroid Build Coastguard Worker struct RBBISetTableEl {
200*0e209d39SAndroid Build Coastguard Worker     UnicodeString *key;
201*0e209d39SAndroid Build Coastguard Worker     RBBINode      *val;
202*0e209d39SAndroid Build Coastguard Worker };
203*0e209d39SAndroid Build Coastguard Worker 
/**
 *   Two int32_t values bundled together; used for pairs of states or
 *   pairs of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
208*0e209d39SAndroid Build Coastguard Worker 
209*0e209d39SAndroid Build Coastguard Worker 
210*0e209d39SAndroid Build Coastguard Worker //----------------------------------------------------------------------------
211*0e209d39SAndroid Build Coastguard Worker //
212*0e209d39SAndroid Build Coastguard Worker //   RBBIDebugPrintf    Printf equivalent, for debugging output.
213*0e209d39SAndroid Build Coastguard Worker //                      Conditional compilation of the implementation lets us
214*0e209d39SAndroid Build Coastguard Worker //                      get rid of the stdio dependency in environments where it
215*0e209d39SAndroid Build Coastguard Worker //                      is unavailable.
216*0e209d39SAndroid Build Coastguard Worker //
217*0e209d39SAndroid Build Coastguard Worker //----------------------------------------------------------------------------
#if defined(RBBI_DEBUG)
// Debug builds: forward straight to the stdio functions.
#include <stdio.h>
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: no stdio dependency.
//   RBBIDebugPuts(arg) expands to nothing.
//   RBBIDebugPrintf is left undefined; presumably every use of it is itself
//   inside RBBI_DEBUG-only code (not verifiable from this header).
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
226*0e209d39SAndroid Build Coastguard Worker 
227*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
228*0e209d39SAndroid Build Coastguard Worker 
229*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
230*0e209d39SAndroid Build Coastguard Worker 
231*0e209d39SAndroid Build Coastguard Worker #endif
232*0e209d39SAndroid Build Coastguard Worker 
233*0e209d39SAndroid Build Coastguard Worker 
234*0e209d39SAndroid Build Coastguard Worker 
235