xref: /aosp_15_r20/external/icu/libicu/cts_headers/rbbiscan.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker //
4*0e209d39SAndroid Build Coastguard Worker //  rbbiscan.h
5*0e209d39SAndroid Build Coastguard Worker //
6*0e209d39SAndroid Build Coastguard Worker //  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7*0e209d39SAndroid Build Coastguard Worker //  All Rights Reserved.
8*0e209d39SAndroid Build Coastguard Worker //
9*0e209d39SAndroid Build Coastguard Worker //  This file contains declarations for class RBBIRuleScanner
10*0e209d39SAndroid Build Coastguard Worker //
11*0e209d39SAndroid Build Coastguard Worker 
12*0e209d39SAndroid Build Coastguard Worker 
13*0e209d39SAndroid Build Coastguard Worker #ifndef RBBISCAN_H
14*0e209d39SAndroid Build Coastguard Worker #define RBBISCAN_H
15*0e209d39SAndroid Build Coastguard Worker 
16*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
17*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h"
18*0e209d39SAndroid Build Coastguard Worker #include "unicode/rbbi.h"
19*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h"
20*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h"
21*0e209d39SAndroid Build Coastguard Worker #include "uhash.h"
22*0e209d39SAndroid Build Coastguard Worker #include "uvector.h"
23*0e209d39SAndroid Build Coastguard Worker #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24*0e209d39SAndroid Build Coastguard Worker                           //    looks up references to $variables within a set.
25*0e209d39SAndroid Build Coastguard Worker #include "rbbinode.h"
26*0e209d39SAndroid Build Coastguard Worker #include "rbbirpt.h"
27*0e209d39SAndroid Build Coastguard Worker 
28*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
29*0e209d39SAndroid Build Coastguard Worker 
30*0e209d39SAndroid Build Coastguard Worker class   RBBIRuleBuilder;
31*0e209d39SAndroid Build Coastguard Worker class   RBBISymbolTable;
32*0e209d39SAndroid Build Coastguard Worker 
33*0e209d39SAndroid Build Coastguard Worker 
34*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
35*0e209d39SAndroid Build Coastguard Worker //
36*0e209d39SAndroid Build Coastguard Worker //  class RBBIRuleScanner does the lowest level, character-at-a-time
37*0e209d39SAndroid Build Coastguard Worker //                        scanning of break iterator rules.
38*0e209d39SAndroid Build Coastguard Worker //
39*0e209d39SAndroid Build Coastguard Worker //                        The output of the scanner is parse trees for
40*0e209d39SAndroid Build Coastguard Worker //                        the rule expressions and a list of all Unicode Sets
41*0e209d39SAndroid Build Coastguard Worker //                        encountered.
42*0e209d39SAndroid Build Coastguard Worker //
43*0e209d39SAndroid Build Coastguard Worker //--------------------------------------------------------------------------------
44*0e209d39SAndroid Build Coastguard Worker 
45*0e209d39SAndroid Build Coastguard Worker class RBBIRuleScanner : public UMemory {
46*0e209d39SAndroid Build Coastguard Worker public:
47*0e209d39SAndroid Build Coastguard Worker 
48*0e209d39SAndroid Build Coastguard Worker     enum {
49*0e209d39SAndroid Build Coastguard Worker         kStackSize = 100            // The size of the state stack (fStack) and of
50*0e209d39SAndroid Build Coastguard Worker     };                              //   the node stack (fNodeStack) used for rules
51*0e209d39SAndroid Build Coastguard Worker                                     //   parsing.  Corresponds roughly to the depth of
52*0e209d39SAndroid Build Coastguard Worker                                     //   parentheses nesting that is allowed in the rules.
53*0e209d39SAndroid Build Coastguard Worker 
54*0e209d39SAndroid Build Coastguard Worker     struct RBBIRuleChar {
55*0e209d39SAndroid Build Coastguard Worker         UChar32             fChar;      // The character value.
56*0e209d39SAndroid Build Coastguard Worker         UBool               fEscaped;   // NOTE(review): presumably true when the char came from an escape or quoted text - confirm.
RBBIRuleCharRBBIRuleChar57*0e209d39SAndroid Build Coastguard Worker         RBBIRuleChar() : fChar(0), fEscaped(false) {}   // Default: fChar is 0, fEscaped is false.
58*0e209d39SAndroid Build Coastguard Worker     };
59*0e209d39SAndroid Build Coastguard Worker 
60*0e209d39SAndroid Build Coastguard Worker     RBBIRuleScanner(RBBIRuleBuilder  *rb);          // rb: NOTE(review): presumably the owning rule builder kept in fRB - confirm.
61*0e209d39SAndroid Build Coastguard Worker 
62*0e209d39SAndroid Build Coastguard Worker 
63*0e209d39SAndroid Build Coastguard Worker     virtual    ~RBBIRuleScanner();
64*0e209d39SAndroid Build Coastguard Worker 
65*0e209d39SAndroid Build Coastguard Worker     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream into c.
66*0e209d39SAndroid Build Coastguard Worker                                                     // Declared void, so end of input is not reported by a return value; NOTE(review): presumably signalled through c - confirm.
67*0e209d39SAndroid Build Coastguard Worker 
68*0e209d39SAndroid Build Coastguard Worker     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
69*0e209d39SAndroid Build Coastguard Worker                                                     //   Only a single character may be pushed.
70*0e209d39SAndroid Build Coastguard Worker 
71*0e209d39SAndroid Build Coastguard Worker     void        parse();                            // Parse the rules, generating two parse
72*0e209d39SAndroid Build Coastguard Worker                                                     //   trees, one each for the forward and
73*0e209d39SAndroid Build Coastguard Worker                                                     //   reverse rules,
74*0e209d39SAndroid Build Coastguard Worker                                                     //   and a list of UnicodeSets encountered.
75*0e209d39SAndroid Build Coastguard Worker 
76*0e209d39SAndroid Build Coastguard Worker     int32_t     numRules();                         // Return the number of rules that have been seen.
77*0e209d39SAndroid Build Coastguard Worker 
78*0e209d39SAndroid Build Coastguard Worker     /**
79*0e209d39SAndroid Build Coastguard Worker      * Return a rules string without unnecessary
80*0e209d39SAndroid Build Coastguard Worker      * characters.
81*0e209d39SAndroid Build Coastguard Worker      */
82*0e209d39SAndroid Build Coastguard Worker     static UnicodeString stripRules(const UnicodeString &rules);
83*0e209d39SAndroid Build Coastguard Worker private:
84*0e209d39SAndroid Build Coastguard Worker 
85*0e209d39SAndroid Build Coastguard Worker     UBool       doParseActions(int32_t a);             // NOTE(review): a is presumably an action code from the state table (see rbbirpt.h) - confirm.
86*0e209d39SAndroid Build Coastguard Worker     void        error(UErrorCode e);                   // error reporting convenience function.
87*0e209d39SAndroid Build Coastguard Worker     void        fixOpStack(RBBINode::OpPrecedence p);  // NOTE(review): presumably reconciles the node stack
88*0e209d39SAndroid Build Coastguard Worker                                                        //   with operator precedence p - confirm in the implementation.
89*0e209d39SAndroid Build Coastguard Worker     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr);  // NOTE(review): the name setToAdopt suggests ownership passes to the callee - confirm.
90*0e209d39SAndroid Build Coastguard Worker 
91*0e209d39SAndroid Build Coastguard Worker     UChar32     nextCharLL();                          // NOTE(review): presumably the low-level fetch underneath nextChar() - confirm.
92*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG
93*0e209d39SAndroid Build Coastguard Worker     void        printNodeStack(const char *title);     // Only declared when RBBI_DEBUG is defined.
94*0e209d39SAndroid Build Coastguard Worker #endif
95*0e209d39SAndroid Build Coastguard Worker     RBBINode    *pushNewNode(RBBINode::NodeType  t);
96*0e209d39SAndroid Build Coastguard Worker     void        scanSet();
97*0e209d39SAndroid Build Coastguard Worker 
98*0e209d39SAndroid Build Coastguard Worker 
99*0e209d39SAndroid Build Coastguard Worker     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
100*0e209d39SAndroid Build Coastguard Worker 
101*0e209d39SAndroid Build Coastguard Worker     int32_t                       fScanIndex;        // Index of current character being processed
102*0e209d39SAndroid Build Coastguard Worker                                                      //   in the rule input string.
103*0e209d39SAndroid Build Coastguard Worker     int32_t                       fNextIndex;        // Index of the next character, which
104*0e209d39SAndroid Build Coastguard Worker                                                      //   is the first character not yet scanned.
105*0e209d39SAndroid Build Coastguard Worker     UBool                         fQuoteMode;        // Scan is in a 'quoted region'
106*0e209d39SAndroid Build Coastguard Worker     int32_t                       fLineNum;          // Line number in input file.
107*0e209d39SAndroid Build Coastguard Worker     int32_t                       fCharNum;          // Char position within the line.
108*0e209d39SAndroid Build Coastguard Worker     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
109*0e209d39SAndroid Build Coastguard Worker                                                      //   as a single line, not two.
110*0e209d39SAndroid Build Coastguard Worker 
111*0e209d39SAndroid Build Coastguard Worker     RBBIRuleChar                  fC;                // Current char for parse state machine
112*0e209d39SAndroid Build Coastguard Worker                                                      //   processing.
113*0e209d39SAndroid Build Coastguard Worker     UnicodeString                 fVarName;          // $variableName, valid when we've just
114*0e209d39SAndroid Build Coastguard Worker                                                      //   scanned one.
115*0e209d39SAndroid Build Coastguard Worker 
116*0e209d39SAndroid Build Coastguard Worker     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
117*0e209d39SAndroid Build Coastguard Worker                                                      //   parsing.  index by p[state][char-class]
118*0e209d39SAndroid Build Coastguard Worker 
119*0e209d39SAndroid Build Coastguard Worker     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
120*0e209d39SAndroid Build Coastguard Worker     int32_t                       fStackPtr;           //  and pops as specified in the state
121*0e209d39SAndroid Build Coastguard Worker                                                        //  transition rules.
122*0e209d39SAndroid Build Coastguard Worker 
123*0e209d39SAndroid Build Coastguard Worker     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
124*0e209d39SAndroid Build Coastguard Worker                                                            //  during the parse of a rule
125*0e209d39SAndroid Build Coastguard Worker     int32_t                        fNodeStackPtr;          // NOTE(review): presumably the index of the top entry of fNodeStack - confirm.
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker 
128*0e209d39SAndroid Build Coastguard Worker     UBool                          fReverseRule;     // True if the rule currently being scanned
129*0e209d39SAndroid Build Coastguard Worker                                                      //  is a reverse direction rule (if it
130*0e209d39SAndroid Build Coastguard Worker                                                      //  starts with a '!')
131*0e209d39SAndroid Build Coastguard Worker 
132*0e209d39SAndroid Build Coastguard Worker     UBool                          fLookAheadRule;   // True if the rule includes a '/'
133*0e209d39SAndroid Build Coastguard Worker                                                      //   somewhere within it.
134*0e209d39SAndroid Build Coastguard Worker 
135*0e209d39SAndroid Build Coastguard Worker     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
138*0e209d39SAndroid Build Coastguard Worker                                                      //   $variable symbols.
139*0e209d39SAndroid Build Coastguard Worker 
140*0e209d39SAndroid Build Coastguard Worker     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
141*0e209d39SAndroid Build Coastguard Worker                                                      //   the sets created while parsing rules.
142*0e209d39SAndroid Build Coastguard Worker                                                      //   The key is the string used for creating
143*0e209d39SAndroid Build Coastguard Worker                                                      //   the set.
144*0e209d39SAndroid Build Coastguard Worker 
145*0e209d39SAndroid Build Coastguard Worker     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
146*0e209d39SAndroid Build Coastguard Worker                                                      //  the scanning of RBBI rules.  The
147*0e209d39SAndroid Build Coastguard Worker                                                      //  indices for these are assigned by the
148*0e209d39SAndroid Build Coastguard Worker                                                      //  perl script that builds the state tables.
149*0e209d39SAndroid Build Coastguard Worker                                                      //  See rbbirpt.h.
150*0e209d39SAndroid Build Coastguard Worker 
151*0e209d39SAndroid Build Coastguard Worker     int32_t                        fRuleNum;         // Counts each rule as it is scanned.
152*0e209d39SAndroid Build Coastguard Worker 
153*0e209d39SAndroid Build Coastguard Worker     int32_t                        fOptionStart;     // Input index of start of a !!option
154*0e209d39SAndroid Build Coastguard Worker                                                      //   keyword, while being scanned.
155*0e209d39SAndroid Build Coastguard Worker 
156*0e209d39SAndroid Build Coastguard Worker     UnicodeSet *gRuleSet_rule_char;                  // These four pointers are per-instance (non-static) members,
157*0e209d39SAndroid Build Coastguard Worker     UnicodeSet *gRuleSet_white_space;                //   despite the 'g' prefix.  NOTE(review): presumably the
158*0e209d39SAndroid Build Coastguard Worker     UnicodeSet *gRuleSet_name_char;                  //   character-class sets used while scanning rules; what
159*0e209d39SAndroid Build Coastguard Worker     UnicodeSet *gRuleSet_name_start_char;            //   they point at and who owns it is not visible here - confirm.
160*0e209d39SAndroid Build Coastguard Worker 
161*0e209d39SAndroid Build Coastguard Worker     RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class
162*0e209d39SAndroid Build Coastguard Worker     RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class
163*0e209d39SAndroid Build Coastguard Worker };
164*0e209d39SAndroid Build Coastguard Worker 
165*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
166*0e209d39SAndroid Build Coastguard Worker 
167*0e209d39SAndroid Build Coastguard Worker #endif
168