1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker // 4*0e209d39SAndroid Build Coastguard Worker // rbbiscan.h 5*0e209d39SAndroid Build Coastguard Worker // 6*0e209d39SAndroid Build Coastguard Worker // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7*0e209d39SAndroid Build Coastguard Worker // All Rights Reserved. 8*0e209d39SAndroid Build Coastguard Worker // 9*0e209d39SAndroid Build Coastguard Worker // This file contains declarations for class RBBIRuleScanner 10*0e209d39SAndroid Build Coastguard Worker // 11*0e209d39SAndroid Build Coastguard Worker 12*0e209d39SAndroid Build Coastguard Worker 13*0e209d39SAndroid Build Coastguard Worker #ifndef RBBISCAN_H 14*0e209d39SAndroid Build Coastguard Worker #define RBBISCAN_H 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 18*0e209d39SAndroid Build Coastguard Worker #include "unicode/rbbi.h" 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/uniset.h" 20*0e209d39SAndroid Build Coastguard Worker #include "unicode/parseerr.h" 21*0e209d39SAndroid Build Coastguard Worker #include "uhash.h" 22*0e209d39SAndroid Build Coastguard Worker #include "uvector.h" 23*0e209d39SAndroid Build Coastguard Worker #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 24*0e209d39SAndroid Build Coastguard Worker // looks up references to $variables within a set. 25*0e209d39SAndroid Build Coastguard Worker #include "rbbinode.h" 26*0e209d39SAndroid Build Coastguard Worker #include "rbbirpt.h" 27*0e209d39SAndroid Build Coastguard Worker 28*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 29*0e209d39SAndroid Build Coastguard Worker 30*0e209d39SAndroid Build Coastguard Worker class RBBIRuleBuilder; 31*0e209d39SAndroid Build Coastguard Worker class RBBISymbolTable; 32*0e209d39SAndroid Build Coastguard Worker 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 35*0e209d39SAndroid Build Coastguard Worker // 36*0e209d39SAndroid Build Coastguard Worker // class RBBIRuleScanner does the lowest level, character-at-a-time 37*0e209d39SAndroid Build Coastguard Worker // scanning of break iterator rules. 38*0e209d39SAndroid Build Coastguard Worker // 39*0e209d39SAndroid Build Coastguard Worker // The output of the scanner is parse trees for 40*0e209d39SAndroid Build Coastguard Worker // the rule expressions and a list of all Unicode Sets 41*0e209d39SAndroid Build Coastguard Worker // encountered. 42*0e209d39SAndroid Build Coastguard Worker // 43*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------------------- 44*0e209d39SAndroid Build Coastguard Worker 45*0e209d39SAndroid Build Coastguard Worker class RBBIRuleScanner : public UMemory { 46*0e209d39SAndroid Build Coastguard Worker public: 47*0e209d39SAndroid Build Coastguard Worker 48*0e209d39SAndroid Build Coastguard Worker enum { 49*0e209d39SAndroid Build Coastguard Worker kStackSize = 100 // The size of the state stack for 50*0e209d39SAndroid Build Coastguard Worker }; // rules parsing. Corresponds roughly 51*0e209d39SAndroid Build Coastguard Worker // to the depth of parentheses nesting 52*0e209d39SAndroid Build Coastguard Worker // that is allowed in the rules. 53*0e209d39SAndroid Build Coastguard Worker 54*0e209d39SAndroid Build Coastguard Worker struct RBBIRuleChar { 55*0e209d39SAndroid Build Coastguard Worker UChar32 fChar; 56*0e209d39SAndroid Build Coastguard Worker UBool fEscaped; RBBIRuleCharRBBIRuleChar57*0e209d39SAndroid Build Coastguard Worker RBBIRuleChar() : fChar(0), fEscaped(false) {} 58*0e209d39SAndroid Build Coastguard Worker }; 59*0e209d39SAndroid Build Coastguard Worker 60*0e209d39SAndroid Build Coastguard Worker RBBIRuleScanner(RBBIRuleBuilder *rb); 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker 63*0e209d39SAndroid Build Coastguard Worker virtual ~RBBIRuleScanner(); 64*0e209d39SAndroid Build Coastguard Worker 65*0e209d39SAndroid Build Coastguard Worker void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. 66*0e209d39SAndroid Build Coastguard Worker // Return false if at end. 67*0e209d39SAndroid Build Coastguard Worker 68*0e209d39SAndroid Build Coastguard Worker UBool push(const RBBIRuleChar &c); // Push (unget) one character. 69*0e209d39SAndroid Build Coastguard Worker // Only a single character may be pushed. 70*0e209d39SAndroid Build Coastguard Worker 71*0e209d39SAndroid Build Coastguard Worker void parse(); // Parse the rules, generating two parse 72*0e209d39SAndroid Build Coastguard Worker // trees, one each for the forward and 73*0e209d39SAndroid Build Coastguard Worker // reverse rules, 74*0e209d39SAndroid Build Coastguard Worker // and a list of UnicodeSets encountered. 75*0e209d39SAndroid Build Coastguard Worker 76*0e209d39SAndroid Build Coastguard Worker int32_t numRules(); // Return the number of rules that have been seen. 77*0e209d39SAndroid Build Coastguard Worker 78*0e209d39SAndroid Build Coastguard Worker /** 79*0e209d39SAndroid Build Coastguard Worker * Return a rules string without unnecessary 80*0e209d39SAndroid Build Coastguard Worker * characters. 81*0e209d39SAndroid Build Coastguard Worker */ 82*0e209d39SAndroid Build Coastguard Worker static UnicodeString stripRules(const UnicodeString &rules); 83*0e209d39SAndroid Build Coastguard Worker private: 84*0e209d39SAndroid Build Coastguard Worker 85*0e209d39SAndroid Build Coastguard Worker UBool doParseActions(int32_t a); 86*0e209d39SAndroid Build Coastguard Worker void error(UErrorCode e); // error reporting convenience function. 87*0e209d39SAndroid Build Coastguard Worker void fixOpStack(RBBINode::OpPrecedence p); 88*0e209d39SAndroid Build Coastguard Worker // a character. 89*0e209d39SAndroid Build Coastguard Worker void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = nullptr); 90*0e209d39SAndroid Build Coastguard Worker 91*0e209d39SAndroid Build Coastguard Worker UChar32 nextCharLL(); 92*0e209d39SAndroid Build Coastguard Worker #ifdef RBBI_DEBUG 93*0e209d39SAndroid Build Coastguard Worker void printNodeStack(const char *title); 94*0e209d39SAndroid Build Coastguard Worker #endif 95*0e209d39SAndroid Build Coastguard Worker RBBINode *pushNewNode(RBBINode::NodeType t); 96*0e209d39SAndroid Build Coastguard Worker void scanSet(); 97*0e209d39SAndroid Build Coastguard Worker 98*0e209d39SAndroid Build Coastguard Worker 99*0e209d39SAndroid Build Coastguard Worker RBBIRuleBuilder *fRB; // The rule builder that we are part of. 100*0e209d39SAndroid Build Coastguard Worker 101*0e209d39SAndroid Build Coastguard Worker int32_t fScanIndex; // Index of current character being processed 102*0e209d39SAndroid Build Coastguard Worker // in the rule input string. 103*0e209d39SAndroid Build Coastguard Worker int32_t fNextIndex; // Index of the next character, which 104*0e209d39SAndroid Build Coastguard Worker // is the first character not yet scanned. 105*0e209d39SAndroid Build Coastguard Worker UBool fQuoteMode; // Scan is in a 'quoted region' 106*0e209d39SAndroid Build Coastguard Worker int32_t fLineNum; // Line number in input file. 107*0e209d39SAndroid Build Coastguard Worker int32_t fCharNum; // Char position within the line. 108*0e209d39SAndroid Build Coastguard Worker UChar32 fLastChar; // Previous char, needed to count CR-LF 109*0e209d39SAndroid Build Coastguard Worker // as a single line, not two. 110*0e209d39SAndroid Build Coastguard Worker 111*0e209d39SAndroid Build Coastguard Worker RBBIRuleChar fC; // Current char for parse state machine 112*0e209d39SAndroid Build Coastguard Worker // processing. 113*0e209d39SAndroid Build Coastguard Worker UnicodeString fVarName; // $variableName, valid when we've just 114*0e209d39SAndroid Build Coastguard Worker // scanned one. 115*0e209d39SAndroid Build Coastguard Worker 116*0e209d39SAndroid Build Coastguard Worker RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 117*0e209d39SAndroid Build Coastguard Worker // parsing. index by p[state][char-class] 118*0e209d39SAndroid Build Coastguard Worker 119*0e209d39SAndroid Build Coastguard Worker uint16_t fStack[kStackSize]; // State stack, holds state pushes 120*0e209d39SAndroid Build Coastguard Worker int32_t fStackPtr; // and pops as specified in the state 121*0e209d39SAndroid Build Coastguard Worker // transition rules. 122*0e209d39SAndroid Build Coastguard Worker 123*0e209d39SAndroid Build Coastguard Worker RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 124*0e209d39SAndroid Build Coastguard Worker // during the parse of a rule 125*0e209d39SAndroid Build Coastguard Worker int32_t fNodeStackPtr; 126*0e209d39SAndroid Build Coastguard Worker 127*0e209d39SAndroid Build Coastguard Worker 128*0e209d39SAndroid Build Coastguard Worker UBool fReverseRule; // True if the rule currently being scanned 129*0e209d39SAndroid Build Coastguard Worker // is a reverse direction rule (if it 130*0e209d39SAndroid Build Coastguard Worker // starts with a '!') 131*0e209d39SAndroid Build Coastguard Worker 132*0e209d39SAndroid Build Coastguard Worker UBool fLookAheadRule; // True if the rule includes a '/' 133*0e209d39SAndroid Build Coastguard Worker // somewhere within it. 134*0e209d39SAndroid Build Coastguard Worker 135*0e209d39SAndroid Build Coastguard Worker UBool fNoChainInRule; // True if the current rule starts with a '^'. 136*0e209d39SAndroid Build Coastguard Worker 137*0e209d39SAndroid Build Coastguard Worker RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 138*0e209d39SAndroid Build Coastguard Worker // $variable symbols. 139*0e209d39SAndroid Build Coastguard Worker 140*0e209d39SAndroid Build Coastguard Worker UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to 141*0e209d39SAndroid Build Coastguard Worker // the sets created while parsing rules. 142*0e209d39SAndroid Build Coastguard Worker // The key is the string used for creating 143*0e209d39SAndroid Build Coastguard Worker // the set. 144*0e209d39SAndroid Build Coastguard Worker 145*0e209d39SAndroid Build Coastguard Worker UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 146*0e209d39SAndroid Build Coastguard Worker // the scanning of RBBI rules. The 147*0e209d39SAndroid Build Coastguard Worker // indices for these are assigned by the 148*0e209d39SAndroid Build Coastguard Worker // perl script that builds the state tables. 149*0e209d39SAndroid Build Coastguard Worker // See rbbirpt.h. 150*0e209d39SAndroid Build Coastguard Worker 151*0e209d39SAndroid Build Coastguard Worker int32_t fRuleNum; // Counts each rule as it is scanned. 152*0e209d39SAndroid Build Coastguard Worker 153*0e209d39SAndroid Build Coastguard Worker int32_t fOptionStart; // Input index of start of a !!option 154*0e209d39SAndroid Build Coastguard Worker // keyword, while being scanned. 155*0e209d39SAndroid Build Coastguard Worker 156*0e209d39SAndroid Build Coastguard Worker UnicodeSet *gRuleSet_rule_char; 157*0e209d39SAndroid Build Coastguard Worker UnicodeSet *gRuleSet_white_space; 158*0e209d39SAndroid Build Coastguard Worker UnicodeSet *gRuleSet_name_char; 159*0e209d39SAndroid Build Coastguard Worker UnicodeSet *gRuleSet_name_start_char; 160*0e209d39SAndroid Build Coastguard Worker 161*0e209d39SAndroid Build Coastguard Worker RBBIRuleScanner(const RBBIRuleScanner &other) = delete; // forbid copying of this class 162*0e209d39SAndroid Build Coastguard Worker RBBIRuleScanner &operator=(const RBBIRuleScanner &other) = delete; // forbid copying of this class 163*0e209d39SAndroid Build Coastguard Worker }; 164*0e209d39SAndroid Build Coastguard Worker 165*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 166*0e209d39SAndroid Build Coastguard Worker 167*0e209d39SAndroid Build Coastguard Worker #endif 168