1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2001-2011, International Business Machines Corporation 5*0e209d39SAndroid Build Coastguard Worker * and others. All Rights Reserved. 6*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 7*0e209d39SAndroid Build Coastguard Worker * Date Name Description 8*0e209d39SAndroid Build Coastguard Worker * 07/23/01 aliu Creation. 9*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 10*0e209d39SAndroid Build Coastguard Worker */ 11*0e209d39SAndroid Build Coastguard Worker #ifndef STRMATCH_H 12*0e209d39SAndroid Build Coastguard Worker #define STRMATCH_H 13*0e209d39SAndroid Build Coastguard Worker 14*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_TRANSLITERATION 17*0e209d39SAndroid Build Coastguard Worker 18*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h" 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/unifunct.h" 20*0e209d39SAndroid Build Coastguard Worker #include "unicode/unimatch.h" 21*0e209d39SAndroid Build Coastguard Worker #include "unicode/unirepl.h" 22*0e209d39SAndroid Build Coastguard Worker 23*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 24*0e209d39SAndroid Build Coastguard Worker 25*0e209d39SAndroid Build Coastguard Worker class TransliterationRuleData; 26*0e209d39SAndroid Build Coastguard Worker 27*0e209d39SAndroid Build Coastguard Worker /** 28*0e209d39SAndroid Build Coastguard Worker * An object that matches a fixed input string, implementing the 29*0e209d39SAndroid Build Coastguard Worker * UnicodeMatcher API. This object also implements the 30*0e209d39SAndroid Build Coastguard Worker * UnicodeReplacer API, allowing it to emit the matched text as 31*0e209d39SAndroid Build Coastguard Worker * output. Since the match text may contain flexible match elements, 32*0e209d39SAndroid Build Coastguard Worker * such as UnicodeSets, the emitted text is not the match pattern, but 33*0e209d39SAndroid Build Coastguard Worker * instead a substring of the actual matched text. Following 34*0e209d39SAndroid Build Coastguard Worker * convention, the output text is the leftmost match seen up to this 35*0e209d39SAndroid Build Coastguard Worker * point. 36*0e209d39SAndroid Build Coastguard Worker * 37*0e209d39SAndroid Build Coastguard Worker * A StringMatcher may represent a segment, in which case it has a 38*0e209d39SAndroid Build Coastguard Worker * positive segment number. This affects how the matcher converts 39*0e209d39SAndroid Build Coastguard Worker * itself to a pattern but does not otherwise affect its function. 40*0e209d39SAndroid Build Coastguard Worker * 41*0e209d39SAndroid Build Coastguard Worker * A StringMatcher that is not a segment should not be used as a 42*0e209d39SAndroid Build Coastguard Worker * UnicodeReplacer. 43*0e209d39SAndroid Build Coastguard Worker */ 44*0e209d39SAndroid Build Coastguard Worker class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { 45*0e209d39SAndroid Build Coastguard Worker 46*0e209d39SAndroid Build Coastguard Worker public: 47*0e209d39SAndroid Build Coastguard Worker 48*0e209d39SAndroid Build Coastguard Worker /** 49*0e209d39SAndroid Build Coastguard Worker * Construct a matcher that matches the given pattern string. 50*0e209d39SAndroid Build Coastguard Worker * @param string the pattern to be matched, possibly containing 51*0e209d39SAndroid Build Coastguard Worker * stand-ins that represent nested UnicodeMatcher objects. 52*0e209d39SAndroid Build Coastguard Worker * @param start inclusive start index of text to be replaced 53*0e209d39SAndroid Build Coastguard Worker * @param limit exclusive end index of text to be replaced; 54*0e209d39SAndroid Build Coastguard Worker * must be greater than or equal to start 55*0e209d39SAndroid Build Coastguard Worker * @param segmentNum the segment number from 1..n, or 0 if this is 56*0e209d39SAndroid Build Coastguard Worker * not a segment. 57*0e209d39SAndroid Build Coastguard Worker * @param data context object mapping stand-ins to 58*0e209d39SAndroid Build Coastguard Worker * UnicodeMatcher objects. 59*0e209d39SAndroid Build Coastguard Worker */ 60*0e209d39SAndroid Build Coastguard Worker StringMatcher(const UnicodeString& string, 61*0e209d39SAndroid Build Coastguard Worker int32_t start, 62*0e209d39SAndroid Build Coastguard Worker int32_t limit, 63*0e209d39SAndroid Build Coastguard Worker int32_t segmentNum, 64*0e209d39SAndroid Build Coastguard Worker const TransliterationRuleData& data); 65*0e209d39SAndroid Build Coastguard Worker 66*0e209d39SAndroid Build Coastguard Worker /** 67*0e209d39SAndroid Build Coastguard Worker * Copy constructor 68*0e209d39SAndroid Build Coastguard Worker * @param o the object to be copied. 69*0e209d39SAndroid Build Coastguard Worker */ 70*0e209d39SAndroid Build Coastguard Worker StringMatcher(const StringMatcher& o); 71*0e209d39SAndroid Build Coastguard Worker 72*0e209d39SAndroid Build Coastguard Worker /** 73*0e209d39SAndroid Build Coastguard Worker * Destructor 74*0e209d39SAndroid Build Coastguard Worker */ 75*0e209d39SAndroid Build Coastguard Worker virtual ~StringMatcher(); 76*0e209d39SAndroid Build Coastguard Worker 77*0e209d39SAndroid Build Coastguard Worker /** 78*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeFunctor 79*0e209d39SAndroid Build Coastguard Worker * @return a copy of the object. 80*0e209d39SAndroid Build Coastguard Worker */ 81*0e209d39SAndroid Build Coastguard Worker virtual StringMatcher* clone() const override; 82*0e209d39SAndroid Build Coastguard Worker 83*0e209d39SAndroid Build Coastguard Worker /** 84*0e209d39SAndroid Build Coastguard Worker * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 85*0e209d39SAndroid Build Coastguard Worker * and return the pointer. 86*0e209d39SAndroid Build Coastguard Worker * @return the UnicodeMatcher point. 87*0e209d39SAndroid Build Coastguard Worker */ 88*0e209d39SAndroid Build Coastguard Worker virtual UnicodeMatcher* toMatcher() const override; 89*0e209d39SAndroid Build Coastguard Worker 90*0e209d39SAndroid Build Coastguard Worker /** 91*0e209d39SAndroid Build Coastguard Worker * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 92*0e209d39SAndroid Build Coastguard Worker * and return the pointer. 93*0e209d39SAndroid Build Coastguard Worker * @return the UnicodeReplacer pointer. 94*0e209d39SAndroid Build Coastguard Worker */ 95*0e209d39SAndroid Build Coastguard Worker virtual UnicodeReplacer* toReplacer() const override; 96*0e209d39SAndroid Build Coastguard Worker 97*0e209d39SAndroid Build Coastguard Worker /** 98*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeMatcher 99*0e209d39SAndroid Build Coastguard Worker * @param text the text to be matched 100*0e209d39SAndroid Build Coastguard Worker * @param offset on input, the index into text at which to begin 101*0e209d39SAndroid Build Coastguard Worker * matching. On output, the limit of the matched text. The 102*0e209d39SAndroid Build Coastguard Worker * number of matched characters is the output value of offset 103*0e209d39SAndroid Build Coastguard Worker * minus the input value. Offset should always point to the 104*0e209d39SAndroid Build Coastguard Worker * HIGH SURROGATE (leading code unit) of a pair of surrogates, 105*0e209d39SAndroid Build Coastguard Worker * both on entry and upon return. 106*0e209d39SAndroid Build Coastguard Worker * @param limit the limit index of text to be matched. Greater 107*0e209d39SAndroid Build Coastguard Worker * than offset for a forward direction match, less than offset for 108*0e209d39SAndroid Build Coastguard Worker * a backward direction match. The last character to be 109*0e209d39SAndroid Build Coastguard Worker * considered for matching will be text.charAt(limit-1) in the 110*0e209d39SAndroid Build Coastguard Worker * forward direction or text.charAt(limit+1) in the backward 111*0e209d39SAndroid Build Coastguard Worker * direction. 112*0e209d39SAndroid Build Coastguard Worker * @param incremental if true, then assume further characters may 113*0e209d39SAndroid Build Coastguard Worker * be inserted at limit and check for partial matching. Otherwise 114*0e209d39SAndroid Build Coastguard Worker * assume the text as given is complete. 115*0e209d39SAndroid Build Coastguard Worker * @return a match degree value indicating a full match, a partial 116*0e209d39SAndroid Build Coastguard Worker * match, or a mismatch. If incremental is false then 117*0e209d39SAndroid Build Coastguard Worker * U_PARTIAL_MATCH should never be returned. 118*0e209d39SAndroid Build Coastguard Worker */ 119*0e209d39SAndroid Build Coastguard Worker virtual UMatchDegree matches(const Replaceable& text, 120*0e209d39SAndroid Build Coastguard Worker int32_t& offset, 121*0e209d39SAndroid Build Coastguard Worker int32_t limit, 122*0e209d39SAndroid Build Coastguard Worker UBool incremental) override; 123*0e209d39SAndroid Build Coastguard Worker 124*0e209d39SAndroid Build Coastguard Worker /** 125*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeMatcher 126*0e209d39SAndroid Build Coastguard Worker * @param result Output param to receive the pattern. 127*0e209d39SAndroid Build Coastguard Worker * @param escapeUnprintable if True then escape the unprintable characters. 128*0e209d39SAndroid Build Coastguard Worker * @return A reference to 'result'. 129*0e209d39SAndroid Build Coastguard Worker */ 130*0e209d39SAndroid Build Coastguard Worker virtual UnicodeString& toPattern(UnicodeString& result, 131*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable = false) const override; 132*0e209d39SAndroid Build Coastguard Worker 133*0e209d39SAndroid Build Coastguard Worker /** 134*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeMatcher 135*0e209d39SAndroid Build Coastguard Worker * Returns true if this matcher will match a character c, where c 136*0e209d39SAndroid Build Coastguard Worker * & 0xFF == v, at offset, in the forward direction (with limit > 137*0e209d39SAndroid Build Coastguard Worker * offset). This is used by <tt>RuleBasedTransliterator</tt> for 138*0e209d39SAndroid Build Coastguard Worker * indexing. 139*0e209d39SAndroid Build Coastguard Worker * @param v the given value 140*0e209d39SAndroid Build Coastguard Worker * @return true if this matcher will match a character c, 141*0e209d39SAndroid Build Coastguard Worker * where c & 0xFF == v 142*0e209d39SAndroid Build Coastguard Worker */ 143*0e209d39SAndroid Build Coastguard Worker virtual UBool matchesIndexValue(uint8_t v) const override; 144*0e209d39SAndroid Build Coastguard Worker 145*0e209d39SAndroid Build Coastguard Worker /** 146*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeMatcher 147*0e209d39SAndroid Build Coastguard Worker */ 148*0e209d39SAndroid Build Coastguard Worker virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; 149*0e209d39SAndroid Build Coastguard Worker 150*0e209d39SAndroid Build Coastguard Worker /** 151*0e209d39SAndroid Build Coastguard Worker * Implement UnicodeFunctor 152*0e209d39SAndroid Build Coastguard Worker */ 153*0e209d39SAndroid Build Coastguard Worker virtual void setData(const TransliterationRuleData*) override; 154*0e209d39SAndroid Build Coastguard Worker 155*0e209d39SAndroid Build Coastguard Worker /** 156*0e209d39SAndroid Build Coastguard Worker * Replace characters in 'text' from 'start' to 'limit' with the 157*0e209d39SAndroid Build Coastguard Worker * output text of this object. Update the 'cursor' parameter to 158*0e209d39SAndroid Build Coastguard Worker * give the cursor position and return the length of the 159*0e209d39SAndroid Build Coastguard Worker * replacement text. 160*0e209d39SAndroid Build Coastguard Worker * 161*0e209d39SAndroid Build Coastguard Worker * @param text the text to be matched 162*0e209d39SAndroid Build Coastguard Worker * @param start inclusive start index of text to be replaced 163*0e209d39SAndroid Build Coastguard Worker * @param limit exclusive end index of text to be replaced; 164*0e209d39SAndroid Build Coastguard Worker * must be greater than or equal to start 165*0e209d39SAndroid Build Coastguard Worker * @param cursor output parameter for the cursor position. 166*0e209d39SAndroid Build Coastguard Worker * Not all replacer objects will update this, but in a complete 167*0e209d39SAndroid Build Coastguard Worker * tree of replacer objects, representing the entire output side 168*0e209d39SAndroid Build Coastguard Worker * of a transliteration rule, at least one must update it. 169*0e209d39SAndroid Build Coastguard Worker * @return the number of 16-bit code units in the text replacing 170*0e209d39SAndroid Build Coastguard Worker * the characters at offsets start..(limit-1) in text 171*0e209d39SAndroid Build Coastguard Worker */ 172*0e209d39SAndroid Build Coastguard Worker virtual int32_t replace(Replaceable& text, 173*0e209d39SAndroid Build Coastguard Worker int32_t start, 174*0e209d39SAndroid Build Coastguard Worker int32_t limit, 175*0e209d39SAndroid Build Coastguard Worker int32_t& cursor) override; 176*0e209d39SAndroid Build Coastguard Worker 177*0e209d39SAndroid Build Coastguard Worker /** 178*0e209d39SAndroid Build Coastguard Worker * Returns a string representation of this replacer. If the 179*0e209d39SAndroid Build Coastguard Worker * result of calling this function is passed to the appropriate 180*0e209d39SAndroid Build Coastguard Worker * parser, typically TransliteratorParser, it will produce another 181*0e209d39SAndroid Build Coastguard Worker * replacer that is equal to this one. 182*0e209d39SAndroid Build Coastguard Worker * @param result the string to receive the pattern. Previous 183*0e209d39SAndroid Build Coastguard Worker * contents will be deleted. 184*0e209d39SAndroid Build Coastguard Worker * @param escapeUnprintable if true then convert unprintable 185*0e209d39SAndroid Build Coastguard Worker * character to their hex escape representations, \\uxxxx or 186*0e209d39SAndroid Build Coastguard Worker * \\Uxxxxxxxx. Unprintable characters are defined by 187*0e209d39SAndroid Build Coastguard Worker * Utility.isUnprintable(). 188*0e209d39SAndroid Build Coastguard Worker * @return a reference to 'result'. 189*0e209d39SAndroid Build Coastguard Worker */ 190*0e209d39SAndroid Build Coastguard Worker virtual UnicodeString& toReplacerPattern(UnicodeString& result, 191*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable) const override; 192*0e209d39SAndroid Build Coastguard Worker 193*0e209d39SAndroid Build Coastguard Worker /** 194*0e209d39SAndroid Build Coastguard Worker * Remove any match data. This must be called before performing a 195*0e209d39SAndroid Build Coastguard Worker * set of matches with this segment. 196*0e209d39SAndroid Build Coastguard Worker */ 197*0e209d39SAndroid Build Coastguard Worker void resetMatch(); 198*0e209d39SAndroid Build Coastguard Worker 199*0e209d39SAndroid Build Coastguard Worker /** 200*0e209d39SAndroid Build Coastguard Worker * ICU "poor man's RTTI", returns a UClassID for the actual class. 201*0e209d39SAndroid Build Coastguard Worker */ 202*0e209d39SAndroid Build Coastguard Worker virtual UClassID getDynamicClassID() const override; 203*0e209d39SAndroid Build Coastguard Worker 204*0e209d39SAndroid Build Coastguard Worker /** 205*0e209d39SAndroid Build Coastguard Worker * ICU "poor man's RTTI", returns a UClassID for this class. 206*0e209d39SAndroid Build Coastguard Worker */ 207*0e209d39SAndroid Build Coastguard Worker static UClassID U_EXPORT2 getStaticClassID(); 208*0e209d39SAndroid Build Coastguard Worker 209*0e209d39SAndroid Build Coastguard Worker /** 210*0e209d39SAndroid Build Coastguard Worker * Union the set of all characters that may output by this object 211*0e209d39SAndroid Build Coastguard Worker * into the given set. 212*0e209d39SAndroid Build Coastguard Worker * @param toUnionTo the set into which to union the output characters 213*0e209d39SAndroid Build Coastguard Worker */ 214*0e209d39SAndroid Build Coastguard Worker virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override; 215*0e209d39SAndroid Build Coastguard Worker 216*0e209d39SAndroid Build Coastguard Worker private: 217*0e209d39SAndroid Build Coastguard Worker 218*0e209d39SAndroid Build Coastguard Worker /** 219*0e209d39SAndroid Build Coastguard Worker * The text to be matched. 220*0e209d39SAndroid Build Coastguard Worker */ 221*0e209d39SAndroid Build Coastguard Worker UnicodeString pattern; 222*0e209d39SAndroid Build Coastguard Worker 223*0e209d39SAndroid Build Coastguard Worker /** 224*0e209d39SAndroid Build Coastguard Worker * Context object that maps stand-ins to matcher and replacer 225*0e209d39SAndroid Build Coastguard Worker * objects. 226*0e209d39SAndroid Build Coastguard Worker */ 227*0e209d39SAndroid Build Coastguard Worker const TransliterationRuleData* data; 228*0e209d39SAndroid Build Coastguard Worker 229*0e209d39SAndroid Build Coastguard Worker /** 230*0e209d39SAndroid Build Coastguard Worker * The segment number, 1-based, or 0 if not a segment. 231*0e209d39SAndroid Build Coastguard Worker */ 232*0e209d39SAndroid Build Coastguard Worker int32_t segmentNumber; 233*0e209d39SAndroid Build Coastguard Worker 234*0e209d39SAndroid Build Coastguard Worker /** 235*0e209d39SAndroid Build Coastguard Worker * Start offset, in the match text, of the <em>rightmost</em> 236*0e209d39SAndroid Build Coastguard Worker * match. 237*0e209d39SAndroid Build Coastguard Worker */ 238*0e209d39SAndroid Build Coastguard Worker int32_t matchStart; 239*0e209d39SAndroid Build Coastguard Worker 240*0e209d39SAndroid Build Coastguard Worker /** 241*0e209d39SAndroid Build Coastguard Worker * Limit offset, in the match text, of the <em>rightmost</em> 242*0e209d39SAndroid Build Coastguard Worker * match. 243*0e209d39SAndroid Build Coastguard Worker */ 244*0e209d39SAndroid Build Coastguard Worker int32_t matchLimit; 245*0e209d39SAndroid Build Coastguard Worker 246*0e209d39SAndroid Build Coastguard Worker }; 247*0e209d39SAndroid Build Coastguard Worker 248*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 249*0e209d39SAndroid Build Coastguard Worker 250*0e209d39SAndroid Build Coastguard Worker #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 251*0e209d39SAndroid Build Coastguard Worker 252*0e209d39SAndroid Build Coastguard Worker #endif 253