1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 5*0e209d39SAndroid Build Coastguard Worker * Copyright (c) 2001-2011, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 8*0e209d39SAndroid Build Coastguard Worker * Date Name Description 9*0e209d39SAndroid Build Coastguard Worker * 11/19/2001 aliu Creation. 10*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 11*0e209d39SAndroid Build Coastguard Worker */ 12*0e209d39SAndroid Build Coastguard Worker 13*0e209d39SAndroid Build Coastguard Worker #ifndef ICU_UTIL_H 14*0e209d39SAndroid Build Coastguard Worker #define ICU_UTIL_H 15*0e209d39SAndroid Build Coastguard Worker 16*0e209d39SAndroid Build Coastguard Worker #include "charstr.h" 17*0e209d39SAndroid Build Coastguard Worker #include "unicode/unistr.h" 18*0e209d39SAndroid Build Coastguard Worker #include "unicode/uobject.h" 19*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 20*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------- 21*0e209d39SAndroid Build Coastguard Worker // class ICU_Utility 22*0e209d39SAndroid Build Coastguard Worker // i18n utility functions, scoped into the class ICU_Utility. 23*0e209d39SAndroid Build Coastguard Worker //-------------------------------------------------------------------- 24*0e209d39SAndroid Build Coastguard Worker 25*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 26*0e209d39SAndroid Build Coastguard Worker 27*0e209d39SAndroid Build Coastguard Worker class UnicodeMatcher; 28*0e209d39SAndroid Build Coastguard Worker 29*0e209d39SAndroid Build Coastguard Worker class U_COMMON_API ICU_Utility /* not : public UObject because all methods are static */ { 30*0e209d39SAndroid Build Coastguard Worker public: 31*0e209d39SAndroid Build Coastguard Worker 32*0e209d39SAndroid Build Coastguard Worker /** 33*0e209d39SAndroid Build Coastguard Worker * Append a number to the given UnicodeString in the given radix. 34*0e209d39SAndroid Build Coastguard Worker * Standard digits '0'-'9' are used and letters 'A'-'Z' for 35*0e209d39SAndroid Build Coastguard Worker * radices 11 through 36. 36*0e209d39SAndroid Build Coastguard Worker * @param result the digits of the number are appended here 37*0e209d39SAndroid Build Coastguard Worker * @param n the number to be converted to digits; may be negative. 38*0e209d39SAndroid Build Coastguard Worker * If negative, a '-' is prepended to the digits. 39*0e209d39SAndroid Build Coastguard Worker * @param radix a radix from 2 to 36 inclusive. 40*0e209d39SAndroid Build Coastguard Worker * @param minDigits the minimum number of digits, not including 41*0e209d39SAndroid Build Coastguard Worker * any '-', to produce. Values less than 2 have no effect. One 42*0e209d39SAndroid Build Coastguard Worker * digit is always emitted regardless of this parameter. 43*0e209d39SAndroid Build Coastguard Worker * @return a reference to result 44*0e209d39SAndroid Build Coastguard Worker */ 45*0e209d39SAndroid Build Coastguard Worker static UnicodeString& appendNumber(UnicodeString& result, int32_t n, 46*0e209d39SAndroid Build Coastguard Worker int32_t radix = 10, 47*0e209d39SAndroid Build Coastguard Worker int32_t minDigits = 1); 48*0e209d39SAndroid Build Coastguard Worker 49*0e209d39SAndroid Build Coastguard Worker /** Returns a bogus UnicodeString by value. */ makeBogusString()50*0e209d39SAndroid Build Coastguard Worker static inline UnicodeString makeBogusString() { 51*0e209d39SAndroid Build Coastguard Worker UnicodeString result; 52*0e209d39SAndroid Build Coastguard Worker result.setToBogus(); 53*0e209d39SAndroid Build Coastguard Worker return result; 54*0e209d39SAndroid Build Coastguard Worker } 55*0e209d39SAndroid Build Coastguard Worker 56*0e209d39SAndroid Build Coastguard Worker /** 57*0e209d39SAndroid Build Coastguard Worker * Return true if the character is NOT printable ASCII. 58*0e209d39SAndroid Build Coastguard Worker * The tab, newline and linefeed characters are considered unprintable. 59*0e209d39SAndroid Build Coastguard Worker */ 60*0e209d39SAndroid Build Coastguard Worker static UBool isUnprintable(UChar32 c); 61*0e209d39SAndroid Build Coastguard Worker 62*0e209d39SAndroid Build Coastguard Worker /** 63*0e209d39SAndroid Build Coastguard Worker * @return true for control codes and for surrogate and noncharacter code points 64*0e209d39SAndroid Build Coastguard Worker */ 65*0e209d39SAndroid Build Coastguard Worker static UBool shouldAlwaysBeEscaped(UChar32 c); 66*0e209d39SAndroid Build Coastguard Worker 67*0e209d39SAndroid Build Coastguard Worker /** 68*0e209d39SAndroid Build Coastguard Worker * Escapes one unprintable code point using \uxxxx notation for U+0000 to 69*0e209d39SAndroid Build Coastguard Worker * U+FFFF and \Uxxxxxxxx for U+10000 and above. If the character is 70*0e209d39SAndroid Build Coastguard Worker * printable ASCII, then do nothing and return false. Otherwise, 71*0e209d39SAndroid Build Coastguard Worker * append the escaped notation and return true. 72*0e209d39SAndroid Build Coastguard Worker */ 73*0e209d39SAndroid Build Coastguard Worker static UBool escapeUnprintable(UnicodeString& result, UChar32 c); 74*0e209d39SAndroid Build Coastguard Worker 75*0e209d39SAndroid Build Coastguard Worker /** 76*0e209d39SAndroid Build Coastguard Worker * Escapes one code point using \uxxxx notation 77*0e209d39SAndroid Build Coastguard Worker * for U+0000 to U+FFFF and \Uxxxxxxxx for U+10000 and above. 78*0e209d39SAndroid Build Coastguard Worker * @return result 79*0e209d39SAndroid Build Coastguard Worker */ 80*0e209d39SAndroid Build Coastguard Worker static UnicodeString &escape(UnicodeString& result, UChar32 c); 81*0e209d39SAndroid Build Coastguard Worker 82*0e209d39SAndroid Build Coastguard Worker /** 83*0e209d39SAndroid Build Coastguard Worker * Returns the index of a character, ignoring quoted text. 84*0e209d39SAndroid Build Coastguard Worker * For example, in the string "abc'hide'h", the 'h' in "hide" will not be 85*0e209d39SAndroid Build Coastguard Worker * found by a search for 'h'. 86*0e209d39SAndroid Build Coastguard Worker * @param text text to be searched 87*0e209d39SAndroid Build Coastguard Worker * @param start the beginning index, inclusive; <code>0 <= start 88*0e209d39SAndroid Build Coastguard Worker * <= limit</code>. 89*0e209d39SAndroid Build Coastguard Worker * @param limit the ending index, exclusive; <code>start <= limit 90*0e209d39SAndroid Build Coastguard Worker * <= text.length()</code>. 91*0e209d39SAndroid Build Coastguard Worker * @param c character to search for 92*0e209d39SAndroid Build Coastguard Worker * @return Offset of the first instance of c, or -1 if not found. 93*0e209d39SAndroid Build Coastguard Worker */ 94*0e209d39SAndroid Build Coastguard Worker //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 95*0e209d39SAndroid Build Coastguard Worker // static int32_t quotedIndexOf(const UnicodeString& text, 96*0e209d39SAndroid Build Coastguard Worker // int32_t start, int32_t limit, 97*0e209d39SAndroid Build Coastguard Worker // char16_t c); 98*0e209d39SAndroid Build Coastguard Worker 99*0e209d39SAndroid Build Coastguard Worker /** 100*0e209d39SAndroid Build Coastguard Worker * Skip over a sequence of zero or more white space characters at pos. 101*0e209d39SAndroid Build Coastguard Worker * @param advance if true, advance pos to the first non-white-space 102*0e209d39SAndroid Build Coastguard Worker * character at or after pos, or str.length(), if there is none. 103*0e209d39SAndroid Build Coastguard Worker * Otherwise leave pos unchanged. 104*0e209d39SAndroid Build Coastguard Worker * @return the index of the first non-white-space character at or 105*0e209d39SAndroid Build Coastguard Worker * after pos, or str.length(), if there is none. 106*0e209d39SAndroid Build Coastguard Worker */ 107*0e209d39SAndroid Build Coastguard Worker static int32_t skipWhitespace(const UnicodeString& str, int32_t& pos, 108*0e209d39SAndroid Build Coastguard Worker UBool advance = false); 109*0e209d39SAndroid Build Coastguard Worker 110*0e209d39SAndroid Build Coastguard Worker /** 111*0e209d39SAndroid Build Coastguard Worker * Skip over Pattern_White_Space in a Replaceable. 112*0e209d39SAndroid Build Coastguard Worker * Skipping may be done in the forward or 113*0e209d39SAndroid Build Coastguard Worker * reverse direction. In either case, the leftmost index will be 114*0e209d39SAndroid Build Coastguard Worker * inclusive, and the rightmost index will be exclusive. That is, 115*0e209d39SAndroid Build Coastguard Worker * given a range defined as [start, limit), the call 116*0e209d39SAndroid Build Coastguard Worker * skipWhitespace(text, start, limit) will advance start past leading 117*0e209d39SAndroid Build Coastguard Worker * whitespace, whereas the call skipWhitespace(text, limit, start), 118*0e209d39SAndroid Build Coastguard Worker * will back up limit past trailing whitespace. 119*0e209d39SAndroid Build Coastguard Worker * @param text the text to be analyzed 120*0e209d39SAndroid Build Coastguard Worker * @param pos either the start or limit of a range of 'text', to skip 121*0e209d39SAndroid Build Coastguard Worker * leading or trailing whitespace, respectively 122*0e209d39SAndroid Build Coastguard Worker * @param stop either the limit or start of a range of 'text', to skip 123*0e209d39SAndroid Build Coastguard Worker * leading or trailing whitespace, respectively 124*0e209d39SAndroid Build Coastguard Worker * @return the new start or limit, depending on what was passed in to 125*0e209d39SAndroid Build Coastguard Worker * 'pos' 126*0e209d39SAndroid Build Coastguard Worker */ 127*0e209d39SAndroid Build Coastguard Worker //?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons. 128*0e209d39SAndroid Build Coastguard Worker //? static int32_t skipWhitespace(const Replaceable& text, 129*0e209d39SAndroid Build Coastguard Worker //? int32_t pos, int32_t stop); 130*0e209d39SAndroid Build Coastguard Worker 131*0e209d39SAndroid Build Coastguard Worker /** 132*0e209d39SAndroid Build Coastguard Worker * Parse a single non-whitespace character 'ch', optionally 133*0e209d39SAndroid Build Coastguard Worker * preceded by whitespace. 134*0e209d39SAndroid Build Coastguard Worker * @param id the string to be parsed 135*0e209d39SAndroid Build Coastguard Worker * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the 136*0e209d39SAndroid Build Coastguard Worker * offset of the first character to be parsed. On output, pos[0] 137*0e209d39SAndroid Build Coastguard Worker * is the index after the last parsed character. If the parse 138*0e209d39SAndroid Build Coastguard Worker * fails, pos[0] will be unchanged. 139*0e209d39SAndroid Build Coastguard Worker * @param ch the non-whitespace character to be parsed. 140*0e209d39SAndroid Build Coastguard Worker * @return true if 'ch' is seen preceded by zero or more 141*0e209d39SAndroid Build Coastguard Worker * whitespace characters. 142*0e209d39SAndroid Build Coastguard Worker */ 143*0e209d39SAndroid Build Coastguard Worker static UBool parseChar(const UnicodeString& id, int32_t& pos, char16_t ch); 144*0e209d39SAndroid Build Coastguard Worker 145*0e209d39SAndroid Build Coastguard Worker /** 146*0e209d39SAndroid Build Coastguard Worker * Parse a pattern string starting at offset pos. Keywords are 147*0e209d39SAndroid Build Coastguard Worker * matched case-insensitively. Spaces may be skipped and may be 148*0e209d39SAndroid Build Coastguard Worker * optional or required. Integer values may be parsed, and if 149*0e209d39SAndroid Build Coastguard Worker * they are, they will be returned in the given array. If 150*0e209d39SAndroid Build Coastguard Worker * successful, the offset of the next non-space character is 151*0e209d39SAndroid Build Coastguard Worker * returned. On failure, -1 is returned. 152*0e209d39SAndroid Build Coastguard Worker * @param pattern must only contain lowercase characters, which 153*0e209d39SAndroid Build Coastguard Worker * will match their uppercase equivalents as well. A space 154*0e209d39SAndroid Build Coastguard Worker * character matches one or more required spaces. A '~' character 155*0e209d39SAndroid Build Coastguard Worker * matches zero or more optional spaces. A '#' character matches 156*0e209d39SAndroid Build Coastguard Worker * an integer and stores it in parsedInts, which the caller must 157*0e209d39SAndroid Build Coastguard Worker * ensure has enough capacity. 158*0e209d39SAndroid Build Coastguard Worker * @param parsedInts array to receive parsed integers. Caller 159*0e209d39SAndroid Build Coastguard Worker * must ensure that parsedInts.length is >= the number of '#' 160*0e209d39SAndroid Build Coastguard Worker * signs in 'pattern'. 161*0e209d39SAndroid Build Coastguard Worker * @return the position after the last character parsed, or -1 if 162*0e209d39SAndroid Build Coastguard Worker * the parse failed 163*0e209d39SAndroid Build Coastguard Worker */ 164*0e209d39SAndroid Build Coastguard Worker static int32_t parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit, 165*0e209d39SAndroid Build Coastguard Worker const UnicodeString& pattern, int32_t* parsedInts); 166*0e209d39SAndroid Build Coastguard Worker 167*0e209d39SAndroid Build Coastguard Worker /** 168*0e209d39SAndroid Build Coastguard Worker * Parse a pattern string within the given Replaceable and a parsing 169*0e209d39SAndroid Build Coastguard Worker * pattern. Characters are matched literally and case-sensitively 170*0e209d39SAndroid Build Coastguard Worker * except for the following special characters: 171*0e209d39SAndroid Build Coastguard Worker * 172*0e209d39SAndroid Build Coastguard Worker * ~ zero or more Pattern_White_Space chars 173*0e209d39SAndroid Build Coastguard Worker * 174*0e209d39SAndroid Build Coastguard Worker * If end of pattern is reached with all matches along the way, 175*0e209d39SAndroid Build Coastguard Worker * pos is advanced to the first unparsed index and returned. 176*0e209d39SAndroid Build Coastguard Worker * Otherwise -1 is returned. 177*0e209d39SAndroid Build Coastguard Worker * @param pat pattern that controls parsing 178*0e209d39SAndroid Build Coastguard Worker * @param text text to be parsed, starting at index 179*0e209d39SAndroid Build Coastguard Worker * @param index offset to first character to parse 180*0e209d39SAndroid Build Coastguard Worker * @param limit offset after last character to parse 181*0e209d39SAndroid Build Coastguard Worker * @return index after last parsed character, or -1 on parse failure. 182*0e209d39SAndroid Build Coastguard Worker */ 183*0e209d39SAndroid Build Coastguard Worker static int32_t parsePattern(const UnicodeString& pat, 184*0e209d39SAndroid Build Coastguard Worker const Replaceable& text, 185*0e209d39SAndroid Build Coastguard Worker int32_t index, 186*0e209d39SAndroid Build Coastguard Worker int32_t limit); 187*0e209d39SAndroid Build Coastguard Worker 188*0e209d39SAndroid Build Coastguard Worker /** 189*0e209d39SAndroid Build Coastguard Worker * Parse an integer at pos, either of the form \d+ or of the form 190*0e209d39SAndroid Build Coastguard Worker * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex, 191*0e209d39SAndroid Build Coastguard Worker * or octal format. 192*0e209d39SAndroid Build Coastguard Worker * @param pos INPUT-OUTPUT parameter. On input, the index of the first 193*0e209d39SAndroid Build Coastguard Worker * character to parse. On output, the index of the character after the 194*0e209d39SAndroid Build Coastguard Worker * last parsed character. 195*0e209d39SAndroid Build Coastguard Worker */ 196*0e209d39SAndroid Build Coastguard Worker static int32_t parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit); 197*0e209d39SAndroid Build Coastguard Worker 198*0e209d39SAndroid Build Coastguard Worker /** 199*0e209d39SAndroid Build Coastguard Worker * Parse an integer at pos using only ASCII digits. 200*0e209d39SAndroid Build Coastguard Worker * Base 10 only. 201*0e209d39SAndroid Build Coastguard Worker * @param pos INPUT-OUTPUT parameter. On input, the index of the first 202*0e209d39SAndroid Build Coastguard Worker * character to parse. On output, the index of the character after the 203*0e209d39SAndroid Build Coastguard Worker * last parsed character. 204*0e209d39SAndroid Build Coastguard Worker */ 205*0e209d39SAndroid Build Coastguard Worker static int32_t parseAsciiInteger(const UnicodeString& str, int32_t& pos); 206*0e209d39SAndroid Build Coastguard Worker 207*0e209d39SAndroid Build Coastguard Worker /** 208*0e209d39SAndroid Build Coastguard Worker * Parse a Unicode identifier from the given string at the given 209*0e209d39SAndroid Build Coastguard Worker * position. Return the identifier, or an empty string if there 210*0e209d39SAndroid Build Coastguard Worker * is no identifier. 211*0e209d39SAndroid Build Coastguard Worker * @param str the string to parse 212*0e209d39SAndroid Build Coastguard Worker * @param pos INPUT-OUTPUT parameter. On INPUT, pos is the 213*0e209d39SAndroid Build Coastguard Worker * first character to examine. It must be less than str.length(), 214*0e209d39SAndroid Build Coastguard Worker * and it must not point to a whitespace character. That is, must 215*0e209d39SAndroid Build Coastguard Worker * have pos < str.length() and 216*0e209d39SAndroid Build Coastguard Worker * !UCharacter::isWhitespace(str.char32At(pos)). On 217*0e209d39SAndroid Build Coastguard Worker * OUTPUT, the position after the last parsed character. 218*0e209d39SAndroid Build Coastguard Worker * @return the Unicode identifier, or an empty string if there is 219*0e209d39SAndroid Build Coastguard Worker * no valid identifier at pos. 220*0e209d39SAndroid Build Coastguard Worker */ 221*0e209d39SAndroid Build Coastguard Worker static UnicodeString parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos); 222*0e209d39SAndroid Build Coastguard Worker 223*0e209d39SAndroid Build Coastguard Worker /** 224*0e209d39SAndroid Build Coastguard Worker * Parse an unsigned 31-bit integer at the given offset. Use 225*0e209d39SAndroid Build Coastguard Worker * UCharacter.digit() to parse individual characters into digits. 226*0e209d39SAndroid Build Coastguard Worker * @param text the text to be parsed 227*0e209d39SAndroid Build Coastguard Worker * @param pos INPUT-OUTPUT parameter. On entry, pos is the 228*0e209d39SAndroid Build Coastguard Worker * offset within text at which to start parsing; it should point 229*0e209d39SAndroid Build Coastguard Worker * to a valid digit. On exit, pos is the offset after the last 230*0e209d39SAndroid Build Coastguard Worker * parsed character. If the parse failed, it will be unchanged on 231*0e209d39SAndroid Build Coastguard Worker * exit. Must be >= 0 on entry. 232*0e209d39SAndroid Build Coastguard Worker * @param radix the radix in which to parse; must be >= 2 and <= 233*0e209d39SAndroid Build Coastguard Worker * 36. 234*0e209d39SAndroid Build Coastguard Worker * @return a non-negative parsed number, or -1 upon parse failure. 235*0e209d39SAndroid Build Coastguard Worker * Parse fails if there are no digits, that is, if pos does not 236*0e209d39SAndroid Build Coastguard Worker * point to a valid digit on entry, or if the number to be parsed 237*0e209d39SAndroid Build Coastguard Worker * does not fit into a 31-bit unsigned integer. 238*0e209d39SAndroid Build Coastguard Worker */ 239*0e209d39SAndroid Build Coastguard Worker static int32_t parseNumber(const UnicodeString& text, 240*0e209d39SAndroid Build Coastguard Worker int32_t& pos, int8_t radix); 241*0e209d39SAndroid Build Coastguard Worker 242*0e209d39SAndroid Build Coastguard Worker static void appendToRule(UnicodeString& rule, 243*0e209d39SAndroid Build Coastguard Worker UChar32 c, 244*0e209d39SAndroid Build Coastguard Worker UBool isLiteral, 245*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable, 246*0e209d39SAndroid Build Coastguard Worker UnicodeString& quoteBuf); 247*0e209d39SAndroid Build Coastguard Worker 248*0e209d39SAndroid Build Coastguard Worker static void appendToRule(UnicodeString& rule, 249*0e209d39SAndroid Build Coastguard Worker const UnicodeString& text, 250*0e209d39SAndroid Build Coastguard Worker UBool isLiteral, 251*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable, 252*0e209d39SAndroid Build Coastguard Worker UnicodeString& quoteBuf); 253*0e209d39SAndroid Build Coastguard Worker 254*0e209d39SAndroid Build Coastguard Worker static void appendToRule(UnicodeString& rule, 255*0e209d39SAndroid Build Coastguard Worker const UnicodeMatcher* matcher, 256*0e209d39SAndroid Build Coastguard Worker UBool escapeUnprintable, 257*0e209d39SAndroid Build Coastguard Worker UnicodeString& quoteBuf); 258*0e209d39SAndroid Build Coastguard Worker 259*0e209d39SAndroid Build Coastguard Worker private: 260*0e209d39SAndroid Build Coastguard Worker // do not instantiate 261*0e209d39SAndroid Build Coastguard Worker ICU_Utility() = delete; 262*0e209d39SAndroid Build Coastguard Worker }; 263*0e209d39SAndroid Build Coastguard Worker 264*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 265*0e209d39SAndroid Build Coastguard Worker 266*0e209d39SAndroid Build Coastguard Worker #endif 267*0e209d39SAndroid Build Coastguard Worker //eof 268