1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * C version of UnicodeSet. 19 */ 20 21 22 /** 23 * \file 24 * \brief C API: Unicode Set 25 * 26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 27 */ 28 29 #ifndef __USET_H__ 30 #define __USET_H__ 31 32 #include "unicode/utypes.h" 33 #include "unicode/uchar.h" 34 35 #if U_SHOW_CPLUSPLUS_API 36 #include "unicode/localpointer.h" 37 #endif // U_SHOW_CPLUSPLUS_API 38 39 #ifndef USET_DEFINED 40 41 #ifndef U_IN_DOXYGEN 42 #define USET_DEFINED 43 #endif 44 /** 45 * USet is the C API type corresponding to C++ class UnicodeSet. 46 * Use the uset_* API to manipulate. Create with 47 * uset_open*, and destroy with uset_close. 48 * @stable ICU 2.4 49 */ 50 typedef struct USet USet; 51 #endif 52 53 /** 54 * Bitmask values to be passed to uset_openPatternOptions() or 55 * uset_applyPattern() taking an option parameter. 56 * 57 * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 58 * These case options are mutually exclusive. 59 * 60 * Undefined options bits are ignored, and reserved for future use. 61 * 62 * @stable ICU 2.4 63 */ 64 enum { 65 /** 66 * Ignore white space within patterns unless quoted or escaped. 67 * @stable ICU 2.4 68 */ 69 USET_IGNORE_SPACE = 1, 70 71 /** 72 * Enable case insensitive matching. E.g., "[ab]" with this flag 73 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 74 * match all except 'a', 'A', 'b', and 'B'. 
This performs a full 75 * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'. 76 * 77 * The resulting set is a superset of the input for the code points but 78 * not for the strings. 79 * It performs a case mapping closure of the code points and adds 80 * full case folding strings for the code points, and reduces strings of 81 * the original set to their full case folding equivalents. 82 * 83 * This is designed for case-insensitive matches, for example 84 * in regular expressions. The full code point case closure allows checking of 85 * an input character directly against the closure set. 86 * Strings are matched by comparing the case-folded form from the closure 87 * set with an incremental case folding of the string in question. 88 * 89 * The closure set will also contain single code points if the original 90 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 91 * This is not necessary (that is, redundant) for the above matching method 92 * but results in the same closure sets regardless of whether the original 93 * set contained the code point or a string. 94 * 95 * @stable ICU 2.4 96 */ 97 USET_CASE_INSENSITIVE = 2, 98 99 /** 100 * Adds all case mappings for each element in the set. 101 * This adds the full lower-, title-, and uppercase mappings as well as the full case folding 102 * of each existing element in the set. 103 * 104 * Unlike the “case insensitive” options, this does not perform a closure. 105 * For example, it does not add 'ſ' (U+017F long s) for 's', 106 * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions. 107 * 108 * @stable ICU 3.2 109 */ 110 USET_ADD_CASE_MAPPINGS = 4, 111 112 /** 113 * Enable case insensitive matching. 114 * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings, 115 * which map each code point to one code point, 116 * not full Case_Folding (cf) mappings, which map some code points to multiple code points. 
117 * 118 * This is designed for case-insensitive matches, for example in certain 119 * regular expression implementations where only Simple_Case_Folding mappings are used, 120 * such as in ECMAScript (JavaScript) regular expressions. 121 * 122 * @stable ICU 73 123 */ 124 USET_SIMPLE_CASE_INSENSITIVE = 6 125 }; 126 127 /** 128 * Argument values for whether span() and similar functions continue while 129 * the current character is contained vs. not contained in the set. 130 * 131 * The functionality is straightforward for sets with only single code points, 132 * without strings (which is the common case): 133 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 134 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 135 * - span() and spanBack() partition any string the same way when 136 * alternating between span(USET_SPAN_NOT_CONTAINED) and 137 * span(either "contained" condition). 138 * - Using a complemented (inverted) set and the opposite span conditions 139 * yields the same results. 140 * 141 * When a set contains multi-code point strings, then these statements may not 142 * be true, depending on the strings in the set (for example, whether they 143 * overlap with each other) and the string that is processed. 144 * For a set with strings: 145 * - The complement of the set contains the opposite set of code points, 146 * but the same set of strings. 147 * Therefore, complementing both the set and the span conditions 148 * may yield different results. 149 * - When starting spans at different positions in a string 150 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 151 * because a set string may start before the later position. 152 * - span(USET_SPAN_SIMPLE) may be shorter than 153 * span(USET_SPAN_CONTAINED) because it will not recursively try 154 * all possible paths. 
155 * For example, with a set which contains the three strings "xy", "xya" and "ax", 156 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 157 * span("xyax", USET_SPAN_SIMPLE) will return 3. 158 * span(USET_SPAN_SIMPLE) will never be longer than 159 * span(USET_SPAN_CONTAINED). 160 * - With either "contained" condition, span() and spanBack() may partition 161 * a string in different ways. 162 * For example, with a set which contains the two strings "ab" and "ba", 163 * and when processing the string "aba", 164 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 165 * while spanBack() will yield boundaries of { 0, 1, 3 }. 166 * 167 * Note: If it is important to get the same boundaries whether iterating forward 168 * or backward through a string, then either only span() should be used and 169 * the boundaries cached for backward operation, or an ICU BreakIterator 170 * could be used. 171 * 172 * Note: Unpaired surrogates are treated like surrogate code points. 173 * Similarly, set strings match only on code point boundaries, 174 * never in the middle of a surrogate pair. 175 * Illegal UTF-8 sequences are treated like U+FFFD. 176 * When processing UTF-8 strings, malformed set strings 177 * (strings with unpaired surrogates which cannot be converted to UTF-8) 178 * are ignored. 179 * 180 * @stable ICU 3.8 181 */ 182 typedef enum USetSpanCondition { 183 /** 184 * Continues a span() while there is no set element at the current position. 185 * Increments by one code point at a time. 186 * Stops before the first set element (character or string). 187 * (For code points only, this is like while contains(current)==false). 188 * 189 * When span() returns, the substring between where it started and the position 190 * it returned consists only of characters that are not in the set, 191 * and none of its strings overlap with the span. 
192 * 193 * @stable ICU 3.8 194 */ 195 USET_SPAN_NOT_CONTAINED = 0, 196 /** 197 * Spans the longest substring that is a concatenation of set elements (characters or strings). 198 * (For characters only, this is like while contains(current)==true). 199 * 200 * When span() returns, the substring between where it started and the position 201 * it returned consists only of set elements (characters or strings) that are in the set. 202 * 203 * If a set contains strings, then the span will be the longest substring for which there 204 * exists at least one non-overlapping concatenation of set elements (characters or strings). 205 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 206 * (Java/ICU/Perl regex stops at the first match of an OR.) 207 * 208 * @stable ICU 3.8 209 */ 210 USET_SPAN_CONTAINED = 1, 211 /** 212 * Continues a span() while there is a set element at the current position. 213 * Increments by the longest matching element at each position. 214 * (For characters only, this is like while contains(current)==true). 215 * 216 * When span() returns, the substring between where it started and the position 217 * it returned consists only of set elements (characters or strings) that are in the set. 218 * 219 * If a set only contains single characters, then this is the same 220 * as USET_SPAN_CONTAINED. 221 * 222 * If a set contains strings, then the span will be the longest substring 223 * with a match at each position with the longest single set element (character or string). 224 * 225 * Use this span condition together with other longest-match algorithms, 226 * such as ICU converters (ucnv_getUnicodeSet()). 227 * 228 * @stable ICU 3.8 229 */ 230 USET_SPAN_SIMPLE = 2, 231 #ifndef U_HIDE_DEPRECATED_API 232 /** 233 * One more than the last span condition. 234 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 
235 */ 236 USET_SPAN_CONDITION_COUNT 237 #endif // U_HIDE_DEPRECATED_API 238 } USetSpanCondition; 239 240 enum { 241 /** 242 * Capacity of USerializedSet::staticArray. 243 * Enough for any single-code point set. 244 * Also provides padding for nice sizeof(USerializedSet). 245 * @stable ICU 2.4 246 */ 247 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 248 }; 249 250 /** 251 * A serialized form of a Unicode set. Limited manipulations are 252 * possible directly on a serialized set. See below. 253 * @stable ICU 2.4 254 */ 255 typedef struct USerializedSet { 256 /** 257 * The serialized Unicode Set. 258 * @stable ICU 2.4 259 */ 260 const uint16_t *array; 261 /** 262 * The length of the array that contains BMP characters. 263 * @stable ICU 2.4 264 */ 265 int32_t bmpLength; 266 /** 267 * The total length of the array. 268 * @stable ICU 2.4 269 */ 270 int32_t length; 271 /** 272 * A small buffer for the array to reduce memory allocations. 273 * @stable ICU 2.4 274 */ 275 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 276 } USerializedSet; 277 278 /********************************************************************* 279 * USet API 280 *********************************************************************/ 281 282 /** 283 * Create an empty USet object. 284 * Equivalent to uset_open(1, 0). 285 * @return a newly created USet. The caller must call uset_close() on 286 * it when done. 287 * @stable ICU 4.2 288 */ 289 U_CAPI USet* U_EXPORT2 290 uset_openEmpty(void); 291 292 /** 293 * Creates a USet object that contains the range of characters 294 * start..end, inclusive. If <code>start > end</code> 295 * then an empty set is created (same as using uset_openEmpty()). 296 * @param start first character of the range, inclusive 297 * @param end last character of the range, inclusive 298 * @return a newly created USet. The caller must call uset_close() on 299 * it when done. 
300 * @stable ICU 2.4 301 */ 302 U_CAPI USet* U_EXPORT2 303 uset_open(UChar32 start, UChar32 end); 304 305 /** 306 * Creates a set from the given pattern. See the UnicodeSet class 307 * description for the syntax of the pattern language. 308 * @param pattern a string specifying what characters are in the set 309 * @param patternLength the length of the pattern, or -1 if null 310 * terminated 311 * @param ec the error code * @return a newly created USet. The caller must call uset_close() on * it when done. 312 * @stable ICU 2.4 313 */ 314 U_CAPI USet* U_EXPORT2 315 uset_openPattern(const UChar* pattern, int32_t patternLength, 316 UErrorCode* ec); 317 318 /** 319 * Creates a set from the given pattern. See the UnicodeSet class 320 * description for the syntax of the pattern language. 321 * @param pattern a string specifying what characters are in the set 322 * @param patternLength the length of the pattern, or -1 if null 323 * terminated 324 * @param options bitmask for options to apply to the pattern. 325 * Valid options are USET_IGNORE_SPACE and 326 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 327 * These case options are mutually exclusive. 328 * @param ec the error code * @return a newly created USet. The caller must call uset_close() on * it when done. 329 * @stable ICU 2.4 330 */ 331 U_CAPI USet* U_EXPORT2 332 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 333 uint32_t options, 334 UErrorCode* ec); 335 336 /** 337 * Disposes of the storage used by a USet object. This function should 338 * be called exactly once for objects returned by the uset_open*() functions. 339 * @param set the object to dispose of 340 * @stable ICU 2.4 341 */ 342 U_CAPI void U_EXPORT2 343 uset_close(USet* set); 344 345 #if U_SHOW_CPLUSPLUS_API 346 347 U_NAMESPACE_BEGIN 348 349 /** 350 * \class LocalUSetPointer 351 * "Smart pointer" class, closes a USet via uset_close(). 352 * For most methods see the LocalPointerBase base class.
353 * 354 * @see LocalPointerBase 355 * @see LocalPointer 356 * @stable ICU 4.4 357 */ 358 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 359 360 U_NAMESPACE_END 361 362 #endif 363 364 /** 365 * Returns a copy of this object. 366 * If this set is frozen, then the clone will be frozen as well. 367 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 368 * @param set the original set 369 * @return the newly allocated copy of the set 370 * @see uset_cloneAsThawed 371 * @stable ICU 3.8 372 */ 373 U_CAPI USet * U_EXPORT2 374 uset_clone(const USet *set); 375 376 /** 377 * Determines whether the set has been frozen (made immutable) or not. 378 * See the ICU4J Freezable interface for details. 379 * @param set the set 380 * @return true/false for whether the set has been frozen 381 * @see uset_freeze 382 * @see uset_cloneAsThawed 383 * @stable ICU 3.8 384 */ 385 U_CAPI UBool U_EXPORT2 386 uset_isFrozen(const USet *set); 387 388 /** 389 * Freeze the set (make it immutable). 390 * Once frozen, it cannot be unfrozen and is therefore thread-safe 391 * until it is deleted. 392 * See the ICU4J Freezable interface for details. 393 * Freezing the set may also make some operations faster, for example 394 * uset_contains() and uset_span(). 395 * A frozen set will not be modified. (It remains frozen.) 396 * @param set the set; it is frozen in place 397 * (This function has no return value.) 398 * @see uset_isFrozen 399 * @see uset_cloneAsThawed 400 * @stable ICU 3.8 401 */ 402 U_CAPI void U_EXPORT2 403 uset_freeze(USet *set); 404 405 /** 406 * Clone the set and make the clone mutable. 407 * See the ICU4J Freezable interface for details. 408 * @param set the set 409 * @return the mutable clone 410 * @see uset_freeze 411 * @see uset_isFrozen 412 * @see uset_clone 413 * @stable ICU 3.8 414 */ 415 U_CAPI USet * U_EXPORT2 416 uset_cloneAsThawed(const USet *set); 417 418 /** 419 * Causes the USet object to represent the range <code>start - end</code>.
420 * If <code>start > end</code> then this USet is set to an empty range. 421 * A frozen set will not be modified. 422 * @param set the object to set to the given range 423 * @param start first character in the set, inclusive 424 * @param end last character in the set, inclusive 425 * @stable ICU 3.2 426 */ 427 U_CAPI void U_EXPORT2 428 uset_set(USet* set, 429 UChar32 start, UChar32 end); 430 431 /** 432 * Modifies the set to represent the set specified by the given 433 * pattern. See the UnicodeSet class description for the syntax of 434 * the pattern language. See also the User Guide chapter about UnicodeSet. 435 * <em>Empties the set passed before applying the pattern.</em> 436 * A frozen set will not be modified. 437 * @param set The set to which the pattern is to be applied. 438 * @param pattern A pointer to UChar string specifying what characters are in the set. 439 * The character at pattern[0] must be a '['. 440 * @param patternLength The length of the UChar string. -1 if NUL terminated. 441 * @param options A bitmask for options to apply to the pattern. 442 * Valid options are USET_IGNORE_SPACE and 443 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, 444 * USET_SIMPLE_CASE_INSENSITIVE. 445 * These case options are mutually exclusive. 446 * @param status Returns an error if the pattern cannot be parsed. 447 * @return Upon successful parse, the value is 448 * the index of the character after the closing ']' 449 * of the parsed pattern. 450 * If the status code indicates failure, then the return value 451 * is the index of the error in the source. 452 * 453 * @stable ICU 2.8 454 */ 455 U_CAPI int32_t U_EXPORT2 456 uset_applyPattern(USet *set, 457 const UChar *pattern, int32_t patternLength, 458 uint32_t options, 459 UErrorCode *status); 460 461 /** 462 * Modifies the set to contain those code points which have the given value 463 * for the given binary or enumerated property, as returned by 464 * u_getIntPropertyValue.
Prior contents of this set are lost. 465 * A frozen set will not be modified. 466 * 467 * @param set the object to contain the code points defined by the property 468 * 469 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 470 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 471 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 472 * 473 * @param value a value in the range u_getIntPropertyMinValue(prop).. 474 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 475 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 476 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 477 * categories such as [:L:] to be represented. 478 * 479 * @param ec error code input/output parameter 480 * 481 * @stable ICU 3.2 482 */ 483 U_CAPI void U_EXPORT2 484 uset_applyIntPropertyValue(USet* set, 485 UProperty prop, int32_t value, UErrorCode* ec); 486 487 /** 488 * Modifies the set to contain those code points which have the 489 * given value for the given property. Prior contents of this 490 * set are lost. 491 * A frozen set will not be modified. 492 * 493 * @param set the object to contain the code points defined by the given 494 * property and value alias 495 * 496 * @param prop a string specifying a property alias, either short or long. 497 * The name is matched loosely. See PropertyAliases.txt for names and a 498 * description of loose matching. If the value string is empty, then this 499 * string is interpreted as either a General_Category value alias, a Script 500 * value alias, a binary property alias, or a special ID. Special IDs are 501 * matched loosely and correspond to the following sets: 502 * 503 * "ANY" = [\\u0000-\\U0010FFFF], 504 * "ASCII" = [\\u0000-\\u007F], 505 * "Assigned" = [:^Cn:]. 506 * 507 * @param propLength the length of the prop, or -1 if NUL-terminated 508 * 509 * @param value a string specifying a value alias, either short or long. 510 * The name is matched loosely.
See PropertyValueAliases.txt for names 511 * and a description of loose matching. In addition to aliases listed, 512 * numeric values and canonical combining classes may be expressed 513 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 514 * may also be empty. 515 * 516 * @param valueLength the length of the value, or -1 if NUL-terminated 517 * 518 * @param ec error code input/output parameter 519 * 520 * @stable ICU 3.2 521 */ 522 U_CAPI void U_EXPORT2 523 uset_applyPropertyAlias(USet* set, 524 const UChar *prop, int32_t propLength, 525 const UChar *value, int32_t valueLength, 526 UErrorCode* ec); 527 528 /** 529 * Return true if the given position, in the given pattern, appears 530 * to be the start of a UnicodeSet pattern. 531 * 532 * @param pattern a string specifying the pattern 533 * @param patternLength the length of the pattern, or -1 if NUL-terminated 534 * @param pos the given position 535 * @stable ICU 3.2 536 */ 537 U_CAPI UBool U_EXPORT2 538 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 539 int32_t pos); 540 541 /** 542 * Returns a string representation of this set. If the result of 543 * calling this function is passed to a uset_openPattern(), it 544 * will produce another set that is equal to this one. 545 * @param set the set 546 * @param result the string to receive the rules, may be NULL 547 * @param resultCapacity the capacity of result, may be 0 if result is NULL 548 * @param escapeUnprintable if true then convert unprintable 549 * characters to their hex escape representations, \\uxxxx or 550 * \\Uxxxxxxxx. Unprintable characters are those other than 551 * U+000A, U+0020..U+007E. 552 * @param ec error code. 553 * @return length of string, possibly larger than resultCapacity 554 * @stable ICU 2.4 555 */ 556 U_CAPI int32_t U_EXPORT2 557 uset_toPattern(const USet* set, 558 UChar* result, int32_t resultCapacity, 559 UBool escapeUnprintable, 560 UErrorCode* ec); 561 562 /** 563 * Adds the given character to the given USet.
After this call, 564 * uset_contains(set, c) will return true. 565 * A frozen set will not be modified. 566 * @param set the object to which to add the character 567 * @param c the character to add 568 * @stable ICU 2.4 569 */ 570 U_CAPI void U_EXPORT2 571 uset_add(USet* set, UChar32 c); 572 573 /** 574 * Adds all of the elements in the specified set to this set if 575 * they're not already present. This operation effectively 576 * modifies this set so that its value is the <i>union</i> of the two 577 * sets. The behavior of this operation is unspecified if the specified 578 * collection is modified while the operation is in progress. 579 * A frozen set will not be modified. 580 * 581 * @param set the object to which to add the set 582 * @param additionalSet the source set whose elements are to be added to this set. 583 * @stable ICU 2.6 584 */ 585 U_CAPI void U_EXPORT2 586 uset_addAll(USet* set, const USet *additionalSet); 587 588 /** 589 * Adds the given range of characters to the given USet. After this call, 590 * uset_containsRange(set, start, end) will return true. 591 * A frozen set will not be modified. 592 * @param set the object to which to add the range 593 * @param start the first character of the range to add, inclusive 594 * @param end the last character of the range to add, inclusive 595 * @stable ICU 2.2 596 */ 597 U_CAPI void U_EXPORT2 598 uset_addRange(USet* set, UChar32 start, UChar32 end); 599 600 /** 601 * Adds the given string to the given USet. After this call, 602 * uset_containsString(set, str, strLen) will return true. 603 * A frozen set will not be modified. 604 * @param set the object to which to add the string 605 * @param str the string to add 606 * @param strLen the length of the string or -1 if null terminated. 607 * @stable ICU 2.4 608 */ 609 U_CAPI void U_EXPORT2 610 uset_addString(USet* set, const UChar* str, int32_t strLen); 611 612 /** 613 * Adds each of the characters in this string to the set.
Note: "ch" => {"c", "h"} 614 * If this set already contains any particular character, it has no effect on that character. 615 * A frozen set will not be modified. 616 * @param set the object to which to add the character 617 * @param str the source string 618 * @param strLen the length of the string or -1 if null terminated. 619 * @stable ICU 3.4 620 */ 621 U_CAPI void U_EXPORT2 622 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 623 624 /** 625 * Removes the given character from the given USet. After this call, 626 * uset_contains(set, c) will return false. 627 * A frozen set will not be modified. 628 * @param set the object from which to remove the character 629 * @param c the character to remove 630 * @stable ICU 2.4 631 */ 632 U_CAPI void U_EXPORT2 633 uset_remove(USet* set, UChar32 c); 634 635 /** 636 * Removes the given range of characters from the given USet. After this call, 637 * uset_contains(set, c) will return false for every c in start..end. 638 * A frozen set will not be modified. 639 * @param set the object from which to remove the range 640 * @param start the first character of the range to remove, inclusive 641 * @param end the last character of the range to remove, inclusive 642 * @stable ICU 2.2 643 */ 644 U_CAPI void U_EXPORT2 645 uset_removeRange(USet* set, UChar32 start, UChar32 end); 646 647 /** 648 * Removes the given string from the given USet. After this call, 649 * uset_containsString(set, str, strLen) will return false. 650 * A frozen set will not be modified. 651 * @param set the object from which to remove the string 652 * @param str the string to remove 653 * @param strLen the length of the string or -1 if null terminated. 654 * @stable ICU 2.4 655 */ 656 U_CAPI void U_EXPORT2 657 uset_removeString(USet* set, const UChar* str, int32_t strLen); 658 659 /** 660 * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} 661 * A frozen set will not be modified.
662 * 663 * @param set the object to be modified 664 * @param str the string 665 * @param length the length of the string, or -1 if NUL-terminated 666 * @stable ICU 69 667 */ 668 U_CAPI void U_EXPORT2 669 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); 670 671 /** 672 * Removes from this set all of its elements that are contained in the 673 * specified set. This operation effectively modifies this 674 * set so that its value is the <i>asymmetric set difference</i> of 675 * the two sets. 676 * A frozen set will not be modified. 677 * @param set the object from which the elements are to be removed 678 * @param removeSet the object that defines which elements will be 679 * removed from this set 680 * @stable ICU 3.2 681 */ 682 U_CAPI void U_EXPORT2 683 uset_removeAll(USet* set, const USet* removeSet); 684 685 /** 686 * Retain only the elements in this set that are contained in the 687 * specified range. If <code>start > end</code> then an empty range is 688 * retained, leaving the set empty. This is equivalent to 689 * a boolean logic AND, or a set INTERSECTION. 690 * A frozen set will not be modified. 691 * 692 * @param set the object for which to retain only the specified range 693 * @param start first character, inclusive, of range 694 * @param end last character, inclusive, of range 695 * @stable ICU 3.2 696 */ 697 U_CAPI void U_EXPORT2 698 uset_retain(USet* set, UChar32 start, UChar32 end); 699 700 /** 701 * Retains only the specified string from this set if it is present. 702 * Upon return this set will be empty if it did not contain str, or 703 * will only contain str if it did contain str. 704 * A frozen set will not be modified.
705 * 706 * @param set the object to be modified 707 * @param str the string 708 * @param length the length of the string, or -1 if NUL-terminated 709 * @stable ICU 69 710 */ 711 U_CAPI void U_EXPORT2 712 uset_retainString(USet *set, const UChar *str, int32_t length); 713 714 /** 715 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 716 * A frozen set will not be modified. 717 * 718 * @param set the object to be modified 719 * @param str the string 720 * @param length the length of the string, or -1 if NUL-terminated 721 * @stable ICU 69 722 */ 723 U_CAPI void U_EXPORT2 724 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); 725 726 /** 727 * Retains only the elements in this set that are contained in the 728 * specified set. In other words, removes from this set all of 729 * its elements that are not contained in the specified set. This 730 * operation effectively modifies this set so that its value is 731 * the <i>intersection</i> of the two sets. 732 * A frozen set will not be modified. 733 * 734 * @param set the object on which to perform the retain 735 * @param retain set that defines which elements this set will retain 736 * @stable ICU 3.2 737 */ 738 U_CAPI void U_EXPORT2 739 uset_retainAll(USet* set, const USet* retain); 740 741 /** 742 * Reallocate this object's internal structures to take up the least 743 * possible space, without changing this object's value. 744 * A frozen set will not be modified. 745 * 746 * @param set the object on which to perform the compact 747 * @stable ICU 3.2 748 */ 749 U_CAPI void U_EXPORT2 750 uset_compact(USet* set); 751 752 /** 753 * This is equivalent to 754 * <code>uset_complementRange(set, 0, 0x10FFFF)</code>. 755 * 756 * <strong>Note:</strong> This performs a symmetric difference with all code points 757 * <em>and thus retains all multicharacter strings</em>.
758 * In order to achieve a “code point complement” (all code points minus this set), 759 * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>. 760 * 761 * A frozen set will not be modified. 762 * @param set the set 763 * @stable ICU 2.4 764 */ 765 U_CAPI void U_EXPORT2 766 uset_complement(USet* set); 767 768 /** 769 * Complements the specified range in this set. Any character in 770 * the range will be removed if it is in this set, or will be 771 * added if it is not in this set. If <code>start > end</code> 772 * then an empty range is complemented, leaving the set unchanged. 773 * This is equivalent to a boolean logic XOR. 774 * A frozen set will not be modified. 775 * 776 * @param set the object to be modified 777 * @param start first character, inclusive, of range 778 * @param end last character, inclusive, of range 779 * @stable ICU 69 780 */ 781 U_CAPI void U_EXPORT2 782 uset_complementRange(USet *set, UChar32 start, UChar32 end); 783 784 /** 785 * Complements the specified string in this set. 786 * The string will be removed if it is in this set, or will be added if it is not in this set. 787 * A frozen set will not be modified. 788 * 789 * @param set the object to be modified 790 * @param str the string 791 * @param length the length of the string, or -1 if NUL-terminated 792 * @stable ICU 69 793 */ 794 U_CAPI void U_EXPORT2 795 uset_complementString(USet *set, const UChar *str, int32_t length); 796 797 /** 798 * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} 799 * A frozen set will not be modified. 800 * 801 * @param set the object to be modified 802 * @param str the string 803 * @param length the length of the string, or -1 if NUL-terminated 804 * @stable ICU 69 805 */ 806 U_CAPI void U_EXPORT2 807 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); 808 809 /** 810 * Complements in this set all elements contained in the specified 811 * set.
Any character in the other set will be removed if it is 812 * in this set, or will be added if it is not in this set. 813 * A frozen set will not be modified. 814 * 815 * @param set the object to be modified 816 * @param complement set that defines which elements will be xor'ed 817 * with this set. 818 * @stable ICU 3.2 819 */ 820 U_CAPI void U_EXPORT2 821 uset_complementAll(USet* set, const USet* complement); 822 823 /** 824 * Removes all of the elements from this set. This set will be 825 * empty after this call returns. 826 * A frozen set will not be modified. 827 * @param set the set 828 * @stable ICU 2.4 829 */ 830 U_CAPI void U_EXPORT2 831 uset_clear(USet* set); 832 833 /** 834 * Close this set over the given attribute. For the attribute 835 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 836 * 837 * 1. For each character or string 'a' in this set, all strings or 838 * characters 'b' such that foldCase(a) == foldCase(b) are added 839 * to this set. 840 * 841 * 2. For each string 'e' in the resulting set, if e != 842 * foldCase(e), 'e' will be removed. 843 * 844 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 845 * 846 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 847 * == b denotes that the contents are the same, not pointer 848 * comparison.) 849 * 850 * A frozen set will not be modified. 851 * 852 * @param set the set 853 * 854 * @param attributes bitmask for attributes to close over. 855 * Valid options: 856 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 857 * These case options are mutually exclusive. 858 * Unrelated option bits are ignored. 859 * @stable ICU 4.2 860 */ 861 U_CAPI void U_EXPORT2 862 uset_closeOver(USet* set, int32_t attributes); 863 864 /** 865 * Remove all strings from this set.
866 * 867 * @param set the set 868 * @stable ICU 4.2 869 */ 870 U_CAPI void U_EXPORT2 871 uset_removeAllStrings(USet* set); 872 873 /** 874 * Returns true if the given USet contains no characters and no 875 * strings. 876 * @param set the set 877 * @return true if set is empty 878 * @stable ICU 2.4 879 */ 880 U_CAPI UBool U_EXPORT2 881 uset_isEmpty(const USet* set); 882 883 /** 884 * @param set the set 885 * @return true if this set contains multi-character strings or the empty string. 886 * @stable ICU 70 887 */ 888 U_CAPI UBool U_EXPORT2 889 uset_hasStrings(const USet *set); 890 891 /** 892 * Returns true if the given USet contains the given character. 893 * This function works faster with a frozen set. 894 * @param set the set 895 * @param c The codepoint to check for within the set 896 * @return true if set contains c 897 * @stable ICU 2.4 898 */ 899 U_CAPI UBool U_EXPORT2 900 uset_contains(const USet* set, UChar32 c); 901 902 /** 903 * Returns true if the given USet contains all characters c 904 * where start <= c && c <= end. 905 * @param set the set 906 * @param start the first character of the range to test, inclusive 907 * @param end the last character of the range to test, inclusive 908 * @return true if set contains the range 909 * @stable ICU 2.2 910 */ 911 U_CAPI UBool U_EXPORT2 912 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 913 914 /** 915 * Returns true if the given USet contains the given string. 916 * @param set the set 917 * @param str the string 918 * @param strLen the length of the string or -1 if null terminated. 919 * @return true if set contains str 920 * @stable ICU 2.4 921 */ 922 U_CAPI UBool U_EXPORT2 923 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 924 925 /** 926 * Returns the index of the given character within this set, where 927 * the set is ordered by ascending code point. If the character 928 * is not in this set, return -1. The inverse of this method is 929 * <code>charAt()</code>. 
930 * @param set the set 931 * @param c the character to obtain the index for 932 * @return an index from 0..size()-1, or -1 933 * @stable ICU 3.2 934 */ 935 U_CAPI int32_t U_EXPORT2 936 uset_indexOf(const USet* set, UChar32 c); 937 938 /** 939 * Returns the character at the given index within this set, where 940 * the set is ordered by ascending code point. If the index is 941 * out of range for characters, returns (UChar32)-1. 942 * The inverse of this method is <code>indexOf()</code>. 943 * 944 * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount() 945 * with uset_getItem(), because for each call it skips linearly over <code>index</code> 946 * characters in the ranges. 947 * 948 * @param set the set 949 * @param charIndex an index from 0..size()-1 to obtain the char for 950 * @return the character at the given index, or (UChar32)-1. 951 * @stable ICU 3.2 952 */ 953 U_CAPI UChar32 U_EXPORT2 954 uset_charAt(const USet* set, int32_t charIndex); 955 956 /** 957 * Returns the number of characters and strings contained in this set. 958 * The last (uset_getItemCount() - uset_getRangeCount()) items are strings. 959 * 960 * This is slower than uset_getRangeCount() and uset_getItemCount() because 961 * it counts the code points of all ranges. 962 * 963 * @param set the set 964 * @return a non-negative integer counting the characters and strings 965 * contained in set 966 * @stable ICU 2.4 967 * @see uset_getRangeCount 968 */ 969 U_CAPI int32_t U_EXPORT2 970 uset_size(const USet* set); 971 972 /** 973 * @param set the set 974 * @return the number of ranges in this set. 975 * @stable ICU 70 976 * @see uset_getItemCount 977 * @see uset_getItem 978 * @see uset_size 979 */ 980 U_CAPI int32_t U_EXPORT2 981 uset_getRangeCount(const USet *set); 982 983 /** 984 * Returns the number of items in this set. An item is either a range 985 * of characters or a single multicharacter string. 
986 * @param set the set 987 * @return a non-negative integer counting the character ranges 988 * and/or strings contained in set 989 * @stable ICU 2.4 990 */ 991 U_CAPI int32_t U_EXPORT2 992 uset_getItemCount(const USet* set); 993 994 /** 995 * Returns an item of this set. An item is either a range of 996 * characters or a single multicharacter string (which can be the empty string). 997 * 998 * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0, 999 * and the range is <code>*start</code>..<code>*end</code>. 1000 * 1001 * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then 1002 * this function copies the string into <code>str[strCapacity]</code> and 1003 * returns the length of the string (0 for the empty string). 1004 * 1005 * If <code>itemIndex</code> is out of range, then this function returns -1. 1006 * 1007 * Note that 0 is returned for each range as well as for the empty string. 1008 * 1009 * @param set the set 1010 * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1 1011 * @param start pointer to variable to receive first character in range, inclusive; 1012 * can be NULL for a string item 1013 * @param end pointer to variable to receive last character in range, inclusive; 1014 * can be NULL for a string item 1015 * @param str buffer to receive the string, may be NULL 1016 * @param strCapacity capacity of str, or 0 if str is NULL 1017 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range 1018 * @return the length of the string (0 or >= 2), or 0 if the item is a range, 1019 * or -1 if the itemIndex is out of range 1020 * @stable ICU 2.4 1021 */ 1022 U_CAPI int32_t U_EXPORT2 1023 uset_getItem(const USet* set, int32_t itemIndex, 1024 UChar32* start, UChar32* end, 1025 UChar* str, int32_t strCapacity, 1026 UErrorCode* ec); 1027 1028 /** 1029 * Returns true if set1 contains all the characters and strings 1030 * of set2. 
It answers the question, 'Is set1 a superset of set2?' 1031 * @param set1 set to be checked for containment 1032 * @param set2 set to be checked for containment 1033 * @return true if the test condition is met 1034 * @stable ICU 3.2 1035 */ 1036 U_CAPI UBool U_EXPORT2 1037 uset_containsAll(const USet* set1, const USet* set2); 1038 1039 /** 1040 * Returns true if this set contains all the characters 1041 * of the given string. This does not check containment of grapheme 1042 * clusters, like uset_containsString. 1043 * @param set set of characters to be checked for containment 1044 * @param str string containing codepoints to be checked for containment 1045 * @param strLen the length of the string or -1 if null terminated. 1046 * @return true if the test condition is met 1047 * @stable ICU 3.4 1048 */ 1049 U_CAPI UBool U_EXPORT2 1050 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 1051 1052 /** 1053 * Returns true if set1 contains none of the characters and strings 1054 * of set2. It answers the question, 'Are set1 and set2 disjoint?' 1055 * @param set1 set to be checked for containment 1056 * @param set2 set to be checked for containment 1057 * @return true if the test condition is met 1058 * @stable ICU 3.2 1059 */ 1060 U_CAPI UBool U_EXPORT2 1061 uset_containsNone(const USet* set1, const USet* set2); 1062 1063 /** 1064 * Returns true if set1 contains some of the characters and strings 1065 * of set2. It answers the question, 'Do set1 and set2 have an intersection?' 
1066 * @param set1 set to be checked for containment 1067 * @param set2 set to be checked for containment 1068 * @return true if the test condition is met 1069 * @stable ICU 3.2 1070 */ 1071 U_CAPI UBool U_EXPORT2 1072 uset_containsSome(const USet* set1, const USet* set2); 1073 1074 /** 1075 * Returns the length of the initial substring of the input string which 1076 * consists only of characters and strings that are contained in this set 1077 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1078 * or only of characters and strings that are not contained 1079 * in this set (USET_SPAN_NOT_CONTAINED). 1080 * See USetSpanCondition for details. 1081 * Similar to the strspn() C library function. 1082 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1083 * This function works faster with a frozen set and with a non-negative string length argument. 1084 * @param set the set 1085 * @param s start of the string 1086 * @param length of the string; can be -1 for NUL-terminated 1087 * @param spanCondition specifies the containment condition 1088 * @return the length of the initial substring according to the spanCondition; 1089 * 0 if the start of the string does not fit the spanCondition 1090 * @stable ICU 3.8 1091 * @see USetSpanCondition 1092 */ 1093 U_CAPI int32_t U_EXPORT2 1094 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1095 1096 /** 1097 * Returns the start of the trailing substring of the input string which 1098 * consists only of characters and strings that are contained in this set 1099 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1100 * or only of characters and strings that are not contained 1101 * in this set (USET_SPAN_NOT_CONTAINED). 1102 * See USetSpanCondition for details. 1103 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1104 * This function works faster with a frozen set and with a non-negative string length argument. 
1105 * @param set the set 1106 * @param s start of the string 1107 * @param length of the string; can be -1 for NUL-terminated 1108 * @param spanCondition specifies the containment condition 1109 * @return the start of the trailing substring according to the spanCondition; 1110 * the string length if the end of the string does not fit the spanCondition 1111 * @stable ICU 3.8 1112 * @see USetSpanCondition 1113 */ 1114 U_CAPI int32_t U_EXPORT2 1115 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1116 1117 /** 1118 * Returns the length of the initial substring of the input string which 1119 * consists only of characters and strings that are contained in this set 1120 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1121 * or only of characters and strings that are not contained 1122 * in this set (USET_SPAN_NOT_CONTAINED). 1123 * See USetSpanCondition for details. 1124 * Similar to the strspn() C library function. 1125 * Malformed byte sequences are treated according to contains(0xfffd). 1126 * This function works faster with a frozen set and with a non-negative string length argument. 
1127 * @param set the set 1128 * @param s start of the string (UTF-8) 1129 * @param length of the string; can be -1 for NUL-terminated 1130 * @param spanCondition specifies the containment condition 1131 * @return the length of the initial substring according to the spanCondition; 1132 * 0 if the start of the string does not fit the spanCondition 1133 * @stable ICU 3.8 1134 * @see USetSpanCondition 1135 */ 1136 U_CAPI int32_t U_EXPORT2 1137 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1138 1139 /** 1140 * Returns the start of the trailing substring of the input string which 1141 * consists only of characters and strings that are contained in this set 1142 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1143 * or only of characters and strings that are not contained 1144 * in this set (USET_SPAN_NOT_CONTAINED). 1145 * See USetSpanCondition for details. 1146 * Malformed byte sequences are treated according to contains(0xfffd). 1147 * This function works faster with a frozen set and with a non-negative string length argument. 1148 * @param set the set 1149 * @param s start of the string (UTF-8) 1150 * @param length of the string; can be -1 for NUL-terminated 1151 * @param spanCondition specifies the containment condition 1152 * @return the start of the trailing substring according to the spanCondition; 1153 * the string length if the end of the string does not fit the spanCondition 1154 * @stable ICU 3.8 1155 * @see USetSpanCondition 1156 */ 1157 U_CAPI int32_t U_EXPORT2 1158 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1159 1160 /** 1161 * Returns true if set1 contains all of the characters and strings 1162 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 
1163 * @param set1 set to be checked for containment 1164 * @param set2 set to be checked for containment 1165 * @return true if the test condition is met 1166 * @stable ICU 3.2 1167 */ 1168 U_CAPI UBool U_EXPORT2 1169 uset_equals(const USet* set1, const USet* set2); 1170 1171 /********************************************************************* 1172 * Serialized set API 1173 *********************************************************************/ 1174 1175 /** 1176 * Serializes this set into an array of 16-bit integers. Serialization 1177 * (currently) only records the characters in the set; multicharacter 1178 * strings are ignored. 1179 * 1180 * The array 1181 * has the following format (each line is one 16-bit integer): 1182 * 1183 * length = (n+2*m) | (m!=0?0x8000:0) 1184 * bmpLength = n; present if m!=0 1185 * bmp[0] 1186 * bmp[1] 1187 * ... 1188 * bmp[n-1] 1189 * supp-high[0] 1190 * supp-low[0] 1191 * supp-high[1] 1192 * supp-low[1] 1193 * ... 1194 * supp-high[m-1] 1195 * supp-low[m-1] 1196 * 1197 * The array starts with a header. After the header are n bmp 1198 * code points, then m supplementary code points. Either n or m 1199 * or both may be zero. n+2*m is always <= 0x7FFF. 1200 * 1201 * If there are no supplementary characters (if m==0) then the 1202 * header is one 16-bit integer, 'length', with value n. 1203 * 1204 * If there are supplementary characters (if m!=0) then the header 1205 * is two 16-bit integers. The first, 'length', has value 1206 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1207 * 1208 * After the header the code points are stored in ascending order. 1209 * Supplementary code points are stored as most significant 16 1210 * bits followed by least significant 16 bits. 1211 * 1212 * @param set the set 1213 * @param dest pointer to buffer of destCapacity 16-bit integers. 1214 * May be NULL only if destCapacity is zero. 1215 * @param destCapacity size of dest, or zero. Must not be negative. 
1216 * @param pErrorCode pointer to the error code. Will be set to 1217 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1218 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1219 * @return the total length of the serialized format, including 1220 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1221 * than U_BUFFER_OVERFLOW_ERROR. 1222 * @stable ICU 2.4 1223 */ 1224 U_CAPI int32_t U_EXPORT2 1225 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1226 1227 /** 1228 * Given a serialized array, fill in the given serialized set object. 1229 * @param fillSet pointer to result 1230 * @param src pointer to start of array 1231 * @param srcLength length of array 1232 * @return true if the given array is valid, otherwise false 1233 * @stable ICU 2.4 1234 */ 1235 U_CAPI UBool U_EXPORT2 1236 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1237 1238 /** 1239 * Set the USerializedSet to contain the given character (and nothing 1240 * else). 1241 * @param fillSet pointer to result 1242 * @param c The codepoint to set 1243 * @stable ICU 2.4 1244 */ 1245 U_CAPI void U_EXPORT2 1246 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1247 1248 /** 1249 * Returns true if the given USerializedSet contains the given 1250 * character. 1251 * @param set the serialized set 1252 * @param c The codepoint to check for within the set 1253 * @return true if set contains c 1254 * @stable ICU 2.4 1255 */ 1256 U_CAPI UBool U_EXPORT2 1257 uset_serializedContains(const USerializedSet* set, UChar32 c); 1258 1259 /** 1260 * Returns the number of disjoint ranges of characters contained in 1261 * the given serialized set. Ignores any strings contained in the 1262 * set. 
1263 * @param set the serialized set 1264 * @return a non-negative integer counting the character ranges 1265 * contained in set 1266 * @stable ICU 2.4 1267 */ 1268 U_CAPI int32_t U_EXPORT2 1269 uset_getSerializedRangeCount(const USerializedSet* set); 1270 1271 /** 1272 * Returns a range of characters contained in the given serialized 1273 * set. 1274 * @param set the serialized set 1275 * @param rangeIndex a non-negative integer in the range 0.. 1276 * uset_getSerializedRangeCount(set)-1 1277 * @param pStart pointer to variable to receive first character 1278 * in range, inclusive 1279 * @param pEnd pointer to variable to receive last character in range, 1280 * inclusive 1281 * @return true if rangeIndex is valid, otherwise false 1282 * @stable ICU 2.4 1283 */ 1284 U_CAPI UBool U_EXPORT2 1285 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1286 UChar32* pStart, UChar32* pEnd); 1287 1288 #endif 1289