xref: /aosp_15_r20/external/icu/libandroidicu/include/unicode/uspoof.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 2008-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 *   file name:  uspoof.h
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2008Feb13
14 *   created by: Andy Heninger
15 *
16 *   Unicode Spoof Detection
17 */
18 
19 #ifndef USPOOF_H
20 #define USPOOF_H
21 
22 #include "unicode/ubidi.h"
23 #include "unicode/utypes.h"
24 #include "unicode/uset.h"
25 #include "unicode/parseerr.h"
26 
27 #if !UCONFIG_NO_NORMALIZATION
28 
29 
30 #if U_SHOW_CPLUSPLUS_API
31 #include "unicode/localpointer.h"
32 #include "unicode/unistr.h"
33 #include "unicode/uniset.h"
34 #endif
35 
36 
37 /**
38  * \file
39  * \brief C API: Unicode Security and Spoofing Detection
40  *
41  * <p>
42  * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
43  * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
44  *
45  * <ol>
46  * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
47  * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
48  * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
49  * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
50  * </ol>
51  *
52  * <p>
53  * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
54  * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
55  * content filters.
56  *
57  * <p>
58  * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
59  *
60  * <h2>Confusables</h2>
61  *
62  * <p>
63  * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
64  *
65  * \code{.c}
66  * UErrorCode status = U_ZERO_ERROR;
67  * UChar* str1 = (UChar*) u"Harvest";
68  * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
69  *
70  * USpoofChecker* sc = uspoof_open(&status);
71  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
72  *
73  * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
74  * UBool result = bitmask != 0;
75  * // areConfusable: 1 (status: U_ZERO_ERROR)
76  * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
77  * uspoof_close(sc);
78  * \endcode
79  *
80  * <p>
81  * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
82  * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
83  * confusability test; and the following line extracts the result out of the return value. For best performance,
84  * the instance should be created once (e.g., upon application startup), and the efficient
85  * {@link uspoof_areConfusable} method can be used at runtime.
86  *
87  * If the paragraph direction used to display the strings is known, the bidi function should be used instead:
88  *
89  * \code{.c}
90  * UErrorCode status = U_ZERO_ERROR;
91  * // These strings look identical when rendered in a left-to-right context.
92  * // They look distinct in a right-to-left context.
93  * UChar* str1 = (UChar*) u"A1\u05D0";  // A1א
94  * UChar* str2 = (UChar*) u"A\u05D01";  // Aא1
95  *
96  * USpoofChecker* sc = uspoof_open(&status);
97  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
98  *
99  * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
100  * UBool result = bitmask != 0;
101  * // areBidiConfusable: 1 (status: U_ZERO_ERROR)
102  * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
103  * uspoof_close(sc);
104  * \endcode
105  *
106  * <p>
107  * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
108  * {@link uspoof_close} when the object goes out of scope:
109  *
110  * \code{.cpp}
111  * UErrorCode status = U_ZERO_ERROR;
112  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
113  * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
114  * // ...
115  * \endcode
116  *
117  * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
118  * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
119  * the following snippet is equivalent to the example above:
120  *
121  * \code{.c}
122  * UErrorCode status = U_ZERO_ERROR;
123  * UChar* str1 = (UChar*) u"Harvest";
124  * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
125  *
126  * USpoofChecker* sc = uspoof_open(&status);
127  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
128  *
129  * // Get skeleton 1
130  * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
131  * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
132  * status = U_ZERO_ERROR;
133  * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
134  *
135  * // Get skeleton 2
136  * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
137  * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
138  * status = U_ZERO_ERROR;
139  * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
140  *
141  * // Are the skeletons the same?
142  * UBool result = u_strcmp(skel1, skel2) == 0;
143  * // areConfusable: 1 (status: U_ZERO_ERROR)
144  * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
145  * uspoof_close(sc);
146  * free(skel1);
147  * free(skel2);
148  * \endcode
149  *
150  * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
151  * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
152  *
153  * \code{.c}
154  * UErrorCode status = U_ZERO_ERROR;
155  * #define DICTIONARY_LENGTH 2
156  * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
157  * UChar* skeletons[DICTIONARY_LENGTH];
158  * UChar* str = (UChar*) u"1orern";
159  *
160  * // Setup:
161  * USpoofChecker* sc = uspoof_open(&status);
162  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
163  * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
164  *     UChar* word = dictionary[i];
165  *     int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
166  *     skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
167  *     status = U_ZERO_ERROR;
168  *     uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
169  * }
170  *
171  * // Live Check:
172  * {
173  *     int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
174  *     UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
175  *     status = U_ZERO_ERROR;
176  *     uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
177  *     UBool result = false;
178  *     for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
179  *         result = u_strcmp(skel, skeletons[i]) == 0;
180  *         if (result == true) { break; }
181  *     }
182  *     // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
183  *     printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
184  *     free(skel);
185  * }
186  *
187  * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
188  *     free(skeletons[i]);
189  * }
190  * uspoof_close(sc);
191  * \endcode
192  *
193  * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
194  * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
195  * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
196  *
197  * <h2>Spoof Detection</h2>
198  *
199  * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
200  * string:
201  *
202  * \code{.c}
203  * UErrorCode status = U_ZERO_ERROR;
204  * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
205  *
206  * // Get the default set of allowable characters:
207  * USet* allowed = uset_openEmpty();
208  * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
209  * uset_addAll(allowed, uspoof_getInclusionSet(&status));
210  *
211  * USpoofChecker* sc = uspoof_open(&status);
212  * uspoof_setAllowedChars(sc, allowed, &status);
213  * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
214  *
215  * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
216  * UBool result = bitmask != 0;
217  * // fails checks: 1 (status: U_ZERO_ERROR)
218  * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
219  * uspoof_close(sc);
220  * uset_close(allowed);
221  * \endcode
222  *
223  * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
224  * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
225  * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
226  *
227  * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
228  * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
229  *
230  * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
231  * is available in the returned bitmask.  For complete information, use the {@link uspoof_check2} class of functions
232  * with a {@link USpoofCheckResult} parameter:
233  *
234  * \code{.c}
235  * UErrorCode status = U_ZERO_ERROR;
236  * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
237  *
238  * // Get the default set of allowable characters:
239  * USet* allowed = uset_openEmpty();
240  * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
241  * uset_addAll(allowed, uspoof_getInclusionSet(&status));
242  *
243  * USpoofChecker* sc = uspoof_open(&status);
244  * uspoof_setAllowedChars(sc, allowed, &status);
245  * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
246  *
247  * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
248  * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
249  *
250  * int32_t failures1 = bitmask;
251  * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
252  * assert(failures1 == failures2);
253  * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
254  * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
255  *
256  * // Cleanup:
257  * uspoof_close(sc);
258  * uset_close(allowed);
259  * uspoof_closeCheckResult(checkResult);
260  * \endcode
261  *
262  * C++ users can take advantage of a few syntactical conveniences.  The following snippet is functionally
263  * equivalent to the one above:
264  *
265  * \code{.cpp}
266  * UErrorCode status = U_ZERO_ERROR;
267  * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
268  *
269  * // Get the default set of allowable characters:
270  * UnicodeSet allowed;
271  * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
272  * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
273  *
274  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
275  * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
276  * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
277  *
278  * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
279  * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
280  *
281  * int32_t failures1 = bitmask;
282  * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
283  * assert(failures1 == failures2);
284  * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
285  * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
286  *
287  * // Explicit cleanup not necessary.
288  * \endcode
289  *
290  * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
291  * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
292  *
293  * <ul>
294  * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
295  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
296  * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
297  * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
298  * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
299  * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
300  * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
301  * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
302  * </ul>
303  *
304  * <p>
305  * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
306  * INVISIBLE and MIXED_NUMBERS conditions, you could do:
307  *
308  * \code{.c}
309  * UErrorCode status = U_ZERO_ERROR;
310  * UChar* str = (UChar*) u"8\u09EA";  // 8 mixed with U+09EA BENGALI DIGIT FOUR
311  *
312  * USpoofChecker* sc = uspoof_open(&status);
313  * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
314  *
315  * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
316  * UBool result = bitmask != 0;
317  * // fails checks: 1 (status: U_ZERO_ERROR)
318  * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
319  * uspoof_close(sc);
320  * \endcode
321  *
322  * Here is an example in C++ showing how to compute the restriction level of a string:
323  *
324  * \code{.cpp}
325  * UErrorCode status = U_ZERO_ERROR;
326  * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
327  *
328  * // Get the default set of allowable characters:
329  * UnicodeSet allowed;
330  * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
331  * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
332  *
333  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
334  * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
335  * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
336  * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
337  *
338  * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
339  * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
340  *
341  * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
342  * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
343  * assert((restrictionLevel & bitmask) == restrictionLevel);
344  * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
345  * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
346  * \endcode
347  *
348  * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
349  * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
350  *
351  * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
352  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
353  * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
354  * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
355  * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
356  * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
357  * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
358  * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
359  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
360  * scripts.
361  *
362  * <h2>Advanced bidirectional usage</h2>
363  * If the paragraph direction with which the identifiers will be displayed is not known, there are
364  * multiple options for confusable detection depending on the circumstances.
365  *
366  * <p>
367  * In some circumstances, the only concern is confusion between identifiers displayed with the same
368  * paragraph direction.
369  *
370  * <p>
371  * An example is the case where identifiers are usernames prefixed with the @ symbol.
372  * That symbol will appear to the left in a left-to-right context, and to the right in a
373  * right-to-left context, so that an identifier displayed in a left-to-right context can never be
374  * confused with an identifier displayed in a right-to-left context:
375  * <ul>
376  * <li>
377  * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
378  * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
379  * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
380  * confusable, since they both appear as A_1א@ in a right-to-left context.
381  * </li>
382  * <li>
383  * The username "Mark_" would not be considered confusable with the username "_Mark",
384  * even though the latter would appear as Mark_@ in a right-to-left context, and the
385  * former as \@Mark_ in a left-to-right context.
386  * </li>
387  * </ul>
388  * <p>
389  * In that case, the caller should check for both LTR-confusability and RTL-confusability:
390  *
391  * \code{.cpp}
392  * bool confusableInEitherDirection =
393  *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
394  *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
395  * \endcode
396  *
397  * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
398  * with LTR and RTL with RTL.
399  *
400  * <p>
401  * In cases where confusability between the visual appearances of an identifier displayed in a
402  * left-to-right context with another identifier displayed in a right-to-left context is a concern,
403  * the LTR skeleton of one can be compared with the RTL skeleton of the other.  However, this
404  * very broad definition of confusability may have unexpected results; for instance, it treats the
405  * ASCII identifiers "Mark_" and "_Mark" as confusable.
406  *
407  * <h2>Additional Information</h2>
408  *
409  * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
410  *
411  * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
412  * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
413  * using the same USpoofChecker instance.
414  *
415  * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
416  * thread safe. Those that take a non-const USpoofChecker are not thread safe.
417  *
418  * @stable ICU 4.6
419  */
420 
421 U_CDECL_BEGIN
422 
423 struct USpoofChecker;
424 /** Spoof checker object; forward-declared (incomplete) at this point, created by uspoof_open() and released by uspoof_close().
425  * @stable ICU 4.2
426  */
427 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
428 
429 struct USpoofCheckResult;
430 /** Check result object passed to the uspoof_check2 family of functions; forward-declared (incomplete) at this point.
431  * @see uspoof_openCheckResult
432  * @stable ICU 58
433  */
434 typedef struct USpoofCheckResult USpoofCheckResult;
435 
436 /**
437  * Enum for the kinds of checks that USpoofChecker can perform.
438  * These enum values are used both to select the set of checks that
439  * will be performed, and to report results from the check function. Values are bit flags that can be OR-ed together.
440  *
441  * @stable ICU 4.2
442  */
443 typedef enum USpoofChecks {
444     /**
445      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
446      * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
447      * 4.
448      *
449      * @see uspoof_areConfusable
450      * @stable ICU 4.2
451      */
452     USPOOF_SINGLE_SCRIPT_CONFUSABLE =   1,
453 
454     /**
455      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
456      * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
457      * 39 section 4.
458      *
459      * @see uspoof_areConfusable
460      * @stable ICU 4.2
461      */
462     USPOOF_MIXED_SCRIPT_CONFUSABLE  =   2,
463 
464     /**
465      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
466      * that the two strings are visually confusable and that they are not from the same script but both of them are
467      * single-script strings, according to UTS 39 section 4.
468      *
469      * @see uspoof_areConfusable
470      * @stable ICU 4.2
471      */
472     USPOOF_WHOLE_SCRIPT_CONFUSABLE  =   4,
473 
474     /**
475      * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables.  You may set
476      * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
477      * make {@link uspoof_areConfusable} return only those types of confusables.
478      *
479      * @see uspoof_areConfusable
480      * @see uspoof_getSkeleton
481      * @stable ICU 58
482      */
483     USPOOF_CONFUSABLE               =   USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
484 
485 #ifndef U_HIDE_DEPRECATED_API
486     /**
487       * This flag is deprecated and no longer affects the behavior of SpoofChecker.
488       *
489       * @deprecated ICU 58  Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
490       */
491     USPOOF_ANY_CASE                 =   8,
492 #endif  /* U_HIDE_DEPRECATED_API */
493 
494     /**
495       * Check that an identifier is no looser than the specified RestrictionLevel.
496       * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
497       *
498       * If USPOOF_AUX_INFO is enabled the actual restriction level of the
499       * identifier being tested will also be returned by uspoof_check().
500       *
501       * @see URestrictionLevel
502       * @see uspoof_setRestrictionLevel
503       * @see USPOOF_AUX_INFO
504       *
505       * @stable ICU 51
506       */
507     USPOOF_RESTRICTION_LEVEL        = 16,
508 
509 #ifndef U_HIDE_DEPRECATED_API
510     /** Check that an identifier contains only characters from a
511       * single script (plus chars from the common and inherited scripts.)
512       * Applies only to checks of a single identifier.
513       * @deprecated ICU 51  Use RESTRICTION_LEVEL instead.
514       */
515     USPOOF_SINGLE_SCRIPT            =  USPOOF_RESTRICTION_LEVEL,
516 #endif  /* U_HIDE_DEPRECATED_API */
517 
518     /** Check an identifier for the presence of invisible characters,
519       * such as zero-width spaces, or character sequences that are
520       * likely not to display, such as multiple occurrences of the same
521       * non-spacing mark.  This check does not test the input string as a whole
522       * for conformance to any particular syntax for identifiers.
523       */
524     USPOOF_INVISIBLE                =  32,
525 
526     /** Check that an identifier contains only characters from a specified set
527       * of acceptable characters.  See {@link uspoof_setAllowedChars} and
528       * {@link uspoof_setAllowedLocales}.  Note that a string that fails this check
529       * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
530       */
531     USPOOF_CHAR_LIMIT               =  64,
532 
533     /**
534      * Check that an identifier does not mix numbers from different numbering systems.
535      * For more information, see UTS 39 section 5.3.
536      *
537      * @stable ICU 51
538      */
539     USPOOF_MIXED_NUMBERS            = 128,
540 
541     /**
542      * Check that an identifier does not have a combining character following a character in which that
543      * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
544      *
545      * More specifically, the following characters are forbidden from preceding a U+0307:
546      * <ul>
547      * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
548      * <li>Latin lowercase letter 'l'</li>
549      * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
550      * <li>Any character whose confusable prototype ends with such a character
551      * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
552      * </ul>
553      * In addition, combining characters are allowed between the above characters and U+0307 except those
554      * with combining class 0 or combining class "Above" (230, same class as U+0307).
555      *
556      * This list and the number of combining characters considered by this check may grow over time.
557      *
558      * @stable ICU 62
559      */
560     USPOOF_HIDDEN_OVERLAY            = 256,
561 
562    /**
563      * Enable all spoof checks (every one of the low 16 bits, including bits not currently assigned to a check).
564      *
565      * @stable ICU 4.6
566      */
567     USPOOF_ALL_CHECKS               = 0xFFFF,
568 
569     /**
570       * Enable the return of auxiliary (non-error) information in the
571       * upper bits of the check results value.
572       *
573       * If this "check" is not enabled, the results of {@link uspoof_check} will be
574       * zero when an identifier passes all of the enabled checks.
575       *
576       * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
577       * be zero when an identifier passes all checks.
578       *
579       * @stable ICU 51
580       */
581     USPOOF_AUX_INFO                  = 0x40000000
582 
583     } USpoofChecks;
584 
585 
586     /**
587      * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
588      * for returned identifier restriction levels in check results.
589      *
590      * @stable ICU 51
591      *
592      * @see uspoof_setRestrictionLevel
593      * @see uspoof_check
594      */
595     typedef enum URestrictionLevel {
596         /**
597          * All characters in the string are in the identifier profile and all characters in the string are in the
598          * ASCII range.
599          *
600          * @stable ICU 51
601          */
602         USPOOF_ASCII = 0x10000000,
603         /**
604          * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
605          * the string is single-script, according to the definition in UTS 39 section 5.1.
606          *
607          * @stable ICU 53
608          */
609         USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
610         /**
611          * The string classifies as Single Script, or all characters in the string are in the identifier profile and
612          * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
613          * section 5.1:
614          * <ul>
615          *   <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
616          *   <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
617          *   <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
618          * </ul>
619          * This is the default restriction in ICU.
620          *
621          * @stable ICU 51
622          */
623         USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
624         /**
625          * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
626          * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
627          * Greek, and Cherokee.
628          *
629          * @stable ICU 51
630          */
631         USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
632         /**
633          * All characters in the string are in the identifier profile.  Allow arbitrary mixtures of scripts.
634          *
635          * @stable ICU 51
636          */
637         USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
638         /**
639          * Any valid identifiers, including characters outside of the Identifier Profile.
640          *
641          * @stable ICU 51
642          */
643         USPOOF_UNRESTRICTIVE = 0x60000000,
644         /**
645          * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}
646          * (the level is included in that value when USPOOF_AUX_INFO is enabled).
647          * @stable ICU 53
648          */
649         USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
650 #ifndef U_HIDE_INTERNAL_API
651         /**
652          * An undefined restriction level.
653          * @internal
654          */
655         USPOOF_UNDEFINED_RESTRICTIVE = -1
656 #endif  /* U_HIDE_INTERNAL_API */
657     } URestrictionLevel;
658 
659 /**
660  *  Create a Unicode Spoof Checker, configured to perform all
661  *  checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
662  *  Note that additional checks may be added in the future,
663  *  resulting in changes to the default checking behavior.
664  *
665  *  @param status  The error code, set if this function encounters a problem.
666  *  @return        the newly created Spoof Checker; release it with uspoof_close()
667  *  @stable ICU 4.2
668  */
669 U_CAPI USpoofChecker * U_EXPORT2
670 uspoof_open(UErrorCode *status);
671 
672 
673 /**
674  * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
675  * Inverse of uspoof_serialize().
676  * The memory containing the serialized data must remain valid and unchanged
677  * as long as the spoof checker, or any cloned copies of the spoof checker,
678  * are in use.  Ownership of the memory remains with the caller.
679  * The spoof checker (and any clones) must be closed prior to deleting the
680  * serialized data.
681  *
682  * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
683  * @param length the number of bytes available at data;
684  *               can be more than necessary
685  * @param pActualLength receives the actual number of bytes at data taken up by the data;
686  *                      can be NULL
687  * @param pErrorCode ICU error code
688  * @return the spoof checker; it must be closed before the serialized data is deleted.
689  *
690  * @see uspoof_open
691  * @see uspoof_serialize
692  * @stable ICU 4.2
693  */
694 U_CAPI USpoofChecker * U_EXPORT2
695 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
696                           UErrorCode *pErrorCode);
697 
698 /**
699   * Open a Spoof Checker from the source form of the spoof data.
700   * The input corresponds to the Unicode data file confusables.txt
701   * as described in Unicode Technical Standard #39.  The syntax of the source data
702   * is as described in UTS #39 for this file, and the content of
703   * this file is acceptable input.
704   *
705   * The character encoding of the (char *) input text is UTF-8.
706   *
707   * @param confusables a pointer to the confusable characters definitions,
708   *                    as found in file confusables.txt from unicode.org.
709   * @param confusablesLen The length of the confusables text, or -1 if the
710   *                    input string is zero terminated.
711   * @param confusablesWholeScript
712   *                    Deprecated in ICU 58.  No longer used.
713   * @param confusablesWholeScriptLen
714   *                    Deprecated in ICU 58.  No longer used.
715   * @param errType     In the event of an error in the input, indicates
716   *                    which of the input files contains the error.
717   *                    The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
718   *                    USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
719   *                    zero if no errors are found.
720   * @param pe          In the event of an error in the input, receives the position
721   *                    in the input text (line, offset) of the error.
722   * @param status      an in/out ICU UErrorCode.  Among the possible errors is
723   *                    U_PARSE_ERROR, which is used to report syntax errors
724   *                    in the input.
725   * @return            A spoof checker that uses the rules from the input files.
726   * @stable ICU 4.2
727   */
728 U_CAPI USpoofChecker * U_EXPORT2
729 uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
730                       const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
731                       int32_t *errType, UParseError *pe, UErrorCode *status);
732 
733 
734 /**
735   * Close a Spoof Checker, freeing any memory that was being held by its implementation.
736   * @param sc  The USpoofChecker to close.
737   * @stable ICU 4.2
738   */
739 U_CAPI void U_EXPORT2
740 uspoof_close(USpoofChecker *sc);
741 
742 /**
743  * Clone a Spoof Checker.  The clone will be set to perform the same checks
744  *   as the original source.
745  *
746  * @param sc       The source USpoofChecker
747  * @param status   The error code, set if this function encounters a problem.
748  * @return         The newly created Spoof Checker clone.
749  * @stable ICU 4.2
750  */
751 U_CAPI USpoofChecker * U_EXPORT2
752 uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
753 
754 
755 /**
756  * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
757  * overwrites any checks that may have already been enabled. By default, all checks are enabled.
758  *
759  * To enable specific checks and disable all others,
760  * OR together only the bit constants for the desired checks.
761  * For example, to fail strings containing characters outside of
762  * the set specified by {@link uspoof_setAllowedChars} and
763  * also strings that contain digits from mixed numbering systems:
764  *
765  * <pre>
766  * {@code
767  * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
768  * }
769  * </pre>
770  *
771  * To disable specific checks and enable all others,
772  * start with ALL_CHECKS and "AND away" the not-desired checks.
773  * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
774  * it is good practice to disable the CONFUSABLE check:
775  *
776  * <pre>
777  * {@code
778  * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
779  * }
780  * </pre>
781  *
782  * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
783  * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
784  * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
785  * methods.
786  *
787  * @param sc       The USpoofChecker
788  * @param checks         The set of checks that this spoof checker will perform.
789  *                 The value is a bit set, obtained by OR-ing together
790  *                 values from enum USpoofChecks.
791  * @param status   The error code, set if this function encounters a problem.
792  * @stable ICU 4.2
793  *
794  */
795 U_CAPI void U_EXPORT2
796 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
797 
798 /**
799  * Get the set of checks that this Spoof Checker has been configured to perform.
800  *
801  * @param sc       The USpoofChecker
802  * @param status   The error code, set if this function encounters a problem.
803  * @return         The set of checks that this spoof checker will perform.
804  *                 The value is a bit set, obtained by OR-ing together
805  *                 values from enum USpoofChecks.
806  * @stable ICU 4.2
807  *
808  */
809 U_CAPI int32_t U_EXPORT2
810 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
811 
812 /**
813  * Set the loosest restriction level allowed for strings. The default if this is not called is
814  * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
815  * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
816  * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
817  *
818  * @param sc       The USpoofChecker
819  * @param restrictionLevel The loosest restriction level allowed.
820  * @see URestrictionLevel
821  * @stable ICU 51
822  */
823 U_CAPI void U_EXPORT2
824 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
825 
826 
827 /**
828   * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
829   * @param sc The USpoofChecker
830   * @return The restriction level
831   * @see URestrictionLevel
832   * @stable ICU 51
833   */
834 U_CAPI URestrictionLevel U_EXPORT2
835 uspoof_getRestrictionLevel(const USpoofChecker *sc);
836 
837 /**
838  * Limit characters that are acceptable in identifiers being checked to those
839  * normally used with the languages associated with the specified locales.
840  * Any previously specified list of locales is replaced by the new settings.
841  *
842  * A set of languages is determined from the locale(s), and
843  * from those a set of acceptable Unicode scripts is determined.
844  * Characters from this set of scripts, along with characters from
845  * the "common" and "inherited" Unicode Script categories
846  * will be permitted.
847  *
848  * Supplying an empty string removes all restrictions;
849  * characters from any script will be allowed.
850  *
851  * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
852  * USpoofChecker when calling this function with a non-empty list
853  * of locales.
854  *
855  * The Unicode Set of characters that will be allowed is accessible
856  * via the uspoof_getAllowedChars() function.  uspoof_setAllowedLocales()
857  * will <i>replace</i> any previously applied set of allowed characters.
858  *
859  * Adjustments, such as additions or deletions of certain classes of characters,
860  * can be made to the result of uspoof_setAllowedLocales() by
861  * fetching the resulting set with uspoof_getAllowedChars(),
862  * manipulating it with the Unicode Set API, then resetting the
863  * spoof detector's limits with uspoof_setAllowedChars().
864  *
865  * @param sc           The USpoofChecker
866  * @param localesList  A list of locales, from which the language
867  *                     and associated script are extracted.  The locales
868  *                     are comma-separated if there is more than one.
869  *                     White space may not appear within an individual locale,
870  *                     but is ignored otherwise.
871  *                     The locales are syntactically like those from the
872  *                     HTTP Accept-Language header.
873  *                     If the localesList is empty, no restrictions will be placed on
874  *                     the allowed characters.
875  *
876  * @param status       The error code, set if this function encounters a problem.
877  * @stable ICU 4.2
878  */
879 U_CAPI void U_EXPORT2
880 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
881 
882 /**
883  * Get a list of locales for the scripts that are acceptable in strings
884  *  to be checked.  If no limitations on scripts have been specified,
885  *  an empty string will be returned.
886  *
887  *  uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
888  *
889  *  The format of the returned list is the same as that supplied to
890  *  uspoof_setAllowedLocales(), but the returned list may not be identical
891  *  to the originally specified string; the string may be reformatted,
892  *  and information other than languages from
893  *  the originally specified locales may be omitted.
894  *
895  * @param sc           The USpoofChecker
896  * @param status       The error code, set if this function encounters a problem.
897  * @return             A string containing a list of locales corresponding
898  *                     to the acceptable scripts, formatted like an
899  *                     HTTP Accept-Language value.
900  *
901  * @stable ICU 4.2
902  */
903 U_CAPI const char * U_EXPORT2
904 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
905 
906 
907 /**
908  * Limit the acceptable characters to those specified by a Unicode Set.
909  *   Any previously specified character limit is
910  *   replaced by the new settings.  This includes limits on
911  *   characters that were set with the uspoof_setAllowedLocales() function.
912  *
913  * The USPOOF_CHAR_LIMIT test is automatically enabled for this
914  * USpoofChecker by this function.
915  *
916  * @param sc       The USpoofChecker
917  * @param chars    A Unicode Set containing the list of
918  *                 characters that are permitted.  Ownership of the set
919  *                 remains with the caller.  The incoming set is cloned by
920  *                 this function, so there are no restrictions on modifying
921  *                 or deleting the USet after calling this function.
922  * @param status   The error code, set if this function encounters a problem.
923  * @stable ICU 4.2
924  */
925 U_CAPI void U_EXPORT2
926 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
927 
928 
929 /**
930  * Get a USet for the characters permitted in an identifier.
931  * This corresponds to the limits imposed by the Set Allowed Characters
932  * functions. Limitations imposed by other checks will not be
933  * reflected in the set returned by this function.
934  *
935  * The returned set will be frozen, meaning that it cannot be modified
936  * by the caller.
937  *
938  * Ownership of the returned set remains with the Spoof Detector.  The
939  * returned set will become invalid if the spoof detector is closed,
940  * or if a new set of allowed characters is specified.
941  *
942  *
943  * @param sc       The USpoofChecker
944  * @param status   The error code, set if this function encounters a problem.
945  * @return         A USet containing the characters that are permitted by
946  *                 the USPOOF_CHAR_LIMIT test.
947  * @stable ICU 4.2
948  */
949 U_CAPI const USet * U_EXPORT2
950 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
951 
952 
953 /**
954  * Check the specified string for possible security issues.
955  * The text to be checked will typically be an identifier of some sort.
956  * The set of checks to be performed is specified with uspoof_setChecks().
957  *
958  * \note
959  *   Consider using the newer API, {@link uspoof_check2}, instead.
960  *   The newer API exposes additional information from the check procedure
961  *   and is otherwise identical to this method.
962  *
963  * @param sc      The USpoofChecker
964  * @param id      The identifier to be checked for possible security issues,
965  *                in UTF-16 format.
966  * @param length  the length of the string to be checked, expressed in
967  *                16 bit UTF-16 code units, or -1 if the string is
968  *                zero terminated.
969  * @param position  Deprecated in ICU 51.  Always returns zero.
970  *                Originally, an out parameter for the index of the first
971  *                string position that failed a check.
972  *                This parameter may be NULL.
973  * @param status  The error code, set if an error occurred while attempting to
974  *                perform the check.
975  *                Spoofing or security issues detected with the input string are
976  *                not reported here, but through the function's return value.
977  * @return        An integer value with bits set for any potential security
978  *                or spoofing issues detected.  The bits are defined by
979  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
980  *                will be zero if the input string passes all of the
981  *                enabled checks.
982  * @see uspoof_check2
983  * @stable ICU 4.2
984  */
985 U_CAPI int32_t U_EXPORT2
986 uspoof_check(const USpoofChecker *sc,
987                          const UChar *id, int32_t length,
988                          int32_t *position,
989                          UErrorCode *status);
990 
991 
992 /**
993  * Check the specified string for possible security issues.
994  * The text to be checked will typically be an identifier of some sort.
995  * The set of checks to be performed is specified with uspoof_setChecks().
996  *
997  * \note
998  *   Consider using the newer API, {@link uspoof_check2UTF8}, instead.
999  *   The newer API exposes additional information from the check procedure
1000  *   and is otherwise identical to this method.
1001  *
1002  * @param sc      The USpoofChecker
1003  * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1004  * @param length  the length of the string to be checked, or -1 if the string is
1005  *                zero terminated.
1006  * @param position  Deprecated in ICU 51.  Always returns zero.
1007  *                Originally, an out parameter for the index of the first
1008  *                string position that failed a check.
1009  *                This parameter may be NULL.
1010  * @param status  The error code, set if an error occurred while attempting to
1011  *                perform the check.
1012  *                Spoofing or security issues detected with the input string are
1013  *                not reported here, but through the function's return value.
1014  *                If the input contains invalid UTF-8 sequences,
1015  *                a status of U_INVALID_CHAR_FOUND will be returned.
1016  * @return        An integer value with bits set for any potential security
1017  *                or spoofing issues detected.  The bits are defined by
1018  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1019  *                will be zero if the input string passes all of the
1020  *                enabled checks.
1021  * @see uspoof_check2UTF8
1022  * @stable ICU 4.2
1023  */
1024 U_CAPI int32_t U_EXPORT2
1025 uspoof_checkUTF8(const USpoofChecker *sc,
1026                  const char *id, int32_t length,
1027                  int32_t *position,
1028                  UErrorCode *status);
1029 
1030 
1031 /**
1032  * Check the specified string for possible security issues.
1033  * The text to be checked will typically be an identifier of some sort.
1034  * The set of checks to be performed is specified with uspoof_setChecks().
1035  *
1036  * @param sc      The USpoofChecker
1037  * @param id      The identifier to be checked for possible security issues,
1038  *                in UTF-16 format.
1039  * @param length  the length of the string to be checked, or -1 if the string is
1040  *                zero terminated.
1041  * @param checkResult  An instance of USpoofCheckResult to be filled with
1042  *                details about the identifier.  Can be NULL.
1043  * @param status  The error code, set if an error occurred while attempting to
1044  *                perform the check.
1045  *                Spoofing or security issues detected with the input string are
1046  *                not reported here, but through the function's return value.
1047  * @return        An integer value with bits set for any potential security
1048  *                or spoofing issues detected.  The bits are defined by
1049  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1050  *                will be zero if the input string passes all of the
1051  *                enabled checks.  Any information in this bitmask will be
1052  *                consistent with the information saved in the optional
1053  *                checkResult parameter.
1054  * @see uspoof_openCheckResult
1055  * @see uspoof_check2UTF8
1056  * @see uspoof_check2UnicodeString
1057  * @stable ICU 58
1058  */
1059 U_CAPI int32_t U_EXPORT2
1060 uspoof_check2(const USpoofChecker *sc,
1061     const UChar* id, int32_t length,
1062     USpoofCheckResult* checkResult,
1063     UErrorCode *status);
1064 
1065 /**
1066  * Check the specified string for possible security issues.
1067  * The text to be checked will typically be an identifier of some sort.
1068  * The set of checks to be performed is specified with uspoof_setChecks().
1069  *
1070  * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
1071  * returns additional information about the identifier.  For more
1072  * information, see {@link uspoof_openCheckResult}.
1073  *
1074  * @param sc      The USpoofChecker
1075  * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1076  * @param length  the length of the string to be checked, or -1 if the string is
1077  *                zero terminated.
1078  * @param checkResult  An instance of USpoofCheckResult to be filled with
1079  *                details about the identifier.  Can be NULL.
1080  * @param status  The error code, set if an error occurred while attempting to
1081  *                perform the check.
1082  *                Spoofing or security issues detected with the input string are
1083  *                not reported here, but through the function's return value.
1084  * @return        An integer value with bits set for any potential security
1085  *                or spoofing issues detected.  The bits are defined by
1086  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1087  *                will be zero if the input string passes all of the
1088  *                enabled checks.  Any information in this bitmask will be
1089  *                consistent with the information saved in the optional
1090  *                checkResult parameter.
1091  * @see uspoof_openCheckResult
1092  * @see uspoof_check2
1093  * @see uspoof_check2UnicodeString
1094  * @stable ICU 58
1095  */
1096 U_CAPI int32_t U_EXPORT2
1097 uspoof_check2UTF8(const USpoofChecker *sc,
1098     const char *id, int32_t length,
1099     USpoofCheckResult* checkResult,
1100     UErrorCode *status);
1101 
1102 /**
1103  * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
1104  * information about the identifier.  Information includes:
1105  * <ul>
1106  *   <li>A bitmask of the checks that failed</li>
1107  *   <li>The identifier's restriction level (UTS 39 section 5.2)</li>
1108  *   <li>The set of numerics in the string (UTS 39 section 5.3)</li>
1109  * </ul>
1110  * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
1111  * of {@link uspoof_check2}.
1112  *
1113  * @param status  The error code, set if this function encounters a problem.
1114  * @return        the newly created USpoofCheckResult
1115  * @see uspoof_check2
1116  * @see uspoof_check2UTF8
1117  * @see uspoof_check2UnicodeString
1118  * @stable ICU 58
1119  */
1120 U_CAPI USpoofCheckResult* U_EXPORT2
1121 uspoof_openCheckResult(UErrorCode *status);
1122 
1123 /**
1124  * Close a USpoofCheckResult, freeing any memory that was being held by
1125  *   its implementation.
1126  *
1127  * @param checkResult  The instance of USpoofCheckResult to close
1128  * @stable ICU 58
1129  */
1130 U_CAPI void U_EXPORT2
1131 uspoof_closeCheckResult(USpoofCheckResult *checkResult);
1132 
1133 /**
1134  * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1135  * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
1136  *
1137  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1138  * @param status       The error code, set if an error occurred.
1139  * @return        An integer value with bits set for any potential security
1140  *                or spoofing issues detected.  The bits are defined by
1141  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1142  *                will be zero if the input string passes all of the
1143  *                enabled checks.
1144  * @see uspoof_setChecks
1145  * @stable ICU 58
1146  */
1147 U_CAPI int32_t U_EXPORT2
1148 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
1149 
1150 /**
1151  * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
1152  * was enabled; otherwise, undefined.
1153  *
1154  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1155  * @param status       The error code, set if an error occurred.
1156  * @return             The restriction level contained in the USpoofCheckResult
1157  * @see uspoof_setRestrictionLevel
1158  * @stable ICU 58
1159  */
1160 U_CAPI URestrictionLevel U_EXPORT2
1161 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
1162 
1163 /**
1164  * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
1165  * otherwise, undefined.  The set will contain the zero digit from each decimal number system found
1166  * in the input string.  Ownership of the returned USet remains with the USpoofCheckResult.
1167  * The USet will be freed when {@link uspoof_closeCheckResult} is called.
1168  *
1169  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1170  * @return             The set of numerics contained in the USpoofCheckResult
1171  * @param status       The error code, set if an error occurred.
1172  * @stable ICU 58
1173  */
1174 U_CAPI const USet* U_EXPORT2
1175 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
1176 
1177 
1178 /**
1179  * Check whether two specified strings are visually confusable.
1180  *
1181  * If the strings are confusable, the return value will be nonzero, as long as
1182  * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1183  *
1184  * The bits in the return value correspond to flags for each of the classes of
1185  * confusables applicable to the two input strings.  According to UTS 39
1186  * section 4, the possible flags are:
1187  *
1188  * <ul>
1189  *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1190  *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1191  *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1192  * </ul>
1193  *
1194  * If one or more of the above flags were not listed in uspoof_setChecks(), this
1195  * function will never report that class of confusable.  The check
1196  * {@link USPOOF_CONFUSABLE} enables all three flags.
1197  *
1198  *
1199  * @param sc      The USpoofChecker
1200  * @param id1     The first of the two identifiers to be compared for
1201  *                confusability.  The strings are in UTF-16 format.
1202  * @param length1 the length of the first identifier, expressed in
1203  *                16 bit UTF-16 code units, or -1 if the string is
1204  *                nul terminated.
1205  * @param id2     The second of the two identifiers to be compared for
1206  *                confusability.  The identifiers are in UTF-16 format.
1207  * @param length2 The length of the second identifier, expressed in
1208  *                16 bit UTF-16 code units, or -1 if the string is
1209  *                nul terminated.
1210  * @param status  The error code, set if an error occurred while attempting to
1211  *                perform the check.
1212  *                Confusability of the identifiers is not reported here,
1213  *                but through this function's return value.
1214  * @return        An integer value with bit(s) set corresponding to
1215  *                the type of confusability found, as defined by
1216  *                enum USpoofChecks.  Zero is returned if the identifiers
1217  *                are not confusable.
1218  *
1219  * @stable ICU 4.2
1220  */
1221 U_CAPI int32_t U_EXPORT2
1222 uspoof_areConfusable(const USpoofChecker *sc,
1223                      const UChar *id1, int32_t length1,
1224                      const UChar *id2, int32_t length2,
1225                      UErrorCode *status);
1226 
1227 #ifndef U_HIDE_DRAFT_API
1228 /**
1229  * Check whether two specified strings are visually confusable when
1230  * displayed in a context with the given paragraph direction.
1231  *
1232  * If the strings are confusable, the return value will be nonzero, as long as
1233  * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1234  *
1235  * The bits in the return value correspond to flags for each of the classes of
1236  * confusables applicable to the two input strings.  According to UTS 39
1237  * section 4, the possible flags are:
1238  *
1239  * <ul>
1240  *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1241  *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1242  *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1243  * </ul>
1244  *
1245  * If one or more of the above flags were not listed in uspoof_setChecks(), this
1246  * function will never report that class of confusable.  The check
1247  * {@link USPOOF_CONFUSABLE} enables all three flags.
1248  *
1249  *
1250  * @param sc      The USpoofChecker
1251  * @param direction The paragraph direction with which the identifiers are
1252  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1253  * @param id1     The first of the two identifiers to be compared for
1254  *                confusability.  The strings are in UTF-16 format.
1255  * @param length1 the length of the first identifier, expressed in
1256  *                16 bit UTF-16 code units, or -1 if the string is
1257  *                nul terminated.
1258  * @param id2     The second of the two identifiers to be compared for
1259  *                confusability.  The identifiers are in UTF-16 format.
1260  * @param length2 The length of the second identifier, expressed in
1261  *                16 bit UTF-16 code units, or -1 if the string is
1262  *                nul terminated.
1263  * @param status  The error code, set if an error occurred while attempting to
1264  *                perform the check.
1265  *                Confusability of the identifiers is not reported here,
1266  *                but through this function's return value.
1267  * @return        An integer value with bit(s) set corresponding to
1268  *                the type of confusability found, as defined by
1269  *                enum USpoofChecks.  Zero is returned if the identifiers
1270  *                are not confusable.
1271  *
1272  * @draft ICU 74
1273  */
1274 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
1275                                                   const UChar *id1, int32_t length1,
1276                                                   const UChar *id2, int32_t length2,
1277                                                   UErrorCode *status);
1278 #endif /* U_HIDE_DRAFT_API */
1279 
1280 /**
1281  * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
1282  *
1283  * @param sc      The USpoofChecker
1284  * @param id1     The first of the two identifiers to be compared for
1285  *                confusability.  The strings are in UTF-8 format.
1286  * @param length1 the length of the first identifier, in bytes, or -1
1287  *                if the string is nul terminated.
1288  * @param id2     The second of the two identifiers to be compared for
1289  *                confusability.  The strings are in UTF-8 format.
1290  * @param length2 The length of the second string in bytes, or -1
1291  *                if the string is nul terminated.
1292  * @param status  The error code, set if an error occurred while attempting to
1293  *                perform the check.
1294  *                Confusability of the strings is not reported here,
1295  *                but through this function's return value.
1296  * @return        An integer value with bit(s) set corresponding to
1297  *                the type of confusability found, as defined by
1298  *                enum USpoofChecks.  Zero is returned if the strings
1299  *                are not confusable.
1300  *
1301  * @stable ICU 4.2
1302  *
1303  * @see uspoof_areConfusable
1304  */
1305 U_CAPI int32_t U_EXPORT2
1306 uspoof_areConfusableUTF8(const USpoofChecker *sc,
1307                          const char *id1, int32_t length1,
1308                          const char *id2, int32_t length2,
1309                          UErrorCode *status);
1310 
1311 #ifndef U_HIDE_DRAFT_API
1312 /**
1313  * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
1314  *
1315  * @param sc      The USpoofChecker
1316  * @param direction The paragraph direction with which the identifiers are
1317  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1318  * @param id1     The first of the two identifiers to be compared for
1319  *                confusability.  The strings are in UTF-8 format.
1320  * @param length1 the length of the first identifier, in bytes, or -1
1321  *                if the string is nul terminated.
1322  * @param id2     The second of the two identifiers to be compared for
1323  *                confusability.  The strings are in UTF-8 format.
1324  * @param length2 The length of the second string in bytes, or -1
1325  *                if the string is nul terminated.
1326  * @param status  The error code, set if an error occurred while attempting to
1327  *                perform the check.
1328  *                Confusability of the strings is not reported here,
1329  *                but through this function's return value.
1330  * @return        An integer value with bit(s) set corresponding to
1331  *                the type of confusability found, as defined by
1332  *                enum USpoofChecks.  Zero is returned if the strings
1333  *                are not confusable.
1334  *
1335  * @draft ICU 74
1336  *
1337  * @see uspoof_areBidiConfusable
1338  */
1339 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1340                                                       const char *id1, int32_t length1,
1341                                                       const char *id2, int32_t length2,
1342                                                       UErrorCode *status);
1343 #endif /* U_HIDE_DRAFT_API */
1344 
1345 /**
1346  *  Get the "skeleton" for an identifier.
1347  *  Skeletons are a transformation of the input identifier;
1348  *  Two identifiers are confusable if their skeletons are identical.
1349  *  See Unicode Technical Standard #39 for additional information.
1350  *
1351  *  Using skeletons directly makes it possible to quickly check
1352  *  whether an identifier is confusable with any of some large
1353  *  set of existing identifiers, by creating an efficiently
1354  *  searchable collection of the skeletons.
1355  *
1356  * @param sc      The USpoofChecker
1357  * @param type    Deprecated in ICU 58.  You may pass any number.
1358  *                Originally, controlled which of the Unicode confusable data
1359  *                tables to use.
1360  * @param id      The input identifier whose skeleton will be computed.
1361  * @param length  The length of the input identifier, expressed in 16 bit
1362  *                UTF-16 code units, or -1 if the string is zero terminated.
1363  * @param dest    The output buffer, to receive the skeleton string.
1364  * @param destCapacity  The length of the output buffer, in 16 bit units.
1365  *                The destCapacity may be zero, in which case the function will
1366  *                return the actual length of the skeleton.
1367  * @param status  The error code, set if an error occurred while attempting to
1368  *                perform the check.
1369  * @return        The length of the skeleton string.  The returned length
1370  *                is always that of the complete skeleton, even when the
1371  *                supplied buffer is too small (or of zero length).
1372  *
1373  * @stable ICU 4.2
1374  * @see uspoof_areConfusable
1375  */
1376 U_CAPI int32_t U_EXPORT2
1377 uspoof_getSkeleton(const USpoofChecker *sc,
1378                    uint32_t type,
1379                    const UChar *id,  int32_t length,
1380                    UChar *dest, int32_t destCapacity,
1381                    UErrorCode *status);
1382 
1383 #ifndef U_HIDE_DRAFT_API
1384 /**
1385  *  Get the "bidiSkeleton" for an identifier and a direction.
1386  *  Skeletons are a transformation of the input identifier;
1387  *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1388  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1389  *  See Unicode Technical Standard #39 for additional information:
1390  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1391  *
1392  *  Using skeletons directly makes it possible to quickly check
1393  *  whether an identifier is confusable with any of some large
1394  *  set of existing identifiers, by creating an efficiently
1395  *  searchable collection of the skeletons.
1396  *
1397  * @param sc      The USpoofChecker.
1398  * @param direction The context direction with which the identifier will be
1399  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1400  * @param id      The input identifier whose bidiSkeleton will be computed.
1401  * @param length  The length of the input identifier, expressed in 16 bit
1402  *                UTF-16 code units, or -1 if the string is zero terminated.
1403  * @param dest    The output buffer, to receive the skeleton string.
1404  * @param destCapacity  The length of the output buffer, in 16 bit units.
1405  *                The destCapacity may be zero, in which case the function will
1406  *                return the actual length of the skeleton.
1407  * @param status  The error code, set if an error occurred while attempting to
1408  *                perform the check.
1409  * @return        The length of the skeleton string.  The returned length
1410  *                is always that of the complete skeleton, even when the
1411  *                supplied buffer is too small (or of zero length).
1412  *
1413  * @draft ICU 74
1414  * @see uspoof_areBidiConfusable
1415  */
1416 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
1417                                                 UBiDiDirection direction,
1418                                                 const UChar *id, int32_t length,
1419                                                 UChar *dest, int32_t destCapacity, UErrorCode *status);
1420 #endif /* U_HIDE_DRAFT_API */
1421 
1422 /**
1423  *  Get the "skeleton" for an identifier.
1424  *  Skeletons are a transformation of the input identifier;
1425  *  Two identifiers are confusable if their skeletons are identical.
1426  *  See Unicode Technical Standard #39 for additional information.
1427  *
1428  *  Using skeletons directly makes it possible to quickly check
1429  *  whether an identifier is confusable with any of some large
1430  *  set of existing identifiers, by creating an efficiently
1431  *  searchable collection of the skeletons.
1432  *
1433  * @param sc      The USpoofChecker
1434  * @param type    Deprecated in ICU 58.  You may pass any number.
1435  *                Originally, controlled which of the Unicode confusable data
1436  *                tables to use.
1437  * @param id      The UTF-8 format identifier whose skeleton will be computed.
1438  * @param length  The length of the input string, in bytes,
1439  *                or -1 if the string is zero terminated.
1440  * @param dest    The output buffer, to receive the skeleton string.
1441  * @param destCapacity  The length of the output buffer, in bytes.
1442  *                The destCapacity may be zero, in which case the function will
1443  *                return the actual length of the skeleton.
1444  * @param status  The error code, set if an error occurred while attempting to
1445  *                perform the check.  Possible errors include U_INVALID_CHAR_FOUND
1446  *                for invalid UTF-8 sequences, and
1447  *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1448  *                to hold the complete skeleton.
1449  * @return        The length of the skeleton string, in bytes.  The returned length
1450  *                is always that of the complete skeleton, even when the
1451  *                supplied buffer is too small (or of zero length).
1452  *
1453  * @stable ICU 4.2
1454  */
1455 U_CAPI int32_t U_EXPORT2
1456 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
1457                        uint32_t type,
1458                        const char *id,  int32_t length,
1459                        char *dest, int32_t destCapacity,
1460                        UErrorCode *status);
1461 
1462 #ifndef U_HIDE_DRAFT_API
1463 /**
1464  *  Get the "bidiSkeleton" for an identifier and a direction.
1465  *  Skeletons are a transformation of the input identifier;
1466  *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1467  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1468  *  See Unicode Technical Standard #39 for additional information:
1469  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1470  *
1471  *  Using skeletons directly makes it possible to quickly check
1472  *  whether an identifier is confusable with any of some large
1473  *  set of existing identifiers, by creating an efficiently
1474  *  searchable collection of the skeletons.
1475  *
1476  * @param sc      The USpoofChecker
1477  * @param direction The context direction with which the identifier will be
1478  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1479  * @param id      The UTF-8 format identifier whose bidiSkeleton will be computed.
1480  * @param length  The length of the input string, in bytes,
1481  *                or -1 if the string is zero terminated.
1482  * @param dest    The output buffer, to receive the skeleton string.
1483  * @param destCapacity  The length of the output buffer, in bytes.
1484  *                The destCapacity may be zero, in which case the function will
1485  *                return the actual length of the skeleton.
1486  * @param status  The error code, set if an error occurred while attempting to
1487  *                perform the check.  Possible errors include U_INVALID_CHAR_FOUND
1488  *                for invalid UTF-8 sequences, and
1489  *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1490  *                to hold the complete skeleton.
1491  * @return        The length of the skeleton string, in bytes.  The returned length
1492  *                is always that of the complete skeleton, even when the
1493  *                supplied buffer is too small (or of zero length).
1494  *
1495  * @draft ICU 74
1496  */
1497 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1498                                                     const char *id, int32_t length, char *dest,
1499                                                     int32_t destCapacity, UErrorCode *status);
1500 #endif /* U_HIDE_DRAFT_API */
1501 
1502 /**
1503   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1504   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1505   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1506   *
1507   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1508   * be deleted by the caller.
1509   *
1510   * @param status The error code, set if a problem occurs while creating the set.
1511   * @return       The frozen set of candidate characters; it remains owned by the ICU library.
1512   * @stable ICU 51
1513   */
1514 U_CAPI const USet * U_EXPORT2
1515 uspoof_getInclusionSet(UErrorCode *status);
1516 
1517 /**
1518   * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1519   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1520   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1521   *
1522   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1523   * be deleted by the caller.
1524   *
1525   * @param status The error code, set if a problem occurs while creating the set.
1526   * @return       The frozen set of recommended characters; it remains owned by the ICU library.
1527   * @stable ICU 51
1528   */
1529 U_CAPI const USet * U_EXPORT2
1530 uspoof_getRecommendedSet(UErrorCode *status);
1531 
1532 /**
1533  * Serialize the data for a spoof detector into a chunk of memory.
1534  * The flattened spoof detection tables can later be used to efficiently
1535  * instantiate a new Spoof Detector.
1536  *
1537  * The serialized spoof checker includes only the data compiled from the
1538  * Unicode data tables by uspoof_openFromSource(); it does not include
1539  * any other state or configuration that may have been set.
1540  *
1541  * @param sc   the Spoof Detector whose data is to be serialized.
1542  * @param data a pointer to 32-bit-aligned memory to be filled with the data,
1543  *             can be NULL if capacity==0
1544  * @param capacity the number of bytes available at data,
1545  *                 or 0 for preflighting
1546  * @param status an in/out ICU UErrorCode; possible errors include:
1547  * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
1548  * - U_ILLEGAL_ARGUMENT_ERROR if the data or capacity parameters are bad
1549  * @return the number of bytes written or needed for the spoof data
1550  *
1551  * @see uspoof_openFromSerialized()
1552  * @stable ICU 4.2
1553  */
1554 U_CAPI int32_t U_EXPORT2
1555 uspoof_serialize(USpoofChecker *sc,
1556                  void *data, int32_t capacity,
1557                  UErrorCode *status);
1558 
1559 U_CDECL_END
1560 
1561 #if U_SHOW_CPLUSPLUS_API
1562 
1563 U_NAMESPACE_BEGIN
1564 
1565 /**
1566  * \class LocalUSpoofCheckerPointer
1567  * "Smart pointer" class, closes a USpoofChecker via uspoof_close().
1568  * For most methods see the LocalPointerBase base class.
1569  *
1570  * @see LocalPointerBase
1571  * @see LocalPointer
1572  * @stable ICU 4.4
1573  */
1574 /**
1575  * \cond
1576  * Note: Doxygen gives a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER invocation.
1577  *       For now, the warning is suppressed with a Doxygen cond block.
1578  */
1579 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
1580 /** \endcond */
1581 
1582 /**
1583  * \class LocalUSpoofCheckResultPointer
1584  * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`.
1585  * For most methods see the LocalPointerBase base class.
1586  *
1587  * @see LocalPointerBase
1588  * @see LocalPointer
1589  * @stable ICU 58
1590  */
1591 
1592 /**
1593  * \cond
1594  * Note: Doxygen gives a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER invocation.
1595  *       For now, the warning is suppressed with a Doxygen cond block.
1596  */
1597 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
1598 /** \endcond */
1599 
1600 U_NAMESPACE_END
1601 
1602 /**
1603  * Limit the acceptable characters to those specified by a Unicode Set.
1604  *   Any previously specified character limit is
1605  *   replaced by the new settings.  This includes limits on
1606  *   characters that were set with the uspoof_setAllowedLocales() function.
1607  *
1608  * The USPOOF_CHAR_LIMIT test is automatically enabled for this
1609  * USpoofChecker by this function.
1610  *
1611  * @param sc       The USpoofChecker
1612  * @param chars    A Unicode Set containing the list of
1613  *                 characters that are permitted.  Ownership of the set
1614  *                 remains with the caller.  The incoming set is cloned by
1615  *                 this function, so there are no restrictions on modifying
1616  *                 or deleting the UnicodeSet after calling this function.
1617  * @param status   The error code, set if this function encounters a problem.
1618  * @stable ICU 4.2
1619  */
1620 U_CAPI void U_EXPORT2
1621 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
1622 
1623 
1624 /**
1625  * Get a UnicodeSet for the characters permitted in an identifier.
1626  * This corresponds to the limits imposed by the Set Allowed Characters /
1627  * UnicodeSet functions, such as uspoof_setAllowedUnicodeSet(). Limitations
1628  * imposed by other checks will not be reflected in the set returned by this function.
1629  *
1630  * The returned set will be frozen, meaning that it cannot be modified
1631  * by the caller.
1632  *
1633  * Ownership of the returned set remains with the Spoof Detector.  The
1634  * returned set will become invalid if the spoof detector is closed,
1635  * or if a new set of allowed characters is specified.
1636  *
1637  *
1638  * @param sc       The USpoofChecker
1639  * @param status   The error code, set if this function encounters a problem.
1640  * @return         A UnicodeSet containing the characters that are permitted by
1641  *                 the USPOOF_CHAR_LIMIT test.
1642  * @stable ICU 4.2
1643  */
1644 U_CAPI const icu::UnicodeSet * U_EXPORT2
1645 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
1646 
1647 /**
1648  * Check the specified string for possible security issues.
1649  * The text to be checked will typically be an identifier of some sort.
1650  * The set of checks to be performed is specified with uspoof_setChecks().
1651  *
1652  * \note
1653  *   Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
1654  *   The newer API exposes additional information from the check procedure
1655  *   and is otherwise identical to this method.
1656  *
1657  * @param sc      The USpoofChecker
1658  * @param id      An identifier to be checked for possible security issues.
1659  * @param position  Deprecated in ICU 51.  Always returns zero.
1660  *                Originally, an out parameter for the index of the first
1661  *                string position that failed a check.
1662  *                This parameter may be nullptr.
1663  * @param status  The error code, set if an error occurred while attempting to
1664  *                perform the check.
1665  *                Spoofing or security issues detected with the input string are
1666  *                not reported here, but through the function's return value.
1667  * @return        An integer value with bits set for any potential security
1668  *                or spoofing issues detected.  The bits are defined by
1669  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1670  *                will be zero if the input string passes all of the
1671  *                enabled checks.
1672  * @see uspoof_check2UnicodeString
1673  * @stable ICU 4.2
1674  */
1675 U_CAPI int32_t U_EXPORT2
1676 uspoof_checkUnicodeString(const USpoofChecker *sc,
1677                           const icu::UnicodeString &id,
1678                           int32_t *position,
1679                           UErrorCode *status);
1680 
1681 /**
1682  * Check the specified string for possible security issues.
1683  * The text to be checked will typically be an identifier of some sort.
1684  * The set of checks to be performed is specified with uspoof_setChecks().
1685  *
1686  * @param sc      The USpoofChecker
1687  * @param id      An identifier to be checked for possible security issues.
1688  * @param checkResult  An instance of USpoofCheckResult to be filled with
1689  *                details about the identifier.  Can be nullptr.
1690  * @param status  The error code, set if an error occurred while attempting to
1691  *                perform the check.
1692  *                Spoofing or security issues detected with the input string are
1693  *                not reported here, but through the function's return value.
1694  * @return        An integer value with bits set for any potential security
1695  *                or spoofing issues detected.  The bits are defined by
1696  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1697  *                will be zero if the input string passes all of the
1698  *                enabled checks.  Any information in this bitmask will be
1699  *                consistent with the information saved in the optional
1700  *                checkResult parameter.
1701  * @see uspoof_openCheckResult
1702  * @see uspoof_check2
1703  * @see uspoof_check2UTF8
1704  * @stable ICU 58
1705  */
1706 U_CAPI int32_t U_EXPORT2
1707 uspoof_check2UnicodeString(const USpoofChecker *sc,
1708     const icu::UnicodeString &id,
1709     USpoofCheckResult* checkResult,
1710     UErrorCode *status);
1711 
1712 /**
1713  * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
1714  *
1715  * @param sc      The USpoofChecker
1716  * @param s1      The first of the two identifiers to be compared for
1717  *                confusability.
1718  * @param s2      The second of the two identifiers to be compared for
1719  *                confusability.
1720  * @param status  The error code, set if an error occurred while attempting to
1721  *                perform the check.
1722  *                Confusability of the identifiers is not reported here,
1723  *                but through this function's return value.
1724  * @return        An integer value with bit(s) set corresponding to
1725  *                the type of confusability found, as defined by
1726  *                enum USpoofChecks.  Zero is returned if the identifiers
1727  *                are not confusable.
1728  *
1729  * @stable ICU 4.2
1730  *
1731  * @see uspoof_areConfusable
1732  */
1733 U_CAPI int32_t U_EXPORT2
1734 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
1735                                   const icu::UnicodeString &s1,
1736                                   const icu::UnicodeString &s2,
1737                                   UErrorCode *status);
1738 
1739 #ifndef U_HIDE_DRAFT_API
1740 /**
1741  * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
1742  *
1743  * @param sc      The USpoofChecker
1744  * @param direction The paragraph direction with which the identifiers are
1745  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1746  * @param s1      The first of the two identifiers to be compared for
1747  *                confusability.
1748  * @param s2      The second of the two identifiers to be compared for
1749  *                confusability.
1750  * @param status  The error code, set if an error occurred while attempting to
1751  *                perform the check.
1752  *                Confusability of the identifiers is not reported here,
1753  *                but through this function's return value.
1754  * @return        An integer value with bit(s) set corresponding to
1755  *                the type of confusability found, as defined by
1756  *                enum USpoofChecks.  Zero is returned if the identifiers
1757  *                are not confusable.
1758  *
1759  * @draft ICU 74
1760  *
1761  * @see uspoof_areBidiConfusable
1762  */
1763 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
1764                                                                UBiDiDirection direction,
1765                                                                const icu::UnicodeString &s1,
1766                                                                const icu::UnicodeString &s2,
1767                                                                UErrorCode *status);
1768 #endif /* U_HIDE_DRAFT_API */
1769 
1770 /**
1771  *  Get the "skeleton" for an identifier.
1772  *  Skeletons are a transformation of the input identifier;
1773  *  Two identifiers are confusable if their skeletons are identical.
1774  *  See Unicode Technical Standard #39 for additional information.
1775  *
1776  *  Using skeletons directly makes it possible to quickly check
1777  *  whether an identifier is confusable with any of some large
1778  *  set of existing identifiers, by creating an efficiently
1779  *  searchable collection of the skeletons.
1780  *
1781  * @param sc      The USpoofChecker.
1782  * @param type    Deprecated in ICU 58.  You may pass any number.
1783  *                Originally, controlled which of the Unicode confusable data
1784  *                tables to use.
1785  * @param id      The input identifier whose skeleton will be computed.
1786  * @param dest    The output string, which receives the skeleton.
1787  * @param status  The error code, set if an error occurred while attempting to
1788  *                perform the check.
1789  * @return        A reference to the destination (skeleton) string.
1790  *
1791  * @stable ICU 4.2
1792  */
1793 U_I18N_API icu::UnicodeString & U_EXPORT2
1794 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
1795                                 uint32_t type,
1796                                 const icu::UnicodeString &id,
1797                                 icu::UnicodeString &dest,
1798                                 UErrorCode *status);
1799 
1800 #ifndef U_HIDE_DRAFT_API
1801 /**
1802  *  Get the "bidiSkeleton" for an identifier and a direction.
1803  *  Skeletons are a transformation of the input identifier;
1804  *  Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1805  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1806  *  See Unicode Technical Standard #39 for additional information:
1807  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1808  *
1809  *  Using skeletons directly makes it possible to quickly check
1810  *  whether an identifier is confusable with any of some large
1811  *  set of existing identifiers, by creating an efficiently
1812  *  searchable collection of the skeletons.
1813  *
1814  * @param sc      The USpoofChecker.
1815  * @param direction The context direction with which the identifier will be
1816  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1817  * @param id      The input identifier whose bidiSkeleton will be computed.
1818  * @param dest    The output string, which receives the bidiSkeleton.
1819  * @param status  The error code, set if an error occurred while attempting to
1820  *                perform the check.
1821  * @return        A reference to the destination (skeleton) string.
1822  *
1823  * @draft ICU 74
1824  */
1825 U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
1826     const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
1827     icu::UnicodeString &dest, UErrorCode *status);
1828 #endif /* U_HIDE_DRAFT_API */
1829 
1830 /**
1831   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1832   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1833   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1834   *
1835   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1836   * be deleted by the caller.
1837   *
1838   * @param status The error code, set if a problem occurs while creating the set.
1839   * @return       The frozen set of candidate characters; it remains owned by the ICU library.
1840   * @stable ICU 51
1841   */
1842 U_CAPI const icu::UnicodeSet * U_EXPORT2
1843 uspoof_getInclusionUnicodeSet(UErrorCode *status);
1844 
1845 /**
1846   * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1847   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1848   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1849   *
1850   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1851   * be deleted by the caller.
1852   *
1853   * @param status The error code, set if a problem occurs while creating the set.
1854   * @return       The frozen set of recommended characters; it remains owned by the ICU library.
1855   * @stable ICU 51
1856   */
1857 U_CAPI const icu::UnicodeSet * U_EXPORT2
1858 uspoof_getRecommendedUnicodeSet(UErrorCode *status);
1859 
1860 #endif /* U_SHOW_CPLUSPLUS_API */
1861 
1862 #endif /* UCONFIG_NO_NORMALIZATION */
1863 
1864 #endif   /* USPOOF_H */
1865