1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others. 2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html 3*0e209d39SAndroid Build Coastguard Worker /* 4*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 5*0e209d39SAndroid Build Coastguard Worker * Copyright (C) 2005-2013, International Business Machines 6*0e209d39SAndroid Build Coastguard Worker * Corporation and others. All Rights Reserved. 7*0e209d39SAndroid Build Coastguard Worker ********************************************************************** 8*0e209d39SAndroid Build Coastguard Worker * file name: ucsdet.h 9*0e209d39SAndroid Build Coastguard Worker * encoding: UTF-8 10*0e209d39SAndroid Build Coastguard Worker * indentation:4 11*0e209d39SAndroid Build Coastguard Worker * 12*0e209d39SAndroid Build Coastguard Worker * created on: 2005Aug04 13*0e209d39SAndroid Build Coastguard Worker * created by: Andy Heninger 14*0e209d39SAndroid Build Coastguard Worker * 15*0e209d39SAndroid Build Coastguard Worker * ICU Character Set Detection, API for C 16*0e209d39SAndroid Build Coastguard Worker * 17*0e209d39SAndroid Build Coastguard Worker * Draft version 18 Oct 2005 18*0e209d39SAndroid Build Coastguard Worker * 19*0e209d39SAndroid Build Coastguard Worker */ 20*0e209d39SAndroid Build Coastguard Worker 21*0e209d39SAndroid Build Coastguard Worker #ifndef __UCSDET_H 22*0e209d39SAndroid Build Coastguard Worker #define __UCSDET_H 23*0e209d39SAndroid Build Coastguard Worker 24*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h" 25*0e209d39SAndroid Build Coastguard Worker 26*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_CONVERSION 27*0e209d39SAndroid Build Coastguard Worker 28*0e209d39SAndroid Build Coastguard Worker #include "unicode/uenum.h" 29*0e209d39SAndroid Build Coastguard Worker 30*0e209d39SAndroid Build Coastguard Worker #if U_SHOW_CPLUSPLUS_API 31*0e209d39SAndroid Build Coastguard Worker #include "unicode/localpointer.h" 32*0e209d39SAndroid Build Coastguard Worker #endif // U_SHOW_CPLUSPLUS_API 33*0e209d39SAndroid Build Coastguard Worker 34*0e209d39SAndroid Build Coastguard Worker /** 35*0e209d39SAndroid Build Coastguard Worker * \file 36*0e209d39SAndroid Build Coastguard Worker * \brief C API: Charset Detection API 37*0e209d39SAndroid Build Coastguard Worker * 38*0e209d39SAndroid Build Coastguard Worker * This API provides a facility for detecting the 39*0e209d39SAndroid Build Coastguard Worker * charset or encoding of character data in an unknown text format. 40*0e209d39SAndroid Build Coastguard Worker * The input data can be from an array of bytes. 41*0e209d39SAndroid Build Coastguard Worker * <p> 42*0e209d39SAndroid Build Coastguard Worker * Character set detection is at best an imprecise operation. The detection 43*0e209d39SAndroid Build Coastguard Worker * process will attempt to identify the charset that best matches the characteristics 44*0e209d39SAndroid Build Coastguard Worker * of the byte data, but the process is partly statistical in nature, and 45*0e209d39SAndroid Build Coastguard Worker * the results can not be guaranteed to always be correct. 46*0e209d39SAndroid Build Coastguard Worker * <p> 47*0e209d39SAndroid Build Coastguard Worker * For best accuracy in charset detection, the input data should be primarily 48*0e209d39SAndroid Build Coastguard Worker * in a single language, and a minimum of a few hundred bytes worth of plain text 49*0e209d39SAndroid Build Coastguard Worker * in the language are needed. The detection process will attempt to 50*0e209d39SAndroid Build Coastguard Worker * ignore html or xml style markup that could otherwise obscure the content. 51*0e209d39SAndroid Build Coastguard Worker * <p> 52*0e209d39SAndroid Build Coastguard Worker * An alternative to the ICU Charset Detector is the 53*0e209d39SAndroid Build Coastguard Worker * Compact Encoding Detector, https://github.com/google/compact_enc_det. 54*0e209d39SAndroid Build Coastguard Worker * It often gives more accurate results, especially with short input samples. 55*0e209d39SAndroid Build Coastguard Worker */ 56*0e209d39SAndroid Build Coastguard Worker 57*0e209d39SAndroid Build Coastguard Worker 58*0e209d39SAndroid Build Coastguard Worker struct UCharsetDetector; 59*0e209d39SAndroid Build Coastguard Worker /** 60*0e209d39SAndroid Build Coastguard Worker * Structure representing a charset detector 61*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 62*0e209d39SAndroid Build Coastguard Worker */ 63*0e209d39SAndroid Build Coastguard Worker typedef struct UCharsetDetector UCharsetDetector; 64*0e209d39SAndroid Build Coastguard Worker 65*0e209d39SAndroid Build Coastguard Worker struct UCharsetMatch; 66*0e209d39SAndroid Build Coastguard Worker /** 67*0e209d39SAndroid Build Coastguard Worker * Opaque structure representing a match that was identified 68*0e209d39SAndroid Build Coastguard Worker * from a charset detection operation. 69*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 70*0e209d39SAndroid Build Coastguard Worker */ 71*0e209d39SAndroid Build Coastguard Worker typedef struct UCharsetMatch UCharsetMatch; 72*0e209d39SAndroid Build Coastguard Worker 73*0e209d39SAndroid Build Coastguard Worker /** 74*0e209d39SAndroid Build Coastguard Worker * Open a charset detector. 75*0e209d39SAndroid Build Coastguard Worker * 76*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions occurring during the open 77*0e209d39SAndroid Build Coastguard Worker * operation are reported back in this variable. 78*0e209d39SAndroid Build Coastguard Worker * @return the newly opened charset detector. 79*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 80*0e209d39SAndroid Build Coastguard Worker */ 81*0e209d39SAndroid Build Coastguard Worker U_CAPI UCharsetDetector * U_EXPORT2 82*0e209d39SAndroid Build Coastguard Worker ucsdet_open(UErrorCode *status); 83*0e209d39SAndroid Build Coastguard Worker 84*0e209d39SAndroid Build Coastguard Worker /** 85*0e209d39SAndroid Build Coastguard Worker * Close a charset detector. All storage and any other resources 86*0e209d39SAndroid Build Coastguard Worker * owned by this charset detector will be released. Failure to 87*0e209d39SAndroid Build Coastguard Worker * close a charset detector when finished with it can result in 88*0e209d39SAndroid Build Coastguard Worker * memory leaks in the application. 89*0e209d39SAndroid Build Coastguard Worker * 90*0e209d39SAndroid Build Coastguard Worker * @param ucsd The charset detector to be closed. 91*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 92*0e209d39SAndroid Build Coastguard Worker */ 93*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2 94*0e209d39SAndroid Build Coastguard Worker ucsdet_close(UCharsetDetector *ucsd); 95*0e209d39SAndroid Build Coastguard Worker 96*0e209d39SAndroid Build Coastguard Worker #if U_SHOW_CPLUSPLUS_API 97*0e209d39SAndroid Build Coastguard Worker 98*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN 99*0e209d39SAndroid Build Coastguard Worker 100*0e209d39SAndroid Build Coastguard Worker /** 101*0e209d39SAndroid Build Coastguard Worker * \class LocalUCharsetDetectorPointer 102*0e209d39SAndroid Build Coastguard Worker * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 103*0e209d39SAndroid Build Coastguard Worker * For most methods see the LocalPointerBase base class. 104*0e209d39SAndroid Build Coastguard Worker * 105*0e209d39SAndroid Build Coastguard Worker * @see LocalPointerBase 106*0e209d39SAndroid Build Coastguard Worker * @see LocalPointer 107*0e209d39SAndroid Build Coastguard Worker * @stable ICU 4.4 108*0e209d39SAndroid Build Coastguard Worker */ 109*0e209d39SAndroid Build Coastguard Worker U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 110*0e209d39SAndroid Build Coastguard Worker 111*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END 112*0e209d39SAndroid Build Coastguard Worker 113*0e209d39SAndroid Build Coastguard Worker #endif 114*0e209d39SAndroid Build Coastguard Worker 115*0e209d39SAndroid Build Coastguard Worker /** 116*0e209d39SAndroid Build Coastguard Worker * Set the input byte data whose charset is to detected. 117*0e209d39SAndroid Build Coastguard Worker * 118*0e209d39SAndroid Build Coastguard Worker * Ownership of the input text byte array remains with the caller. 119*0e209d39SAndroid Build Coastguard Worker * The input string must not be altered or deleted until the charset 120*0e209d39SAndroid Build Coastguard Worker * detector is either closed or reset to refer to different input text. 121*0e209d39SAndroid Build Coastguard Worker * 122*0e209d39SAndroid Build Coastguard Worker * @param ucsd the charset detector to be used. 123*0e209d39SAndroid Build Coastguard Worker * @param textIn the input text of unknown encoding. . 124*0e209d39SAndroid Build Coastguard Worker * @param len the length of the input text, or -1 if the text 125*0e209d39SAndroid Build Coastguard Worker * is NUL terminated. 126*0e209d39SAndroid Build Coastguard Worker * @param status any error conditions are reported back in this variable. 127*0e209d39SAndroid Build Coastguard Worker * 128*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 129*0e209d39SAndroid Build Coastguard Worker */ 130*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2 131*0e209d39SAndroid Build Coastguard Worker ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 132*0e209d39SAndroid Build Coastguard Worker 133*0e209d39SAndroid Build Coastguard Worker 134*0e209d39SAndroid Build Coastguard Worker /** Set the declared encoding for charset detection. 135*0e209d39SAndroid Build Coastguard Worker * The declared encoding of an input text is an encoding obtained 136*0e209d39SAndroid Build Coastguard Worker * by the user from an http header or xml declaration or similar source that 137*0e209d39SAndroid Build Coastguard Worker * can be provided as an additional hint to the charset detector. 138*0e209d39SAndroid Build Coastguard Worker * 139*0e209d39SAndroid Build Coastguard Worker * How and whether the declared encoding will be used during the 140*0e209d39SAndroid Build Coastguard Worker * detection process is TBD. 141*0e209d39SAndroid Build Coastguard Worker * 142*0e209d39SAndroid Build Coastguard Worker * @param ucsd the charset detector to be used. 143*0e209d39SAndroid Build Coastguard Worker * @param encoding an encoding for the current data obtained from 144*0e209d39SAndroid Build Coastguard Worker * a header or declaration or other source outside 145*0e209d39SAndroid Build Coastguard Worker * of the byte data itself. 146*0e209d39SAndroid Build Coastguard Worker * @param length the length of the encoding name, or -1 if the name string 147*0e209d39SAndroid Build Coastguard Worker * is NUL terminated. 148*0e209d39SAndroid Build Coastguard Worker * @param status any error conditions are reported back in this variable. 149*0e209d39SAndroid Build Coastguard Worker * 150*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 151*0e209d39SAndroid Build Coastguard Worker */ 152*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2 153*0e209d39SAndroid Build Coastguard Worker ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 154*0e209d39SAndroid Build Coastguard Worker 155*0e209d39SAndroid Build Coastguard Worker 156*0e209d39SAndroid Build Coastguard Worker /** 157*0e209d39SAndroid Build Coastguard Worker * Return the charset that best matches the supplied input data. 158*0e209d39SAndroid Build Coastguard Worker * 159*0e209d39SAndroid Build Coastguard Worker * Note though, that because the detection 160*0e209d39SAndroid Build Coastguard Worker * only looks at the start of the input data, 161*0e209d39SAndroid Build Coastguard Worker * there is a possibility that the returned charset will fail to handle 162*0e209d39SAndroid Build Coastguard Worker * the full set of input data. 163*0e209d39SAndroid Build Coastguard Worker * <p> 164*0e209d39SAndroid Build Coastguard Worker * The returned UCharsetMatch object is owned by the UCharsetDetector. 165*0e209d39SAndroid Build Coastguard Worker * It will remain valid until the detector input is reset, or until 166*0e209d39SAndroid Build Coastguard Worker * the detector is closed. 167*0e209d39SAndroid Build Coastguard Worker * <p> 168*0e209d39SAndroid Build Coastguard Worker * The function will fail if 169*0e209d39SAndroid Build Coastguard Worker * <ul> 170*0e209d39SAndroid Build Coastguard Worker * <li>no charset appears to match the data.</li> 171*0e209d39SAndroid Build Coastguard Worker * <li>no input text has been provided</li> 172*0e209d39SAndroid Build Coastguard Worker * </ul> 173*0e209d39SAndroid Build Coastguard Worker * 174*0e209d39SAndroid Build Coastguard Worker * @param ucsd the charset detector to be used. 175*0e209d39SAndroid Build Coastguard Worker * @param status any error conditions are reported back in this variable. 176*0e209d39SAndroid Build Coastguard Worker * @return a UCharsetMatch representing the best matching charset, 177*0e209d39SAndroid Build Coastguard Worker * or NULL if no charset matches the byte data. 178*0e209d39SAndroid Build Coastguard Worker * 179*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 180*0e209d39SAndroid Build Coastguard Worker */ 181*0e209d39SAndroid Build Coastguard Worker U_CAPI const UCharsetMatch * U_EXPORT2 182*0e209d39SAndroid Build Coastguard Worker ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 183*0e209d39SAndroid Build Coastguard Worker 184*0e209d39SAndroid Build Coastguard Worker 185*0e209d39SAndroid Build Coastguard Worker /** 186*0e209d39SAndroid Build Coastguard Worker * Find all charset matches that appear to be consistent with the input, 187*0e209d39SAndroid Build Coastguard Worker * returning an array of results. The results are ordered with the 188*0e209d39SAndroid Build Coastguard Worker * best quality match first. 189*0e209d39SAndroid Build Coastguard Worker * 190*0e209d39SAndroid Build Coastguard Worker * Because the detection only looks at a limited amount of the 191*0e209d39SAndroid Build Coastguard Worker * input byte data, some of the returned charsets may fail to handle 192*0e209d39SAndroid Build Coastguard Worker * the all of input data. 193*0e209d39SAndroid Build Coastguard Worker * <p> 194*0e209d39SAndroid Build Coastguard Worker * The returned UCharsetMatch objects are owned by the UCharsetDetector. 195*0e209d39SAndroid Build Coastguard Worker * They will remain valid until the detector is closed or modified 196*0e209d39SAndroid Build Coastguard Worker * 197*0e209d39SAndroid Build Coastguard Worker * <p> 198*0e209d39SAndroid Build Coastguard Worker * Return an error if 199*0e209d39SAndroid Build Coastguard Worker * <ul> 200*0e209d39SAndroid Build Coastguard Worker * <li>no charsets appear to match the input data.</li> 201*0e209d39SAndroid Build Coastguard Worker * <li>no input text has been provided</li> 202*0e209d39SAndroid Build Coastguard Worker * </ul> 203*0e209d39SAndroid Build Coastguard Worker * 204*0e209d39SAndroid Build Coastguard Worker * @param ucsd the charset detector to be used. 205*0e209d39SAndroid Build Coastguard Worker * @param matchesFound pointer to a variable that will be set to the 206*0e209d39SAndroid Build Coastguard Worker * number of charsets identified that are consistent with 207*0e209d39SAndroid Build Coastguard Worker * the input data. Output only. 208*0e209d39SAndroid Build Coastguard Worker * @param status any error conditions are reported back in this variable. 209*0e209d39SAndroid Build Coastguard Worker * @return A pointer to an array of pointers to UCharSetMatch objects. 210*0e209d39SAndroid Build Coastguard Worker * This array, and the UCharSetMatch instances to which it refers, 211*0e209d39SAndroid Build Coastguard Worker * are owned by the UCharsetDetector, and will remain valid until 212*0e209d39SAndroid Build Coastguard Worker * the detector is closed or modified. 213*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 214*0e209d39SAndroid Build Coastguard Worker */ 215*0e209d39SAndroid Build Coastguard Worker U_CAPI const UCharsetMatch ** U_EXPORT2 216*0e209d39SAndroid Build Coastguard Worker ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 217*0e209d39SAndroid Build Coastguard Worker 218*0e209d39SAndroid Build Coastguard Worker 219*0e209d39SAndroid Build Coastguard Worker 220*0e209d39SAndroid Build Coastguard Worker /** 221*0e209d39SAndroid Build Coastguard Worker * Get the name of the charset represented by a UCharsetMatch. 222*0e209d39SAndroid Build Coastguard Worker * 223*0e209d39SAndroid Build Coastguard Worker * The storage for the returned name string is owned by the 224*0e209d39SAndroid Build Coastguard Worker * UCharsetMatch, and will remain valid while the UCharsetMatch 225*0e209d39SAndroid Build Coastguard Worker * is valid. 226*0e209d39SAndroid Build Coastguard Worker * 227*0e209d39SAndroid Build Coastguard Worker * The name returned is suitable for use with the ICU conversion APIs. 228*0e209d39SAndroid Build Coastguard Worker * 229*0e209d39SAndroid Build Coastguard Worker * @param ucsm The charset match object. 230*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 231*0e209d39SAndroid Build Coastguard Worker * @return The name of the matching charset. 232*0e209d39SAndroid Build Coastguard Worker * 233*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 234*0e209d39SAndroid Build Coastguard Worker */ 235*0e209d39SAndroid Build Coastguard Worker U_CAPI const char * U_EXPORT2 236*0e209d39SAndroid Build Coastguard Worker ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 237*0e209d39SAndroid Build Coastguard Worker 238*0e209d39SAndroid Build Coastguard Worker /** 239*0e209d39SAndroid Build Coastguard Worker * Get a confidence number for the quality of the match of the byte 240*0e209d39SAndroid Build Coastguard Worker * data with the charset. Confidence numbers range from zero to 100, 241*0e209d39SAndroid Build Coastguard Worker * with 100 representing complete confidence and zero representing 242*0e209d39SAndroid Build Coastguard Worker * no confidence. 243*0e209d39SAndroid Build Coastguard Worker * 244*0e209d39SAndroid Build Coastguard Worker * The confidence values are somewhat arbitrary. They define an 245*0e209d39SAndroid Build Coastguard Worker * an ordering within the results for any single detection operation 246*0e209d39SAndroid Build Coastguard Worker * but are not generally comparable between the results for different input. 247*0e209d39SAndroid Build Coastguard Worker * 248*0e209d39SAndroid Build Coastguard Worker * A confidence value of ten does have a general meaning - it is used 249*0e209d39SAndroid Build Coastguard Worker * for charsets that can represent the input data, but for which there 250*0e209d39SAndroid Build Coastguard Worker * is no other indication that suggests that the charset is the correct one. 251*0e209d39SAndroid Build Coastguard Worker * Pure 7 bit ASCII data, for example, is compatible with a 252*0e209d39SAndroid Build Coastguard Worker * great many charsets, most of which will appear as possible matches 253*0e209d39SAndroid Build Coastguard Worker * with a confidence of 10. 254*0e209d39SAndroid Build Coastguard Worker * 255*0e209d39SAndroid Build Coastguard Worker * @param ucsm The charset match object. 256*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 257*0e209d39SAndroid Build Coastguard Worker * @return A confidence number for the charset match. 258*0e209d39SAndroid Build Coastguard Worker * 259*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 260*0e209d39SAndroid Build Coastguard Worker */ 261*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2 262*0e209d39SAndroid Build Coastguard Worker ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 263*0e209d39SAndroid Build Coastguard Worker 264*0e209d39SAndroid Build Coastguard Worker /** 265*0e209d39SAndroid Build Coastguard Worker * Get the RFC 3066 code for the language of the input data. 266*0e209d39SAndroid Build Coastguard Worker * 267*0e209d39SAndroid Build Coastguard Worker * The Charset Detection service is intended primarily for detecting 268*0e209d39SAndroid Build Coastguard Worker * charsets, not language. For some, but not all, charsets, a language is 269*0e209d39SAndroid Build Coastguard Worker * identified as a byproduct of the detection process, and that is what 270*0e209d39SAndroid Build Coastguard Worker * is returned by this function. 271*0e209d39SAndroid Build Coastguard Worker * 272*0e209d39SAndroid Build Coastguard Worker * CAUTION: 273*0e209d39SAndroid Build Coastguard Worker * 1. Language information is not available for input data encoded in 274*0e209d39SAndroid Build Coastguard Worker * all charsets. In particular, no language is identified 275*0e209d39SAndroid Build Coastguard Worker * for UTF-8 input data. 276*0e209d39SAndroid Build Coastguard Worker * 277*0e209d39SAndroid Build Coastguard Worker * 2. Closely related languages may sometimes be confused. 278*0e209d39SAndroid Build Coastguard Worker * 279*0e209d39SAndroid Build Coastguard Worker * If more accurate language detection is required, a linguistic 280*0e209d39SAndroid Build Coastguard Worker * analysis package should be used. 281*0e209d39SAndroid Build Coastguard Worker * 282*0e209d39SAndroid Build Coastguard Worker * The storage for the returned name string is owned by the 283*0e209d39SAndroid Build Coastguard Worker * UCharsetMatch, and will remain valid while the UCharsetMatch 284*0e209d39SAndroid Build Coastguard Worker * is valid. 285*0e209d39SAndroid Build Coastguard Worker * 286*0e209d39SAndroid Build Coastguard Worker * @param ucsm The charset match object. 287*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 288*0e209d39SAndroid Build Coastguard Worker * @return The RFC 3066 code for the language of the input data, or 289*0e209d39SAndroid Build Coastguard Worker * an empty string if the language could not be determined. 290*0e209d39SAndroid Build Coastguard Worker * 291*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 292*0e209d39SAndroid Build Coastguard Worker */ 293*0e209d39SAndroid Build Coastguard Worker U_CAPI const char * U_EXPORT2 294*0e209d39SAndroid Build Coastguard Worker ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 295*0e209d39SAndroid Build Coastguard Worker 296*0e209d39SAndroid Build Coastguard Worker 297*0e209d39SAndroid Build Coastguard Worker /** 298*0e209d39SAndroid Build Coastguard Worker * Get the entire input text as a UChar string, placing it into 299*0e209d39SAndroid Build Coastguard Worker * a caller-supplied buffer. A terminating 300*0e209d39SAndroid Build Coastguard Worker * NUL character will be appended to the buffer if space is available. 301*0e209d39SAndroid Build Coastguard Worker * 302*0e209d39SAndroid Build Coastguard Worker * The number of UChars in the output string, not including the terminating 303*0e209d39SAndroid Build Coastguard Worker * NUL, is returned. 304*0e209d39SAndroid Build Coastguard Worker * 305*0e209d39SAndroid Build Coastguard Worker * If the supplied buffer is smaller than required to hold the output, 306*0e209d39SAndroid Build Coastguard Worker * the contents of the buffer are undefined. The full output string length 307*0e209d39SAndroid Build Coastguard Worker * (in UChars) is returned as always, and can be used to allocate a buffer 308*0e209d39SAndroid Build Coastguard Worker * of the correct size. 309*0e209d39SAndroid Build Coastguard Worker * 310*0e209d39SAndroid Build Coastguard Worker * 311*0e209d39SAndroid Build Coastguard Worker * @param ucsm The charset match object. 312*0e209d39SAndroid Build Coastguard Worker * @param buf A UChar buffer to be filled with the converted text data. 313*0e209d39SAndroid Build Coastguard Worker * @param cap The capacity of the buffer in UChars. 314*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 315*0e209d39SAndroid Build Coastguard Worker * @return The number of UChars in the output string. 316*0e209d39SAndroid Build Coastguard Worker * 317*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 318*0e209d39SAndroid Build Coastguard Worker */ 319*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2 320*0e209d39SAndroid Build Coastguard Worker ucsdet_getUChars(const UCharsetMatch *ucsm, 321*0e209d39SAndroid Build Coastguard Worker UChar *buf, int32_t cap, UErrorCode *status); 322*0e209d39SAndroid Build Coastguard Worker 323*0e209d39SAndroid Build Coastguard Worker 324*0e209d39SAndroid Build Coastguard Worker 325*0e209d39SAndroid Build Coastguard Worker /** 326*0e209d39SAndroid Build Coastguard Worker * Get an iterator over the set of all detectable charsets - 327*0e209d39SAndroid Build Coastguard Worker * over the charsets that are known to the charset detection 328*0e209d39SAndroid Build Coastguard Worker * service. 329*0e209d39SAndroid Build Coastguard Worker * 330*0e209d39SAndroid Build Coastguard Worker * The returned UEnumeration provides access to the names of 331*0e209d39SAndroid Build Coastguard Worker * the charsets. 332*0e209d39SAndroid Build Coastguard Worker * 333*0e209d39SAndroid Build Coastguard Worker * <p> 334*0e209d39SAndroid Build Coastguard Worker * The state of the Charset detector that is passed in does not 335*0e209d39SAndroid Build Coastguard Worker * affect the result of this function, but requiring a valid, open 336*0e209d39SAndroid Build Coastguard Worker * charset detector as a parameter insures that the charset detection 337*0e209d39SAndroid Build Coastguard Worker * service has been safely initialized and that the required detection 338*0e209d39SAndroid Build Coastguard Worker * data is available. 339*0e209d39SAndroid Build Coastguard Worker * 340*0e209d39SAndroid Build Coastguard Worker * <p> 341*0e209d39SAndroid Build Coastguard Worker * <b>Note:</b> Multiple different charset encodings in a same family may use 342*0e209d39SAndroid Build Coastguard Worker * a single shared name in this implementation. For example, this method returns 343*0e209d39SAndroid Build Coastguard Worker * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 344*0e209d39SAndroid Build Coastguard Worker * (Windows Latin 1). However, actual detection result could be "windows-1252" 345*0e209d39SAndroid Build Coastguard Worker * when the input data matches Latin 1 code points with any points only available 346*0e209d39SAndroid Build Coastguard Worker * in "windows-1252". 347*0e209d39SAndroid Build Coastguard Worker * 348*0e209d39SAndroid Build Coastguard Worker * @param ucsd a Charset detector. 349*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 350*0e209d39SAndroid Build Coastguard Worker * @return an iterator providing access to the detectable charset names. 351*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 352*0e209d39SAndroid Build Coastguard Worker */ 353*0e209d39SAndroid Build Coastguard Worker U_CAPI UEnumeration * U_EXPORT2 354*0e209d39SAndroid Build Coastguard Worker ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 355*0e209d39SAndroid Build Coastguard Worker 356*0e209d39SAndroid Build Coastguard Worker /** 357*0e209d39SAndroid Build Coastguard Worker * Test whether input filtering is enabled for this charset detector. 358*0e209d39SAndroid Build Coastguard Worker * Input filtering removes text that appears to be HTML or xml 359*0e209d39SAndroid Build Coastguard Worker * markup from the input before applying the code page detection 360*0e209d39SAndroid Build Coastguard Worker * heuristics. 361*0e209d39SAndroid Build Coastguard Worker * 362*0e209d39SAndroid Build Coastguard Worker * @param ucsd The charset detector to check. 363*0e209d39SAndroid Build Coastguard Worker * @return true if filtering is enabled. 364*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 365*0e209d39SAndroid Build Coastguard Worker */ 366*0e209d39SAndroid Build Coastguard Worker 367*0e209d39SAndroid Build Coastguard Worker U_CAPI UBool U_EXPORT2 368*0e209d39SAndroid Build Coastguard Worker ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 369*0e209d39SAndroid Build Coastguard Worker 370*0e209d39SAndroid Build Coastguard Worker 371*0e209d39SAndroid Build Coastguard Worker /** 372*0e209d39SAndroid Build Coastguard Worker * Enable filtering of input text. If filtering is enabled, 373*0e209d39SAndroid Build Coastguard Worker * text within angle brackets ("<" and ">") will be removed 374*0e209d39SAndroid Build Coastguard Worker * before detection, which will remove most HTML or xml markup. 375*0e209d39SAndroid Build Coastguard Worker * 376*0e209d39SAndroid Build Coastguard Worker * @param ucsd the charset detector to be modified. 377*0e209d39SAndroid Build Coastguard Worker * @param filter <code>true</code> to enable input text filtering. 378*0e209d39SAndroid Build Coastguard Worker * @return The previous setting. 379*0e209d39SAndroid Build Coastguard Worker * 380*0e209d39SAndroid Build Coastguard Worker * @stable ICU 3.6 381*0e209d39SAndroid Build Coastguard Worker */ 382*0e209d39SAndroid Build Coastguard Worker U_CAPI UBool U_EXPORT2 383*0e209d39SAndroid Build Coastguard Worker ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 384*0e209d39SAndroid Build Coastguard Worker 385*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API 386*0e209d39SAndroid Build Coastguard Worker /** 387*0e209d39SAndroid Build Coastguard Worker * Get an iterator over the set of detectable charsets - 388*0e209d39SAndroid Build Coastguard Worker * over the charsets that are enabled by the specified charset detector. 389*0e209d39SAndroid Build Coastguard Worker * 390*0e209d39SAndroid Build Coastguard Worker * The returned UEnumeration provides access to the names of 391*0e209d39SAndroid Build Coastguard Worker * the charsets. 392*0e209d39SAndroid Build Coastguard Worker * 393*0e209d39SAndroid Build Coastguard Worker * @param ucsd a Charset detector. 394*0e209d39SAndroid Build Coastguard Worker * @param status Any error conditions are reported back in this variable. 395*0e209d39SAndroid Build Coastguard Worker * @return an iterator providing access to the detectable charset names by 396*0e209d39SAndroid Build Coastguard Worker * the specified charset detector. 397*0e209d39SAndroid Build Coastguard Worker * @internal 398*0e209d39SAndroid Build Coastguard Worker */ 399*0e209d39SAndroid Build Coastguard Worker U_CAPI UEnumeration * U_EXPORT2 400*0e209d39SAndroid Build Coastguard Worker ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 401*0e209d39SAndroid Build Coastguard Worker 402*0e209d39SAndroid Build Coastguard Worker /** 403*0e209d39SAndroid Build Coastguard Worker * Enable or disable individual charset encoding. 404*0e209d39SAndroid Build Coastguard Worker * A name of charset encoding must be included in the names returned by 405*0e209d39SAndroid Build Coastguard Worker * {@link #ucsdet_getAllDetectableCharsets()}. 406*0e209d39SAndroid Build Coastguard Worker * 407*0e209d39SAndroid Build Coastguard Worker * @param ucsd a Charset detector. 408*0e209d39SAndroid Build Coastguard Worker * @param encoding encoding the name of charset encoding. 409*0e209d39SAndroid Build Coastguard Worker * @param enabled <code>true</code> to enable, or <code>false</code> to disable the 410*0e209d39SAndroid Build Coastguard Worker * charset encoding. 411*0e209d39SAndroid Build Coastguard Worker * @param status receives the return status. When the name of charset encoding 412*0e209d39SAndroid Build Coastguard Worker * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 413*0e209d39SAndroid Build Coastguard Worker * @internal 414*0e209d39SAndroid Build Coastguard Worker */ 415*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2 416*0e209d39SAndroid Build Coastguard Worker ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 417*0e209d39SAndroid Build Coastguard Worker #endif /* U_HIDE_INTERNAL_API */ 418*0e209d39SAndroid Build Coastguard Worker 419*0e209d39SAndroid Build Coastguard Worker #endif 420*0e209d39SAndroid Build Coastguard Worker #endif /* __UCSDET_H */ 421*0e209d39SAndroid Build Coastguard Worker 422*0e209d39SAndroid Build Coastguard Worker 423