xref: /aosp_15_r20/external/icu/libandroidicu/include/unicode/ucsdet.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker  **********************************************************************
5*0e209d39SAndroid Build Coastguard Worker  *   Copyright (C) 2005-2013, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker  *   Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker  **********************************************************************
8*0e209d39SAndroid Build Coastguard Worker  *   file name:  ucsdet.h
9*0e209d39SAndroid Build Coastguard Worker  *   encoding:   UTF-8
10*0e209d39SAndroid Build Coastguard Worker  *   indentation:4
11*0e209d39SAndroid Build Coastguard Worker  *
12*0e209d39SAndroid Build Coastguard Worker  *   created on: 2005Aug04
13*0e209d39SAndroid Build Coastguard Worker  *   created by: Andy Heninger
14*0e209d39SAndroid Build Coastguard Worker  *
15*0e209d39SAndroid Build Coastguard Worker  *   ICU Character Set Detection, API for C
16*0e209d39SAndroid Build Coastguard Worker  *
17*0e209d39SAndroid Build Coastguard Worker  *   Draft version 18 Oct 2005
18*0e209d39SAndroid Build Coastguard Worker  *
19*0e209d39SAndroid Build Coastguard Worker  */
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker #ifndef __UCSDET_H
22*0e209d39SAndroid Build Coastguard Worker #define __UCSDET_H
23*0e209d39SAndroid Build Coastguard Worker 
24*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
25*0e209d39SAndroid Build Coastguard Worker 
26*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_CONVERSION
27*0e209d39SAndroid Build Coastguard Worker 
28*0e209d39SAndroid Build Coastguard Worker #include "unicode/uenum.h"
29*0e209d39SAndroid Build Coastguard Worker 
30*0e209d39SAndroid Build Coastguard Worker #if U_SHOW_CPLUSPLUS_API
31*0e209d39SAndroid Build Coastguard Worker #include "unicode/localpointer.h"
32*0e209d39SAndroid Build Coastguard Worker #endif   // U_SHOW_CPLUSPLUS_API
33*0e209d39SAndroid Build Coastguard Worker 
34*0e209d39SAndroid Build Coastguard Worker /**
35*0e209d39SAndroid Build Coastguard Worker  * \file
36*0e209d39SAndroid Build Coastguard Worker  * \brief C API: Charset Detection API
37*0e209d39SAndroid Build Coastguard Worker  *
38*0e209d39SAndroid Build Coastguard Worker  * This API provides a facility for detecting the
39*0e209d39SAndroid Build Coastguard Worker  * charset or encoding of character data in an unknown text format.
40*0e209d39SAndroid Build Coastguard Worker  * The input data can be from an array of bytes.
41*0e209d39SAndroid Build Coastguard Worker  * <p>
42*0e209d39SAndroid Build Coastguard Worker  * Character set detection is at best an imprecise operation.  The detection
43*0e209d39SAndroid Build Coastguard Worker  * process will attempt to identify the charset that best matches the characteristics
44*0e209d39SAndroid Build Coastguard Worker  * of the byte data, but the process is partly statistical in nature, and
45*0e209d39SAndroid Build Coastguard Worker  * the results cannot be guaranteed to always be correct.
46*0e209d39SAndroid Build Coastguard Worker  * <p>
47*0e209d39SAndroid Build Coastguard Worker  * For best accuracy in charset detection, the input data should be primarily
48*0e209d39SAndroid Build Coastguard Worker  * in a single language, and a minimum of a few hundred bytes worth of plain text
49*0e209d39SAndroid Build Coastguard Worker  * in the language are needed.  The detection process will attempt to
50*0e209d39SAndroid Build Coastguard Worker  * ignore HTML or XML style markup that could otherwise obscure the content.
51*0e209d39SAndroid Build Coastguard Worker  * <p>
52*0e209d39SAndroid Build Coastguard Worker  * An alternative to the ICU Charset Detector is the
53*0e209d39SAndroid Build Coastguard Worker  * Compact Encoding Detector, https://github.com/google/compact_enc_det.
54*0e209d39SAndroid Build Coastguard Worker  * It often gives more accurate results, especially with short input samples.
55*0e209d39SAndroid Build Coastguard Worker  */
56*0e209d39SAndroid Build Coastguard Worker 
57*0e209d39SAndroid Build Coastguard Worker 
58*0e209d39SAndroid Build Coastguard Worker struct UCharsetDetector;
59*0e209d39SAndroid Build Coastguard Worker /**
60*0e209d39SAndroid Build Coastguard Worker   * Opaque structure representing a charset detector.
61*0e209d39SAndroid Build Coastguard Worker   * @stable ICU 3.6
62*0e209d39SAndroid Build Coastguard Worker   */
63*0e209d39SAndroid Build Coastguard Worker typedef struct UCharsetDetector UCharsetDetector;
64*0e209d39SAndroid Build Coastguard Worker 
65*0e209d39SAndroid Build Coastguard Worker struct UCharsetMatch;
66*0e209d39SAndroid Build Coastguard Worker /**
67*0e209d39SAndroid Build Coastguard Worker   *  Opaque structure representing a match that was identified
68*0e209d39SAndroid Build Coastguard Worker   *  from a charset detection operation.
69*0e209d39SAndroid Build Coastguard Worker   *  @stable ICU 3.6
70*0e209d39SAndroid Build Coastguard Worker   */
71*0e209d39SAndroid Build Coastguard Worker typedef struct UCharsetMatch UCharsetMatch;
72*0e209d39SAndroid Build Coastguard Worker 
73*0e209d39SAndroid Build Coastguard Worker /**
74*0e209d39SAndroid Build Coastguard Worker   *  Open a charset detector.
75*0e209d39SAndroid Build Coastguard Worker   *
76*0e209d39SAndroid Build Coastguard Worker   *  @param status Any error conditions occurring during the open
77*0e209d39SAndroid Build Coastguard Worker   *                operation are reported back in this variable.
78*0e209d39SAndroid Build Coastguard Worker   *  @return the newly opened charset detector; the caller must release it with ucsdet_close().
79*0e209d39SAndroid Build Coastguard Worker   *  @stable ICU 3.6
80*0e209d39SAndroid Build Coastguard Worker   */
81*0e209d39SAndroid Build Coastguard Worker U_CAPI UCharsetDetector * U_EXPORT2
82*0e209d39SAndroid Build Coastguard Worker ucsdet_open(UErrorCode   *status);
83*0e209d39SAndroid Build Coastguard Worker 
84*0e209d39SAndroid Build Coastguard Worker /**
85*0e209d39SAndroid Build Coastguard Worker   * Close a charset detector.  All storage and any other resources
86*0e209d39SAndroid Build Coastguard Worker   *   owned by this charset detector will be released.  Failure to
87*0e209d39SAndroid Build Coastguard Worker   *   close a charset detector when finished with it can result in
88*0e209d39SAndroid Build Coastguard Worker   *   memory leaks in the application.
89*0e209d39SAndroid Build Coastguard Worker   *
90*0e209d39SAndroid Build Coastguard Worker   *  @param ucsd  The charset detector to be closed.
91*0e209d39SAndroid Build Coastguard Worker   *  @stable ICU 3.6
92*0e209d39SAndroid Build Coastguard Worker   */
93*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2
94*0e209d39SAndroid Build Coastguard Worker ucsdet_close(UCharsetDetector *ucsd);
95*0e209d39SAndroid Build Coastguard Worker 
96*0e209d39SAndroid Build Coastguard Worker #if U_SHOW_CPLUSPLUS_API
97*0e209d39SAndroid Build Coastguard Worker 
98*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
99*0e209d39SAndroid Build Coastguard Worker 
100*0e209d39SAndroid Build Coastguard Worker /**
101*0e209d39SAndroid Build Coastguard Worker  * \class LocalUCharsetDetectorPointer
102*0e209d39SAndroid Build Coastguard Worker  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
103*0e209d39SAndroid Build Coastguard Worker  * For most methods see the LocalPointerBase base class.
104*0e209d39SAndroid Build Coastguard Worker  *
105*0e209d39SAndroid Build Coastguard Worker  * @see LocalPointerBase
106*0e209d39SAndroid Build Coastguard Worker  * @see LocalPointer
107*0e209d39SAndroid Build Coastguard Worker  * @stable ICU 4.4
108*0e209d39SAndroid Build Coastguard Worker  */
109*0e209d39SAndroid Build Coastguard Worker U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
110*0e209d39SAndroid Build Coastguard Worker 
111*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
112*0e209d39SAndroid Build Coastguard Worker 
113*0e209d39SAndroid Build Coastguard Worker #endif   // U_SHOW_CPLUSPLUS_API
114*0e209d39SAndroid Build Coastguard Worker 
115*0e209d39SAndroid Build Coastguard Worker /**
116*0e209d39SAndroid Build Coastguard Worker   * Set the input byte data whose charset is to be detected.
117*0e209d39SAndroid Build Coastguard Worker   *
118*0e209d39SAndroid Build Coastguard Worker   * Ownership of the input text byte array remains with the caller.
119*0e209d39SAndroid Build Coastguard Worker   * The input string must not be altered or deleted until the charset
120*0e209d39SAndroid Build Coastguard Worker   * detector is either closed or reset to refer to different input text.
121*0e209d39SAndroid Build Coastguard Worker   *
122*0e209d39SAndroid Build Coastguard Worker   * @param ucsd   the charset detector to be used.
123*0e209d39SAndroid Build Coastguard Worker   * @param textIn the input text of unknown encoding.
124*0e209d39SAndroid Build Coastguard Worker   * @param len    the length of the input text, or -1 if the text
125*0e209d39SAndroid Build Coastguard Worker   *               is NUL terminated.
126*0e209d39SAndroid Build Coastguard Worker   * @param status any error conditions are reported back in this variable.
127*0e209d39SAndroid Build Coastguard Worker   *
128*0e209d39SAndroid Build Coastguard Worker   * @stable ICU 3.6
129*0e209d39SAndroid Build Coastguard Worker   */
130*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2
131*0e209d39SAndroid Build Coastguard Worker ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
132*0e209d39SAndroid Build Coastguard Worker 
133*0e209d39SAndroid Build Coastguard Worker 
134*0e209d39SAndroid Build Coastguard Worker /** Set the declared encoding for charset detection.
135*0e209d39SAndroid Build Coastguard Worker  *  The declared encoding of an input text is an encoding obtained
136*0e209d39SAndroid Build Coastguard Worker  *  by the user from an HTTP header or XML declaration or similar source that
137*0e209d39SAndroid Build Coastguard Worker  *  can be provided as an additional hint to the charset detector.
138*0e209d39SAndroid Build Coastguard Worker  *
139*0e209d39SAndroid Build Coastguard Worker  *  How and whether the declared encoding will be used during the
140*0e209d39SAndroid Build Coastguard Worker  *  detection process is TBD.
141*0e209d39SAndroid Build Coastguard Worker  *
142*0e209d39SAndroid Build Coastguard Worker  * @param ucsd      the charset detector to be used.
143*0e209d39SAndroid Build Coastguard Worker  * @param encoding  an encoding for the current data obtained from
144*0e209d39SAndroid Build Coastguard Worker  *                  a header or declaration or other source outside
145*0e209d39SAndroid Build Coastguard Worker  *                  of the byte data itself.
146*0e209d39SAndroid Build Coastguard Worker  * @param length    the length of the encoding name, or -1 if the name string
147*0e209d39SAndroid Build Coastguard Worker  *                  is NUL terminated.
148*0e209d39SAndroid Build Coastguard Worker  * @param status    any error conditions are reported back in this variable.
149*0e209d39SAndroid Build Coastguard Worker  *
150*0e209d39SAndroid Build Coastguard Worker  * @stable ICU 3.6
151*0e209d39SAndroid Build Coastguard Worker  */
152*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2
153*0e209d39SAndroid Build Coastguard Worker ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
154*0e209d39SAndroid Build Coastguard Worker 
155*0e209d39SAndroid Build Coastguard Worker 
156*0e209d39SAndroid Build Coastguard Worker /**
157*0e209d39SAndroid Build Coastguard Worker  * Return the charset that best matches the supplied input data.
158*0e209d39SAndroid Build Coastguard Worker  *
159*0e209d39SAndroid Build Coastguard Worker  * Note though, that because the detection
160*0e209d39SAndroid Build Coastguard Worker  * only looks at the start of the input data,
161*0e209d39SAndroid Build Coastguard Worker  * there is a possibility that the returned charset will fail to handle
162*0e209d39SAndroid Build Coastguard Worker  * the full set of input data.
163*0e209d39SAndroid Build Coastguard Worker  * <p>
164*0e209d39SAndroid Build Coastguard Worker  * The returned UCharsetMatch object is owned by the UCharsetDetector.
165*0e209d39SAndroid Build Coastguard Worker  * It will remain valid until the detector input is reset, or until
166*0e209d39SAndroid Build Coastguard Worker  * the detector is closed.
167*0e209d39SAndroid Build Coastguard Worker  * <p>
168*0e209d39SAndroid Build Coastguard Worker  * The function will fail if
169*0e209d39SAndroid Build Coastguard Worker  *  <ul>
170*0e209d39SAndroid Build Coastguard Worker  *    <li>no charset appears to match the data.</li>
171*0e209d39SAndroid Build Coastguard Worker  *    <li>no input text has been provided.</li>
172*0e209d39SAndroid Build Coastguard Worker  *  </ul>
173*0e209d39SAndroid Build Coastguard Worker  *
174*0e209d39SAndroid Build Coastguard Worker  * @param ucsd      the charset detector to be used.
175*0e209d39SAndroid Build Coastguard Worker  * @param status    any error conditions are reported back in this variable.
176*0e209d39SAndroid Build Coastguard Worker  * @return          a UCharsetMatch representing the best matching charset,
177*0e209d39SAndroid Build Coastguard Worker  *                  or NULL if no charset matches the byte data.
178*0e209d39SAndroid Build Coastguard Worker  *
179*0e209d39SAndroid Build Coastguard Worker  * @stable ICU 3.6
180*0e209d39SAndroid Build Coastguard Worker  */
181*0e209d39SAndroid Build Coastguard Worker U_CAPI const UCharsetMatch * U_EXPORT2
182*0e209d39SAndroid Build Coastguard Worker ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
183*0e209d39SAndroid Build Coastguard Worker 
184*0e209d39SAndroid Build Coastguard Worker 
185*0e209d39SAndroid Build Coastguard Worker /**
186*0e209d39SAndroid Build Coastguard Worker  *  Find all charset matches that appear to be consistent with the input,
187*0e209d39SAndroid Build Coastguard Worker  *  returning an array of results.  The results are ordered with the
188*0e209d39SAndroid Build Coastguard Worker  *  best quality match first.
189*0e209d39SAndroid Build Coastguard Worker  *
190*0e209d39SAndroid Build Coastguard Worker  *  Because the detection only looks at a limited amount of the
191*0e209d39SAndroid Build Coastguard Worker  *  input byte data, some of the returned charsets may fail to handle
192*0e209d39SAndroid Build Coastguard Worker  *  all of the input data.
193*0e209d39SAndroid Build Coastguard Worker  *  <p>
194*0e209d39SAndroid Build Coastguard Worker  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
195*0e209d39SAndroid Build Coastguard Worker  *  They will remain valid until the detector is closed or modified.
196*0e209d39SAndroid Build Coastguard Worker  *
197*0e209d39SAndroid Build Coastguard Worker  * <p>
198*0e209d39SAndroid Build Coastguard Worker  * Return an error if
199*0e209d39SAndroid Build Coastguard Worker  *  <ul>
200*0e209d39SAndroid Build Coastguard Worker  *    <li>no charsets appear to match the input data.</li>
201*0e209d39SAndroid Build Coastguard Worker  *    <li>no input text has been provided.</li>
202*0e209d39SAndroid Build Coastguard Worker  *  </ul>
203*0e209d39SAndroid Build Coastguard Worker  *
204*0e209d39SAndroid Build Coastguard Worker  * @param ucsd          the charset detector to be used.
205*0e209d39SAndroid Build Coastguard Worker  * @param matchesFound  pointer to a variable that will be set to the
206*0e209d39SAndroid Build Coastguard Worker  *                      number of charsets identified that are consistent with
207*0e209d39SAndroid Build Coastguard Worker  *                      the input data.  Output only.
208*0e209d39SAndroid Build Coastguard Worker  * @param status        any error conditions are reported back in this variable.
209*0e209d39SAndroid Build Coastguard Worker  * @return              A pointer to an array of pointers to UCharsetMatch objects.
210*0e209d39SAndroid Build Coastguard Worker  *                      This array, and the UCharsetMatch instances to which it refers,
211*0e209d39SAndroid Build Coastguard Worker  *                      are owned by the UCharsetDetector, and will remain valid until
212*0e209d39SAndroid Build Coastguard Worker  *                      the detector is closed or modified.
213*0e209d39SAndroid Build Coastguard Worker  * @stable ICU 3.6
214*0e209d39SAndroid Build Coastguard Worker  */
215*0e209d39SAndroid Build Coastguard Worker U_CAPI const UCharsetMatch ** U_EXPORT2
216*0e209d39SAndroid Build Coastguard Worker ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
217*0e209d39SAndroid Build Coastguard Worker 
218*0e209d39SAndroid Build Coastguard Worker 
219*0e209d39SAndroid Build Coastguard Worker 
220*0e209d39SAndroid Build Coastguard Worker /**
221*0e209d39SAndroid Build Coastguard Worker  *  Get the name of the charset represented by a UCharsetMatch.
222*0e209d39SAndroid Build Coastguard Worker  *
223*0e209d39SAndroid Build Coastguard Worker  *  The storage for the returned name string is owned by the
224*0e209d39SAndroid Build Coastguard Worker  *  UCharsetMatch, and will remain valid while the UCharsetMatch
225*0e209d39SAndroid Build Coastguard Worker  *  is valid.
226*0e209d39SAndroid Build Coastguard Worker  *
227*0e209d39SAndroid Build Coastguard Worker  *  The name returned is suitable for use with the ICU conversion APIs.
228*0e209d39SAndroid Build Coastguard Worker  *
229*0e209d39SAndroid Build Coastguard Worker  *  @param ucsm    The charset match object.
230*0e209d39SAndroid Build Coastguard Worker  *  @param status  Any error conditions are reported back in this variable.
231*0e209d39SAndroid Build Coastguard Worker  *  @return        The name of the matching charset; the string is owned by the UCharsetMatch.
232*0e209d39SAndroid Build Coastguard Worker  *
233*0e209d39SAndroid Build Coastguard Worker  *  @stable ICU 3.6
234*0e209d39SAndroid Build Coastguard Worker  */
235*0e209d39SAndroid Build Coastguard Worker U_CAPI const char * U_EXPORT2
236*0e209d39SAndroid Build Coastguard Worker ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
237*0e209d39SAndroid Build Coastguard Worker 
238*0e209d39SAndroid Build Coastguard Worker /**
239*0e209d39SAndroid Build Coastguard Worker  *  Get a confidence number for the quality of the match of the byte
240*0e209d39SAndroid Build Coastguard Worker  *  data with the charset.  Confidence numbers range from zero to 100,
241*0e209d39SAndroid Build Coastguard Worker  *  with 100 representing complete confidence and zero representing
242*0e209d39SAndroid Build Coastguard Worker  *  no confidence.
243*0e209d39SAndroid Build Coastguard Worker  *
244*0e209d39SAndroid Build Coastguard Worker  *  The confidence values are somewhat arbitrary.  They define
245*0e209d39SAndroid Build Coastguard Worker  *  an ordering within the results for any single detection operation
246*0e209d39SAndroid Build Coastguard Worker  *  but are not generally comparable between the results for different input.
247*0e209d39SAndroid Build Coastguard Worker  *
248*0e209d39SAndroid Build Coastguard Worker  *  A confidence value of ten does have a general meaning - it is used
249*0e209d39SAndroid Build Coastguard Worker  *  for charsets that can represent the input data, but for which there
250*0e209d39SAndroid Build Coastguard Worker  *  is no other indication that suggests that the charset is the correct one.
251*0e209d39SAndroid Build Coastguard Worker  *  Pure 7 bit ASCII data, for example, is compatible with a
252*0e209d39SAndroid Build Coastguard Worker  *  great many charsets, most of which will appear as possible matches
253*0e209d39SAndroid Build Coastguard Worker  *  with a confidence of 10.
254*0e209d39SAndroid Build Coastguard Worker  *
255*0e209d39SAndroid Build Coastguard Worker  *  @param ucsm    The charset match object.
256*0e209d39SAndroid Build Coastguard Worker  *  @param status  Any error conditions are reported back in this variable.
257*0e209d39SAndroid Build Coastguard Worker  *  @return        A confidence number for the charset match.
258*0e209d39SAndroid Build Coastguard Worker  *
259*0e209d39SAndroid Build Coastguard Worker  *  @stable ICU 3.6
260*0e209d39SAndroid Build Coastguard Worker  */
261*0e209d39SAndroid Build Coastguard Worker U_CAPI int32_t U_EXPORT2
262*0e209d39SAndroid Build Coastguard Worker ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
263*0e209d39SAndroid Build Coastguard Worker 
264*0e209d39SAndroid Build Coastguard Worker /**
265*0e209d39SAndroid Build Coastguard Worker  *  Get the RFC 3066 code for the language of the input data.
266*0e209d39SAndroid Build Coastguard Worker  *
267*0e209d39SAndroid Build Coastguard Worker  *  The Charset Detection service is intended primarily for detecting
268*0e209d39SAndroid Build Coastguard Worker  *  charsets, not language.  For some, but not all, charsets, a language is
269*0e209d39SAndroid Build Coastguard Worker  *  identified as a byproduct of the detection process, and that is what
270*0e209d39SAndroid Build Coastguard Worker  *  is returned by this function.
271*0e209d39SAndroid Build Coastguard Worker  *
272*0e209d39SAndroid Build Coastguard Worker  *  CAUTION:
273*0e209d39SAndroid Build Coastguard Worker  *    1.  Language information is not available for input data encoded in
274*0e209d39SAndroid Build Coastguard Worker  *        all charsets. In particular, no language is identified
275*0e209d39SAndroid Build Coastguard Worker  *        for UTF-8 input data.
276*0e209d39SAndroid Build Coastguard Worker  *
277*0e209d39SAndroid Build Coastguard Worker  *    2.  Closely related languages may sometimes be confused.
278*0e209d39SAndroid Build Coastguard Worker  *
279*0e209d39SAndroid Build Coastguard Worker  *  If more accurate language detection is required, a linguistic
280*0e209d39SAndroid Build Coastguard Worker  *  analysis package should be used.
281*0e209d39SAndroid Build Coastguard Worker  *
282*0e209d39SAndroid Build Coastguard Worker  *  The storage for the returned language code string is owned by the
283*0e209d39SAndroid Build Coastguard Worker  *  UCharsetMatch, and will remain valid while the UCharsetMatch
284*0e209d39SAndroid Build Coastguard Worker  *  is valid.
285*0e209d39SAndroid Build Coastguard Worker  *
286*0e209d39SAndroid Build Coastguard Worker  *  @param ucsm    The charset match object.
287*0e209d39SAndroid Build Coastguard Worker  *  @param status  Any error conditions are reported back in this variable.
288*0e209d39SAndroid Build Coastguard Worker  *  @return        The RFC 3066 code for the language of the input data, or
289*0e209d39SAndroid Build Coastguard Worker  *                 an empty string if the language could not be determined.
290*0e209d39SAndroid Build Coastguard Worker  *
291*0e209d39SAndroid Build Coastguard Worker  *  @stable ICU 3.6
292*0e209d39SAndroid Build Coastguard Worker  */
293*0e209d39SAndroid Build Coastguard Worker U_CAPI const char * U_EXPORT2
294*0e209d39SAndroid Build Coastguard Worker ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
295*0e209d39SAndroid Build Coastguard Worker 
296*0e209d39SAndroid Build Coastguard Worker 
297*0e209d39SAndroid Build Coastguard Worker /**
298*0e209d39SAndroid Build Coastguard Worker   *  Get the entire input text as a UChar string, placing it into
299*0e209d39SAndroid Build Coastguard Worker   *  a caller-supplied buffer.  A terminating
300*0e209d39SAndroid Build Coastguard Worker   *  NUL character will be appended to the buffer if space is available.
301*0e209d39SAndroid Build Coastguard Worker   *
302*0e209d39SAndroid Build Coastguard Worker   *  The number of UChars in the output string, not including the terminating
303*0e209d39SAndroid Build Coastguard Worker   *  NUL, is returned.
304*0e209d39SAndroid Build Coastguard Worker   *
305*0e209d39SAndroid Build Coastguard Worker   *  If the supplied buffer is smaller than required to hold the output,
306*0e209d39SAndroid Build Coastguard Worker   *  the contents of the buffer are undefined.  The full output string length
307*0e209d39SAndroid Build Coastguard Worker   *  (in UChars) is returned as always, and can be used to allocate a buffer
308*0e209d39SAndroid Build Coastguard Worker   *  of the correct size.
309*0e209d39SAndroid Build Coastguard Worker   *
310*0e209d39SAndroid Build Coastguard Worker   *
311*0e209d39SAndroid Build Coastguard Worker   * @param ucsm    The charset match object.
312*0e209d39SAndroid Build Coastguard Worker   * @param buf     A UChar buffer to be filled with the converted text data.
313*0e209d39SAndroid Build Coastguard Worker   * @param cap     The capacity of the buffer in UChars.
314*0e209d39SAndroid Build Coastguard Worker   * @param status  Any error conditions are reported back in this variable.
315*0e209d39SAndroid Build Coastguard Worker   * @return        The number of UChars in the output string, excluding the terminating NUL.
316*0e209d39SAndroid Build Coastguard Worker   *
317*0e209d39SAndroid Build Coastguard Worker   * @stable ICU 3.6
318*0e209d39SAndroid Build Coastguard Worker   */
319*0e209d39SAndroid Build Coastguard Worker U_CAPI  int32_t U_EXPORT2
320*0e209d39SAndroid Build Coastguard Worker ucsdet_getUChars(const UCharsetMatch *ucsm,
321*0e209d39SAndroid Build Coastguard Worker                  UChar *buf, int32_t cap, UErrorCode *status);
322*0e209d39SAndroid Build Coastguard Worker 
323*0e209d39SAndroid Build Coastguard Worker 
324*0e209d39SAndroid Build Coastguard Worker 
325*0e209d39SAndroid Build Coastguard Worker /**
326*0e209d39SAndroid Build Coastguard Worker   *  Get an iterator over the set of all detectable charsets -
327*0e209d39SAndroid Build Coastguard Worker   *  over the charsets that are known to the charset detection
328*0e209d39SAndroid Build Coastguard Worker   *  service.
329*0e209d39SAndroid Build Coastguard Worker   *
330*0e209d39SAndroid Build Coastguard Worker   *  The returned UEnumeration provides access to the names of
331*0e209d39SAndroid Build Coastguard Worker   *  the charsets.
332*0e209d39SAndroid Build Coastguard Worker   *
333*0e209d39SAndroid Build Coastguard Worker   *  <p>
334*0e209d39SAndroid Build Coastguard Worker   *  The state of the Charset detector that is passed in does not
335*0e209d39SAndroid Build Coastguard Worker   *  affect the result of this function, but requiring a valid, open
336*0e209d39SAndroid Build Coastguard Worker   *  charset detector as a parameter ensures that the charset detection
337*0e209d39SAndroid Build Coastguard Worker   *  service has been safely initialized and that the required detection
338*0e209d39SAndroid Build Coastguard Worker   *  data is available.
339*0e209d39SAndroid Build Coastguard Worker   *
340*0e209d39SAndroid Build Coastguard Worker   *  <p>
341*0e209d39SAndroid Build Coastguard Worker   *  <b>Note:</b> Multiple different charset encodings in the same family may use
342*0e209d39SAndroid Build Coastguard Worker   *  a single shared name in this implementation. For example, this method returns
343*0e209d39SAndroid Build Coastguard Worker   *  an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
344*0e209d39SAndroid Build Coastguard Worker   *  (Windows Latin 1). However, actual detection result could be "windows-1252"
345*0e209d39SAndroid Build Coastguard Worker   *  when the input data matches Latin 1 code points with any points only available
346*0e209d39SAndroid Build Coastguard Worker   *  in "windows-1252".
347*0e209d39SAndroid Build Coastguard Worker   *
348*0e209d39SAndroid Build Coastguard Worker   *  @param ucsd a Charset detector.
349*0e209d39SAndroid Build Coastguard Worker   *  @param status  Any error conditions are reported back in this variable.
350*0e209d39SAndroid Build Coastguard Worker   *  @return an iterator providing access to the detectable charset names.
351*0e209d39SAndroid Build Coastguard Worker   *  @stable ICU 3.6
352*0e209d39SAndroid Build Coastguard Worker   */
353*0e209d39SAndroid Build Coastguard Worker U_CAPI  UEnumeration * U_EXPORT2
354*0e209d39SAndroid Build Coastguard Worker ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
355*0e209d39SAndroid Build Coastguard Worker 
356*0e209d39SAndroid Build Coastguard Worker /**
357*0e209d39SAndroid Build Coastguard Worker   *  Test whether input filtering is enabled for this charset detector.
358*0e209d39SAndroid Build Coastguard Worker   *  Input filtering removes text that appears to be HTML or XML
359*0e209d39SAndroid Build Coastguard Worker   *  markup from the input before applying the code page detection
360*0e209d39SAndroid Build Coastguard Worker   *  heuristics.
361*0e209d39SAndroid Build Coastguard Worker   *
362*0e209d39SAndroid Build Coastguard Worker   *  @param ucsd  The charset detector to check.
363*0e209d39SAndroid Build Coastguard Worker   *  @return true if filtering is enabled.
364*0e209d39SAndroid Build Coastguard Worker   *  @stable ICU 3.6
365*0e209d39SAndroid Build Coastguard Worker   */
366*0e209d39SAndroid Build Coastguard Worker 
367*0e209d39SAndroid Build Coastguard Worker U_CAPI  UBool U_EXPORT2
368*0e209d39SAndroid Build Coastguard Worker ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
369*0e209d39SAndroid Build Coastguard Worker 
370*0e209d39SAndroid Build Coastguard Worker 
371*0e209d39SAndroid Build Coastguard Worker /**
372*0e209d39SAndroid Build Coastguard Worker  * Enable filtering of input text. If filtering is enabled,
373*0e209d39SAndroid Build Coastguard Worker  * text within angle brackets ("<" and ">") will be removed
374*0e209d39SAndroid Build Coastguard Worker  * before detection, which will remove most HTML or XML markup.
375*0e209d39SAndroid Build Coastguard Worker  *
376*0e209d39SAndroid Build Coastguard Worker  * @param ucsd   the charset detector to be modified.
377*0e209d39SAndroid Build Coastguard Worker  * @param filter <code>true</code> to enable input text filtering.
378*0e209d39SAndroid Build Coastguard Worker  * @return The previous setting.
379*0e209d39SAndroid Build Coastguard Worker  *
380*0e209d39SAndroid Build Coastguard Worker  * @stable ICU 3.6
381*0e209d39SAndroid Build Coastguard Worker  */
382*0e209d39SAndroid Build Coastguard Worker U_CAPI  UBool U_EXPORT2
383*0e209d39SAndroid Build Coastguard Worker ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
384*0e209d39SAndroid Build Coastguard Worker 
385*0e209d39SAndroid Build Coastguard Worker #ifndef U_HIDE_INTERNAL_API
386*0e209d39SAndroid Build Coastguard Worker /**
387*0e209d39SAndroid Build Coastguard Worker   *  Get an iterator over the set of detectable charsets -
388*0e209d39SAndroid Build Coastguard Worker   *  over the charsets that are enabled by the specified charset detector.
389*0e209d39SAndroid Build Coastguard Worker   *
390*0e209d39SAndroid Build Coastguard Worker   *  The returned UEnumeration provides access to the names of
391*0e209d39SAndroid Build Coastguard Worker   *  the charsets.
392*0e209d39SAndroid Build Coastguard Worker   *
393*0e209d39SAndroid Build Coastguard Worker   *  @param ucsd a Charset detector.
394*0e209d39SAndroid Build Coastguard Worker   *  @param status  Any error conditions are reported back in this variable.
395*0e209d39SAndroid Build Coastguard Worker   *  @return an iterator providing access to the names of the charsets enabled by
396*0e209d39SAndroid Build Coastguard Worker   *  the specified charset detector.
397*0e209d39SAndroid Build Coastguard Worker   *  @internal
398*0e209d39SAndroid Build Coastguard Worker   */
399*0e209d39SAndroid Build Coastguard Worker U_CAPI UEnumeration * U_EXPORT2
400*0e209d39SAndroid Build Coastguard Worker ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
401*0e209d39SAndroid Build Coastguard Worker 
402*0e209d39SAndroid Build Coastguard Worker /**
403*0e209d39SAndroid Build Coastguard Worker   * Enable or disable an individual charset encoding.
404*0e209d39SAndroid Build Coastguard Worker   * The name of the charset encoding must be included in the names returned by
405*0e209d39SAndroid Build Coastguard Worker   * {@link #ucsdet_getAllDetectableCharsets()}.
406*0e209d39SAndroid Build Coastguard Worker   *
407*0e209d39SAndroid Build Coastguard Worker   * @param ucsd a Charset detector.
408*0e209d39SAndroid Build Coastguard Worker   * @param encoding the name of the charset encoding.
409*0e209d39SAndroid Build Coastguard Worker   * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
410*0e209d39SAndroid Build Coastguard Worker   *   charset encoding.
411*0e209d39SAndroid Build Coastguard Worker   * @param status receives the return status. When the name of charset encoding
412*0e209d39SAndroid Build Coastguard Worker   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
413*0e209d39SAndroid Build Coastguard Worker   * @internal
414*0e209d39SAndroid Build Coastguard Worker   */
415*0e209d39SAndroid Build Coastguard Worker U_CAPI void U_EXPORT2
416*0e209d39SAndroid Build Coastguard Worker ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
417*0e209d39SAndroid Build Coastguard Worker #endif  /* U_HIDE_INTERNAL_API */
418*0e209d39SAndroid Build Coastguard Worker 
419*0e209d39SAndroid Build Coastguard Worker #endif
420*0e209d39SAndroid Build Coastguard Worker #endif   /* __UCSDET_H */
421*0e209d39SAndroid Build Coastguard Worker 
422*0e209d39SAndroid Build Coastguard Worker 
423