/* xref: /aosp_15_r20/external/unicode/ConvertUTF.h (revision c14be686ac162d87fd361a4e7a5439b56849c4f4) */
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers.  The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd. Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of the last text successfully converted
    in the respective buffers.

    Input parameters:
	sourceStart - pointer to a pointer to the source buffer.
		The contents of this are modified on return so that
		it points at the next thing to be converted.
	targetStart - similarly, pointer to pointer to the target buffer.
	sourceEnd, targetEnd - respectively pointers to the ends of the
		two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument. When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error.  When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
	The value "sourceIllegal" is returned from some routines if the input
	sequence is malformed.  When "sourceIllegal" is returned, the source
	value will point to the illegal value that caused the problem. E.g.,
	in UTF-8 when a sequence is malformed, it points to the start of the
	malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
		 Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no more portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

/*
 * Integer types holding one code unit of each encoding form, plus a
 * small boolean.  All are unsigned so that bit-mask and shift
 * operations never sign-extend.
 *
 * NOTE(review): `unsigned long` is 64 bits wide on LP64 targets, so a
 * UTF32 buffer is an array of 8-byte units there rather than 4-byte
 * ones -- confirm that callers do not assume sizeof(UTF32) == 4.
 */
typedef unsigned long	UTF32;	/* one UTF-32 code unit; at least 32 bits */
typedef unsigned short	UTF16;	/* one UTF-16 code unit; at least 16 bits */
typedef unsigned char	UTF8;	/* one UTF-8 code unit; typically 8 bits */
typedef unsigned char	Boolean; /* 0 or 1 */

/*
 * Some fundamental constants.  Every expansion is fully parenthesized so
 * the cast binds as a single unit wherever the macro is used (for
 * example as the operand of sizeof or next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)	/* U+FFFD REPLACEMENT CHARACTER */
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)			/* last code point of the BMP */
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)		/* upper bound for UTF-16 */
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)		/* largest 31-bit value */
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)		/* values above this are illegal in UTF-32 */

/*
 * Status returned by the conversion routines.  Only the first problem
 * encountered during a conversion is reported.
 */
typedef enum {
	conversionOK = 0,	/* conversion successful */
	sourceExhausted,	/* partial character in source, but hit end */
	targetExhausted,	/* insufficient room in target for conversion */
	sourceIllegal		/* source sequence is illegal/malformed */
} ConversionResult;

/*
 * Strictness selector passed to the conversion routines.
 *   strictConversion  - irregular sequences and isolated surrogates
 *                       cause an error.
 *   lenientConversion - irregular sequences and isolated surrogates
 *                       are converted.
 */
typedef enum {
	strictConversion = 0,
	lenientConversion
} ConversionFlags;

114*c14be686SAndroid Build Coastguard Worker /* This is for C++ and does no harm in C */
115*c14be686SAndroid Build Coastguard Worker #ifdef __cplusplus
116*c14be686SAndroid Build Coastguard Worker extern "C" {
117*c14be686SAndroid Build Coastguard Worker #endif
118*c14be686SAndroid Build Coastguard Worker 
119*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF8toUTF16 (
120*c14be686SAndroid Build Coastguard Worker 		const UTF8** sourceStart, const UTF8* sourceEnd,
121*c14be686SAndroid Build Coastguard Worker 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
122*c14be686SAndroid Build Coastguard Worker 
123*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF16toUTF8 (
124*c14be686SAndroid Build Coastguard Worker 		const UTF16** sourceStart, const UTF16* sourceEnd,
125*c14be686SAndroid Build Coastguard Worker 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
126*c14be686SAndroid Build Coastguard Worker 
127*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF8toUTF32 (
128*c14be686SAndroid Build Coastguard Worker 		const UTF8** sourceStart, const UTF8* sourceEnd,
129*c14be686SAndroid Build Coastguard Worker 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
130*c14be686SAndroid Build Coastguard Worker 
131*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF32toUTF8 (
132*c14be686SAndroid Build Coastguard Worker 		const UTF32** sourceStart, const UTF32* sourceEnd,
133*c14be686SAndroid Build Coastguard Worker 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
134*c14be686SAndroid Build Coastguard Worker 
135*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF16toUTF32 (
136*c14be686SAndroid Build Coastguard Worker 		const UTF16** sourceStart, const UTF16* sourceEnd,
137*c14be686SAndroid Build Coastguard Worker 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
138*c14be686SAndroid Build Coastguard Worker 
139*c14be686SAndroid Build Coastguard Worker ConversionResult ConvertUTF32toUTF16 (
140*c14be686SAndroid Build Coastguard Worker 		const UTF32** sourceStart, const UTF32* sourceEnd,
141*c14be686SAndroid Build Coastguard Worker 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
142*c14be686SAndroid Build Coastguard Worker 
143*c14be686SAndroid Build Coastguard Worker Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
144*c14be686SAndroid Build Coastguard Worker 
145*c14be686SAndroid Build Coastguard Worker #ifdef __cplusplus
146*c14be686SAndroid Build Coastguard Worker }
147*c14be686SAndroid Build Coastguard Worker #endif
148*c14be686SAndroid Build Coastguard Worker 
/* --------------------------------------------------------------------- */