xref: /aosp_15_r20/external/google-breakpad/src/common/convert_UTF.h (revision 9712c20fc9bbfbac4935993a2ca0b3958c5adad2)
1*9712c20fSFrederick Mayle /*
2*9712c20fSFrederick Mayle  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
3*9712c20fSFrederick Mayle  * Distributed under the Terms of Use in
4*9712c20fSFrederick Mayle  * http://www.unicode.org/copyright.html.
5*9712c20fSFrederick Mayle  *
6*9712c20fSFrederick Mayle  * Permission is hereby granted, free of charge, to any person obtaining
7*9712c20fSFrederick Mayle  * a copy of the Unicode data files and any associated documentation
8*9712c20fSFrederick Mayle  * (the "Data Files") or Unicode software and any associated documentation
9*9712c20fSFrederick Mayle  * (the "Software") to deal in the Data Files or Software
10*9712c20fSFrederick Mayle  * without restriction, including without limitation the rights to use,
11*9712c20fSFrederick Mayle  * copy, modify, merge, publish, distribute, and/or sell copies of
12*9712c20fSFrederick Mayle  * the Data Files or Software, and to permit persons to whom the Data Files
13*9712c20fSFrederick Mayle  * or Software are furnished to do so, provided that
14*9712c20fSFrederick Mayle  * (a) this copyright and permission notice appear with all copies
15*9712c20fSFrederick Mayle  * of the Data Files or Software,
16*9712c20fSFrederick Mayle  * (b) this copyright and permission notice appear in associated
17*9712c20fSFrederick Mayle  * documentation, and
18*9712c20fSFrederick Mayle  * (c) there is clear notice in each modified Data File or in the Software
19*9712c20fSFrederick Mayle  * as well as in the documentation associated with the Data File(s) or
20*9712c20fSFrederick Mayle  * Software that the data or software has been modified.
21*9712c20fSFrederick Mayle  *
22*9712c20fSFrederick Mayle  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
23*9712c20fSFrederick Mayle  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
24*9712c20fSFrederick Mayle  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25*9712c20fSFrederick Mayle  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
26*9712c20fSFrederick Mayle  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
27*9712c20fSFrederick Mayle  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
28*9712c20fSFrederick Mayle  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
29*9712c20fSFrederick Mayle  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
30*9712c20fSFrederick Mayle  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
31*9712c20fSFrederick Mayle  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
32*9712c20fSFrederick Mayle  *
33*9712c20fSFrederick Mayle  * Except as contained in this notice, the name of a copyright holder
34*9712c20fSFrederick Mayle  * shall not be used in advertising or otherwise to promote the sale,
35*9712c20fSFrederick Mayle  * use or other dealings in these Data Files or Software without prior
36*9712c20fSFrederick Mayle  * written authorization of the copyright holder.
37*9712c20fSFrederick Mayle  */
38*9712c20fSFrederick Mayle 
39*9712c20fSFrederick Mayle #ifndef COMMON_CONVERT_UTF_H_
40*9712c20fSFrederick Mayle #define COMMON_CONVERT_UTF_H_
41*9712c20fSFrederick Mayle 
42*9712c20fSFrederick Mayle /* ---------------------------------------------------------------------
43*9712c20fSFrederick Mayle 
44*9712c20fSFrederick Mayle Conversions between UTF32, UTF-16, and UTF-8.  Header file.
45*9712c20fSFrederick Mayle 
46*9712c20fSFrederick Mayle Several functions are included here, forming a complete set of
47*9712c20fSFrederick Mayle conversions between the three formats.  UTF-7 is not included
48*9712c20fSFrederick Mayle here, but is handled in a separate source file.
49*9712c20fSFrederick Mayle 
50*9712c20fSFrederick Mayle Each of these routines takes pointers to input buffers and output
51*9712c20fSFrederick Mayle buffers.  The input buffers are const.
52*9712c20fSFrederick Mayle 
53*9712c20fSFrederick Mayle Each routine converts the text between *sourceStart and sourceEnd,
54*9712c20fSFrederick Mayle putting the result into the buffer between *targetStart and
55*9712c20fSFrederick Mayle targetEnd. Note: the end pointers are *after* the last item: e.g.
56*9712c20fSFrederick Mayle *(sourceEnd - 1) is the last item.
57*9712c20fSFrederick Mayle 
58*9712c20fSFrederick Mayle The return result indicates whether the conversion was successful,
59*9712c20fSFrederick Mayle and if not, whether the problem was in the source or target buffers.
60*9712c20fSFrederick Mayle (Only the first encountered problem is indicated.)
61*9712c20fSFrederick Mayle 
62*9712c20fSFrederick Mayle After the conversion, *sourceStart and *targetStart are both
63*9712c20fSFrederick Mayle updated to point to the end of last text successfully converted in
64*9712c20fSFrederick Mayle the respective buffers.
65*9712c20fSFrederick Mayle 
66*9712c20fSFrederick Mayle Input parameters:
67*9712c20fSFrederick Mayle sourceStart - pointer to a pointer to the source buffer.
68*9712c20fSFrederick Mayle The contents of this are modified on return so that
69*9712c20fSFrederick Mayle it points at the next thing to be converted.
70*9712c20fSFrederick Mayle targetStart - similarly, pointer to pointer to the target buffer.
71*9712c20fSFrederick Mayle sourceEnd, targetEnd - respectively pointers to the ends of the
72*9712c20fSFrederick Mayle two buffers, for overflow checking only.
73*9712c20fSFrederick Mayle 
74*9712c20fSFrederick Mayle These conversion functions take a ConversionFlags argument. When this
75*9712c20fSFrederick Mayle flag is set to strict, both irregular sequences and isolated surrogates
76*9712c20fSFrederick Mayle will cause an error.  When the flag is set to lenient, both irregular
77*9712c20fSFrederick Mayle sequences and isolated surrogates are converted.
78*9712c20fSFrederick Mayle 
79*9712c20fSFrederick Mayle Whether the flag is strict or lenient, all illegal sequences will cause
80*9712c20fSFrederick Mayle an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
81*9712c20fSFrederick Mayle or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
82*9712c20fSFrederick Mayle must check for illegal sequences.
83*9712c20fSFrederick Mayle 
84*9712c20fSFrederick Mayle When the flag is set to lenient, characters over 0x10FFFF are converted
85*9712c20fSFrederick Mayle to the replacement character; otherwise (when the flag is set to strict)
86*9712c20fSFrederick Mayle they constitute an error.
87*9712c20fSFrederick Mayle 
88*9712c20fSFrederick Mayle Output parameters:
89*9712c20fSFrederick Mayle The value "sourceIllegal" is returned from some routines if the input
90*9712c20fSFrederick Mayle sequence is malformed.  When "sourceIllegal" is returned, the source
91*9712c20fSFrederick Mayle value will point to the illegal value that caused the problem. E.g.,
92*9712c20fSFrederick Mayle in UTF-8 when a sequence is malformed, it points to the start of the
93*9712c20fSFrederick Mayle malformed sequence.
94*9712c20fSFrederick Mayle 
95*9712c20fSFrederick Mayle Author: Mark E. Davis, 1994.
96*9712c20fSFrederick Mayle Rev History: Rick McGowan, fixes & updates May 2001.
97*9712c20fSFrederick Mayle Fixes & updates, Sept 2001.
98*9712c20fSFrederick Mayle 
99*9712c20fSFrederick Mayle ------------------------------------------------------------------------ */
100*9712c20fSFrederick Mayle 
101*9712c20fSFrederick Mayle /* ---------------------------------------------------------------------
102*9712c20fSFrederick Mayle The following 4 definitions are compiler-specific.
103*9712c20fSFrederick Mayle The C standard does not guarantee that wchar_t has at least
104*9712c20fSFrederick Mayle 16 bits, so wchar_t is no more portable than unsigned short!
105*9712c20fSFrederick Mayle All should be unsigned values to avoid sign extension during
106*9712c20fSFrederick Mayle bit mask & shift operations.
107*9712c20fSFrederick Mayle ------------------------------------------------------------------------ */
108*9712c20fSFrederick Mayle 
109*9712c20fSFrederick Mayle namespace google_breakpad {
110*9712c20fSFrederick Mayle 
/* Code-unit types used by every converter declared in this header.
   All four are unsigned so that bit-mask and shift operations never
   sign-extend. */
/* UTF32: one UTF-32 code unit.
   NOTE(review): unsigned long is only guaranteed to be *at least* 32 bits;
   on LP64 targets it is 64 bits wide, so a UTF32 buffer is not necessarily
   layout-compatible with a uint32_t or 4-byte wchar_t buffer -- confirm
   before casting between them. */
111*9712c20fSFrederick Mayle typedef unsigned long	UTF32;	/* at least 32 bits */
/* UTF16: one UTF-16 code unit. */
112*9712c20fSFrederick Mayle typedef unsigned short	UTF16;	/* at least 16 bits */
/* UTF8: one UTF-8 code unit (a single byte of an encoded sequence). */
113*9712c20fSFrederick Mayle typedef unsigned char	UTF8;	/* typically 8 bits */
/* Boolean: truth value; the return type of isLegalUTF8Sequence below. */
114*9712c20fSFrederick Mayle typedef unsigned char	Boolean; /* 0 or 1 */
115*9712c20fSFrederick Mayle 
116*9712c20fSFrederick Mayle /* Some fundamental constants */
/* NOTE(review): each macro expands to an unparenthesized cast, and the
   expansion names UTF32 unqualified even though that typedef is declared
   inside namespace google_breakpad.  The macros therefore only resolve
   where UTF32 is visible without qualification, and should not be combined
   with operators that bind tighter than a cast (e.g. sizeof, postfix
   operators). */
/* U+FFFD REPLACEMENT CHARACTER. */
117*9712c20fSFrederick Mayle #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
/* Highest code point of the Basic Multilingual Plane (fits in one UTF-16
   code unit). */
118*9712c20fSFrederick Mayle #define UNI_MAX_BMP (UTF32)0x0000FFFF
/* Highest code point that UTF-16 can encode. */
119*9712c20fSFrederick Mayle #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
/* Largest 31-bit value; presumably the historical upper bound of the
   UTF-32/UCS-4 code space -- TODO confirm against the implementation. */
120*9712c20fSFrederick Mayle #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
/* Highest legal UTF-32 value; the overview comment in this header states
   that values above 0x10FFFF are illegal. */
121*9712c20fSFrederick Mayle #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
122*9712c20fSFrederick Mayle 
/* Status returned by every Convert* routine declared in this header.
   No explicit values are given, so the enumerators are numbered 0..3 in
   declaration order and conversionOK compares equal to 0.  Per the overview
   comment in this header, only the first problem encountered is reported. */
123*9712c20fSFrederick Mayle typedef enum {
124*9712c20fSFrederick Mayle 	conversionOK, 		/* conversion successful */
125*9712c20fSFrederick Mayle 	sourceExhausted,	/* partial character in source, but hit end */
126*9712c20fSFrederick Mayle 	targetExhausted,	/* insuff. room in target for conversion */
127*9712c20fSFrederick Mayle 	sourceIllegal		/* source sequence is illegal/malformed */
128*9712c20fSFrederick Mayle } ConversionResult;
129*9712c20fSFrederick Mayle 
/* Strictness selector passed as the last argument of every Convert* routine.
   strictConversion is explicitly 0, so lenientConversion is 1.  The overview
   comment in this header describes what each mode accepts (irregular
   sequences, isolated surrogates, out-of-range values). */
130*9712c20fSFrederick Mayle typedef enum {
131*9712c20fSFrederick Mayle 	strictConversion = 0,
132*9712c20fSFrederick Mayle 	lenientConversion
133*9712c20fSFrederick Mayle } ConversionFlags;
134*9712c20fSFrederick Mayle 
/* Conversion routines, one per ordered pair of encodings.  They all share
   the contract laid out in the overview comment of this header:

     sourceStart  in/out: address of the pointer to the first code unit to
                  convert; on return it points at the next unit to convert
                  (or at the offending unit when sourceIllegal is returned).
     sourceEnd    one past the last source code unit.
     targetStart  in/out: address of the pointer to the first free slot of
                  the output buffer; on return it points past the last unit
                  written.
     targetEnd    one past the last slot of the output buffer; used for
                  overflow checking only.
     flags        strictConversion or lenientConversion.

   Returns a ConversionResult describing the first problem found, or
   conversionOK.  Definitions are not part of this header. */

/* UTF-8 -> UTF-16. */
135*9712c20fSFrederick Mayle ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
136*9712c20fSFrederick Mayle                                      UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
137*9712c20fSFrederick Mayle 
/* UTF-16 -> UTF-8. */
138*9712c20fSFrederick Mayle ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
139*9712c20fSFrederick Mayle                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
140*9712c20fSFrederick Mayle 
/* UTF-8 -> UTF-32. */
141*9712c20fSFrederick Mayle ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
142*9712c20fSFrederick Mayle                                      UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
143*9712c20fSFrederick Mayle 
/* UTF-32 -> UTF-8. */
144*9712c20fSFrederick Mayle ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
145*9712c20fSFrederick Mayle                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
146*9712c20fSFrederick Mayle 
/* UTF-16 -> UTF-32. */
147*9712c20fSFrederick Mayle ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
148*9712c20fSFrederick Mayle                                       UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
149*9712c20fSFrederick Mayle 
/* UTF-32 -> UTF-16. */
150*9712c20fSFrederick Mayle ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
151*9712c20fSFrederick Mayle                                       UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
152*9712c20fSFrederick Mayle 
/* Takes a [source, sourceEnd) range of UTF-8 code units and returns a
   Boolean (0 or 1).
   NOTE(review): presumably reports whether the range starts with a legal
   UTF-8 sequence; the exact rule is defined by the implementation, which is
   not visible from this header -- confirm there. */
153*9712c20fSFrederick Mayle Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
154*9712c20fSFrederick Mayle 
155*9712c20fSFrederick Mayle }  // namespace google_breakpad
156*9712c20fSFrederick Mayle 
157*9712c20fSFrederick Mayle /* --------------------------------------------------------------------- */
158*9712c20fSFrederick Mayle 
159*9712c20fSFrederick Mayle #endif  // COMMON_CONVERT_UTF_H_
160