xref: /aosp_15_r20/external/antlr/runtime/C/src/antlr3convertutf.c (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */
22*16467b97STreehugger Robot 
/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
	mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
	source sequences, enhanced error detection, added casts
	to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "antlr3convertutf.h" (originally "ConvertUTF.h")
    for complete documentation.

------------------------------------------------------------------------ */
40*16467b97STreehugger Robot 
41*16467b97STreehugger Robot 
#include "antlr3convertutf.h"

#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
47*16467b97STreehugger Robot 
48*16467b97STreehugger Robot 
49*16467b97STreehugger Robot 
50*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
51*16467b97STreehugger Robot 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)52*16467b97STreehugger Robot ConversionResult ConvertUTF32toUTF16 (
53*16467b97STreehugger Robot 	const UTF32** sourceStart, const UTF32* sourceEnd,
54*16467b97STreehugger Robot 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
55*16467b97STreehugger Robot     ConversionResult result = conversionOK;
56*16467b97STreehugger Robot     const UTF32* source = *sourceStart;
57*16467b97STreehugger Robot     UTF16* target = *targetStart;
58*16467b97STreehugger Robot     while (source < sourceEnd) {
59*16467b97STreehugger Robot 	UTF32 ch;
60*16467b97STreehugger Robot 	if (target >= targetEnd) {
61*16467b97STreehugger Robot 	    result = targetExhausted; break;
62*16467b97STreehugger Robot 	}
63*16467b97STreehugger Robot 	ch = *source++;
64*16467b97STreehugger Robot 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
65*16467b97STreehugger Robot 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
66*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
67*16467b97STreehugger Robot 		if (flags == strictConversion) {
68*16467b97STreehugger Robot 		    --source; /* return to the illegal value itself */
69*16467b97STreehugger Robot 		    result = sourceIllegal;
70*16467b97STreehugger Robot 		    break;
71*16467b97STreehugger Robot 		} else {
72*16467b97STreehugger Robot 		    *target++ = UNI_REPLACEMENT_CHAR;
73*16467b97STreehugger Robot 		}
74*16467b97STreehugger Robot 	    } else {
75*16467b97STreehugger Robot 		*target++ = (UTF16)ch; /* normal case */
76*16467b97STreehugger Robot 	    }
77*16467b97STreehugger Robot 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
78*16467b97STreehugger Robot 	    if (flags == strictConversion) {
79*16467b97STreehugger Robot 		result = sourceIllegal;
80*16467b97STreehugger Robot 	    } else {
81*16467b97STreehugger Robot 		*target++ = UNI_REPLACEMENT_CHAR;
82*16467b97STreehugger Robot 	    }
83*16467b97STreehugger Robot 	} else {
84*16467b97STreehugger Robot 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
85*16467b97STreehugger Robot 	    if (target + 1 >= targetEnd) {
86*16467b97STreehugger Robot 		--source; /* Back up source pointer! */
87*16467b97STreehugger Robot 		result = targetExhausted; break;
88*16467b97STreehugger Robot 	    }
89*16467b97STreehugger Robot 	    ch -= halfBase;
90*16467b97STreehugger Robot 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
91*16467b97STreehugger Robot 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
92*16467b97STreehugger Robot 	}
93*16467b97STreehugger Robot     }
94*16467b97STreehugger Robot     *sourceStart = source;
95*16467b97STreehugger Robot     *targetStart = target;
96*16467b97STreehugger Robot     return result;
97*16467b97STreehugger Robot }
98*16467b97STreehugger Robot 
99*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
100*16467b97STreehugger Robot 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)101*16467b97STreehugger Robot ConversionResult ConvertUTF16toUTF32 (
102*16467b97STreehugger Robot 	const UTF16** sourceStart, const UTF16* sourceEnd,
103*16467b97STreehugger Robot 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
104*16467b97STreehugger Robot     ConversionResult result = conversionOK;
105*16467b97STreehugger Robot     const UTF16* source = *sourceStart;
106*16467b97STreehugger Robot     UTF32* target = *targetStart;
107*16467b97STreehugger Robot     UTF32 ch, ch2;
108*16467b97STreehugger Robot     while (source < sourceEnd) {
109*16467b97STreehugger Robot 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
110*16467b97STreehugger Robot 	ch = *source++;
111*16467b97STreehugger Robot 	/* If we have a surrogate pair, convert to UTF32 first. */
112*16467b97STreehugger Robot 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
113*16467b97STreehugger Robot 	    /* If the 16 bits following the high surrogate are in the source buffer... */
114*16467b97STreehugger Robot 	    if (source < sourceEnd) {
115*16467b97STreehugger Robot 		ch2 = *source;
116*16467b97STreehugger Robot 		/* If it's a low surrogate, convert to UTF32. */
117*16467b97STreehugger Robot 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
118*16467b97STreehugger Robot 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
119*16467b97STreehugger Robot 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
120*16467b97STreehugger Robot 		    ++source;
121*16467b97STreehugger Robot 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
122*16467b97STreehugger Robot 		    --source; /* return to the illegal value itself */
123*16467b97STreehugger Robot 		    result = sourceIllegal;
124*16467b97STreehugger Robot 		    break;
125*16467b97STreehugger Robot 		}
126*16467b97STreehugger Robot 	    } else { /* We don't have the 16 bits following the high surrogate. */
127*16467b97STreehugger Robot 		--source; /* return to the high surrogate */
128*16467b97STreehugger Robot 		result = sourceExhausted;
129*16467b97STreehugger Robot 		break;
130*16467b97STreehugger Robot 	    }
131*16467b97STreehugger Robot 	} else if (flags == strictConversion) {
132*16467b97STreehugger Robot 	    /* UTF-16 surrogate values are illegal in UTF-32 */
133*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
134*16467b97STreehugger Robot 		--source; /* return to the illegal value itself */
135*16467b97STreehugger Robot 		result = sourceIllegal;
136*16467b97STreehugger Robot 		break;
137*16467b97STreehugger Robot 	    }
138*16467b97STreehugger Robot 	}
139*16467b97STreehugger Robot 	if (target >= targetEnd) {
140*16467b97STreehugger Robot 	    source = oldSource; /* Back up source pointer! */
141*16467b97STreehugger Robot 	    result = targetExhausted; break;
142*16467b97STreehugger Robot 	}
143*16467b97STreehugger Robot 	*target++ = ch;
144*16467b97STreehugger Robot     }
145*16467b97STreehugger Robot     *sourceStart = source;
146*16467b97STreehugger Robot     *targetStart = target;
147*16467b97STreehugger Robot #ifdef CVTUTF_DEBUG
148*16467b97STreehugger Robot if (result == sourceIllegal) {
149*16467b97STreehugger Robot     ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
150*16467b97STreehugger Robot     fflush(stderr);
151*16467b97STreehugger Robot }
152*16467b97STreehugger Robot #endif
153*16467b97STreehugger Robot     return result;
154*16467b97STreehugger Robot }
155*16467b97STreehugger Robot 
156*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
157*16467b97STreehugger Robot 
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes
 * (lead bytes 0xF8-0xFF). Those entries are left as-is for anyone who may
 * want to do such conversion, which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
175*16467b97STreehugger Robot 
176*16467b97STreehugger Robot /*
177*16467b97STreehugger Robot  * Magic values subtracted from a buffer value during UTF8 conversion.
178*16467b97STreehugger Robot  * This table contains as many values as there might be trailing bytes
179*16467b97STreehugger Robot  * in a UTF-8 sequence.
180*16467b97STreehugger Robot  */
181*16467b97STreehugger Robot static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
182*16467b97STreehugger Robot 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
183*16467b97STreehugger Robot 
184*16467b97STreehugger Robot /*
185*16467b97STreehugger Robot  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
186*16467b97STreehugger Robot  * into the first byte, depending on how many bytes follow.  There are
187*16467b97STreehugger Robot  * as many entries in this table as there are UTF-8 sequence types.
188*16467b97STreehugger Robot  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
189*16467b97STreehugger Robot  * for *legal* UTF-8 will be 4 or fewer bytes total.
190*16467b97STreehugger Robot  */
191*16467b97STreehugger Robot static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
192*16467b97STreehugger Robot 
193*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
194*16467b97STreehugger Robot 
/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */
202*16467b97STreehugger Robot 
203*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
204*16467b97STreehugger Robot 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)205*16467b97STreehugger Robot ConversionResult ConvertUTF16toUTF8 (
206*16467b97STreehugger Robot 	const UTF16** sourceStart, const UTF16* sourceEnd,
207*16467b97STreehugger Robot 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
208*16467b97STreehugger Robot     ConversionResult result = conversionOK;
209*16467b97STreehugger Robot     const UTF16* source = *sourceStart;
210*16467b97STreehugger Robot     UTF8* target = *targetStart;
211*16467b97STreehugger Robot     while (source < sourceEnd) {
212*16467b97STreehugger Robot 	UTF32 ch;
213*16467b97STreehugger Robot 	unsigned short bytesToWrite = 0;
214*16467b97STreehugger Robot 	const UTF32 byteMask = 0xBF;
215*16467b97STreehugger Robot 	const UTF32 byteMark = 0x80;
216*16467b97STreehugger Robot 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
217*16467b97STreehugger Robot 	ch = *source++;
218*16467b97STreehugger Robot 	/* If we have a surrogate pair, convert to UTF32 first. */
219*16467b97STreehugger Robot 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
220*16467b97STreehugger Robot 	    /* If the 16 bits following the high surrogate are in the source buffer... */
221*16467b97STreehugger Robot 	    if (source < sourceEnd) {
222*16467b97STreehugger Robot 		UTF32 ch2 = *source;
223*16467b97STreehugger Robot 		/* If it's a low surrogate, convert to UTF32. */
224*16467b97STreehugger Robot 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
225*16467b97STreehugger Robot 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
226*16467b97STreehugger Robot 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
227*16467b97STreehugger Robot 		    ++source;
228*16467b97STreehugger Robot 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
229*16467b97STreehugger Robot 		    --source; /* return to the illegal value itself */
230*16467b97STreehugger Robot 		    result = sourceIllegal;
231*16467b97STreehugger Robot 		    break;
232*16467b97STreehugger Robot 		}
233*16467b97STreehugger Robot 	    } else { /* We don't have the 16 bits following the high surrogate. */
234*16467b97STreehugger Robot 		--source; /* return to the high surrogate */
235*16467b97STreehugger Robot 		result = sourceExhausted;
236*16467b97STreehugger Robot 		break;
237*16467b97STreehugger Robot 	    }
238*16467b97STreehugger Robot         } else if (flags == strictConversion) {
239*16467b97STreehugger Robot 	    /* UTF-16 surrogate values are illegal in UTF-32 */
240*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
241*16467b97STreehugger Robot 		--source; /* return to the illegal value itself */
242*16467b97STreehugger Robot 		result = sourceIllegal;
243*16467b97STreehugger Robot 		break;
244*16467b97STreehugger Robot 	    }
245*16467b97STreehugger Robot 	}
246*16467b97STreehugger Robot 	/* Figure out how many bytes the result will require */
247*16467b97STreehugger Robot 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
248*16467b97STreehugger Robot 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
249*16467b97STreehugger Robot 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
250*16467b97STreehugger Robot 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
251*16467b97STreehugger Robot 	} else {			    bytesToWrite = 3;
252*16467b97STreehugger Robot 					    ch = UNI_REPLACEMENT_CHAR;
253*16467b97STreehugger Robot 	}
254*16467b97STreehugger Robot 
255*16467b97STreehugger Robot 	target += bytesToWrite;
256*16467b97STreehugger Robot 	if (target > targetEnd) {
257*16467b97STreehugger Robot 	    source = oldSource; /* Back up source pointer! */
258*16467b97STreehugger Robot 	    target -= bytesToWrite; result = targetExhausted; break;
259*16467b97STreehugger Robot 	}
260*16467b97STreehugger Robot 	switch (bytesToWrite) { /* note: everything falls through. */
261*16467b97STreehugger Robot 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
262*16467b97STreehugger Robot 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
263*16467b97STreehugger Robot 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
264*16467b97STreehugger Robot 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
265*16467b97STreehugger Robot 	}
266*16467b97STreehugger Robot 	target += bytesToWrite;
267*16467b97STreehugger Robot     }
268*16467b97STreehugger Robot     *sourceStart = source;
269*16467b97STreehugger Robot     *targetStart = target;
270*16467b97STreehugger Robot     return result;
271*16467b97STreehugger Robot }
272*16467b97STreehugger Robot 
273*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
274*16467b97STreehugger Robot 
275*16467b97STreehugger Robot /*
276*16467b97STreehugger Robot  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
277*16467b97STreehugger Robot  * This must be called with the length pre-determined by the first byte.
278*16467b97STreehugger Robot  * If not calling this from ConvertUTF8to*, then the length can be set by:
279*16467b97STreehugger Robot  *  length = trailingBytesForUTF8[*source]+1;
280*16467b97STreehugger Robot  * and the sequence is illegal right away if there aren't that many bytes
281*16467b97STreehugger Robot  * available.
282*16467b97STreehugger Robot  * If presented with a length > 4, this returns false.  The Unicode
283*16467b97STreehugger Robot  * definition of UTF-8 goes up to 4-byte sequences.
284*16467b97STreehugger Robot  */
285*16467b97STreehugger Robot 
286*16467b97STreehugger Robot static ANTLR3_BOOLEAN
isLegalUTF8(const UTF8 * source,int length)287*16467b97STreehugger Robot isLegalUTF8(const UTF8 *source, int length) {
288*16467b97STreehugger Robot     UTF8 a;
289*16467b97STreehugger Robot     const UTF8 *srcptr = source+length;
290*16467b97STreehugger Robot     switch (length) {
291*16467b97STreehugger Robot     default: return ANTLR3_FALSE;
292*16467b97STreehugger Robot 	/* Everything else falls through when "true"... */
293*16467b97STreehugger Robot     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return ANTLR3_FALSE;
294*16467b97STreehugger Robot     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return ANTLR3_FALSE;
295*16467b97STreehugger Robot     case 2: if ((a = (*--srcptr)) > 0xBF) return ANTLR3_FALSE;
296*16467b97STreehugger Robot 
297*16467b97STreehugger Robot 	switch (*source) {
298*16467b97STreehugger Robot 	    /* no fall-through in this inner switch */
299*16467b97STreehugger Robot 	    case 0xE0: if (a < 0xA0) return ANTLR3_FALSE; break;
300*16467b97STreehugger Robot 	    case 0xED: if (a > 0x9F) return ANTLR3_FALSE; break;
301*16467b97STreehugger Robot 	    case 0xF0: if (a < 0x90) return ANTLR3_FALSE; break;
302*16467b97STreehugger Robot 	    case 0xF4: if (a > 0x8F) return ANTLR3_FALSE; break;
303*16467b97STreehugger Robot 	    default:   if (a < 0x80) return ANTLR3_FALSE;
304*16467b97STreehugger Robot 	}
305*16467b97STreehugger Robot 
306*16467b97STreehugger Robot     case 1: if (*source >= 0x80 && *source < 0xC2) return ANTLR3_FALSE;
307*16467b97STreehugger Robot     }
308*16467b97STreehugger Robot     if (*source > 0xF4) return ANTLR3_FALSE;
309*16467b97STreehugger Robot     return ANTLR3_TRUE;
310*16467b97STreehugger Robot }
311*16467b97STreehugger Robot 
312*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
313*16467b97STreehugger Robot 
314*16467b97STreehugger Robot /*
315*16467b97STreehugger Robot  * Exported function to return whether a UTF-8 sequence is legal or not.
316*16467b97STreehugger Robot  * This is not used here; it's just exported.
317*16467b97STreehugger Robot  */
318*16467b97STreehugger Robot ANTLR3_BOOLEAN
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)319*16467b97STreehugger Robot isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
320*16467b97STreehugger Robot     int length = trailingBytesForUTF8[*source]+1;
321*16467b97STreehugger Robot     if (source+length > sourceEnd) {
322*16467b97STreehugger Robot 	return ANTLR3_FALSE;
323*16467b97STreehugger Robot     }
324*16467b97STreehugger Robot     return isLegalUTF8(source, length);
325*16467b97STreehugger Robot }
326*16467b97STreehugger Robot 
327*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
328*16467b97STreehugger Robot 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)329*16467b97STreehugger Robot ConversionResult ConvertUTF8toUTF16 (
330*16467b97STreehugger Robot 	const UTF8** sourceStart, const UTF8* sourceEnd,
331*16467b97STreehugger Robot 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
332*16467b97STreehugger Robot     ConversionResult result = conversionOK;
333*16467b97STreehugger Robot     const UTF8* source = *sourceStart;
334*16467b97STreehugger Robot     UTF16* target = *targetStart;
335*16467b97STreehugger Robot     while (source < sourceEnd) {
336*16467b97STreehugger Robot 	UTF32 ch = 0;
337*16467b97STreehugger Robot 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
338*16467b97STreehugger Robot 	if (source + extraBytesToRead >= sourceEnd) {
339*16467b97STreehugger Robot 	    result = sourceExhausted; break;
340*16467b97STreehugger Robot 	}
341*16467b97STreehugger Robot 	/* Do this check whether lenient or strict */
342*16467b97STreehugger Robot 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
343*16467b97STreehugger Robot 	    result = sourceIllegal;
344*16467b97STreehugger Robot 	    break;
345*16467b97STreehugger Robot 	}
346*16467b97STreehugger Robot 	/*
347*16467b97STreehugger Robot 	 * The cases all fall through. See "Note A" below.
348*16467b97STreehugger Robot 	 */
349*16467b97STreehugger Robot 	switch (extraBytesToRead) {
350*16467b97STreehugger Robot 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
351*16467b97STreehugger Robot 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
352*16467b97STreehugger Robot 	    case 3: ch += *source++; ch <<= 6;
353*16467b97STreehugger Robot 	    case 2: ch += *source++; ch <<= 6;
354*16467b97STreehugger Robot 	    case 1: ch += *source++; ch <<= 6;
355*16467b97STreehugger Robot 	    case 0: ch += *source++;
356*16467b97STreehugger Robot 	}
357*16467b97STreehugger Robot 	ch -= offsetsFromUTF8[extraBytesToRead];
358*16467b97STreehugger Robot 
359*16467b97STreehugger Robot 	if (target >= targetEnd) {
360*16467b97STreehugger Robot 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
361*16467b97STreehugger Robot 	    result = targetExhausted; break;
362*16467b97STreehugger Robot 	}
363*16467b97STreehugger Robot 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
364*16467b97STreehugger Robot 	    /* UTF-16 surrogate values are illegal in UTF-32 */
365*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
366*16467b97STreehugger Robot 		if (flags == strictConversion) {
367*16467b97STreehugger Robot 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
368*16467b97STreehugger Robot 		    result = sourceIllegal;
369*16467b97STreehugger Robot 		    break;
370*16467b97STreehugger Robot 		} else {
371*16467b97STreehugger Robot 		    *target++ = UNI_REPLACEMENT_CHAR;
372*16467b97STreehugger Robot 		}
373*16467b97STreehugger Robot 	    } else {
374*16467b97STreehugger Robot 		*target++ = (UTF16)ch; /* normal case */
375*16467b97STreehugger Robot 	    }
376*16467b97STreehugger Robot 	} else if (ch > UNI_MAX_UTF16) {
377*16467b97STreehugger Robot 	    if (flags == strictConversion) {
378*16467b97STreehugger Robot 		result = sourceIllegal;
379*16467b97STreehugger Robot 		source -= (extraBytesToRead+1); /* return to the start */
380*16467b97STreehugger Robot 		break; /* Bail out; shouldn't continue */
381*16467b97STreehugger Robot 	    } else {
382*16467b97STreehugger Robot 		*target++ = UNI_REPLACEMENT_CHAR;
383*16467b97STreehugger Robot 	    }
384*16467b97STreehugger Robot 	} else {
385*16467b97STreehugger Robot 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
386*16467b97STreehugger Robot 	    if (target + 1 >= targetEnd) {
387*16467b97STreehugger Robot 		source -= (extraBytesToRead+1); /* Back up source pointer! */
388*16467b97STreehugger Robot 		result = targetExhausted; break;
389*16467b97STreehugger Robot 	    }
390*16467b97STreehugger Robot 	    ch -= halfBase;
391*16467b97STreehugger Robot 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
392*16467b97STreehugger Robot 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
393*16467b97STreehugger Robot 	}
394*16467b97STreehugger Robot     }
395*16467b97STreehugger Robot     *sourceStart = source;
396*16467b97STreehugger Robot     *targetStart = target;
397*16467b97STreehugger Robot     return result;
398*16467b97STreehugger Robot }
399*16467b97STreehugger Robot 
400*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
401*16467b97STreehugger Robot 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)402*16467b97STreehugger Robot ConversionResult ConvertUTF32toUTF8 (
403*16467b97STreehugger Robot 	const UTF32** sourceStart, const UTF32* sourceEnd,
404*16467b97STreehugger Robot 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
405*16467b97STreehugger Robot     ConversionResult result = conversionOK;
406*16467b97STreehugger Robot     const UTF32* source = *sourceStart;
407*16467b97STreehugger Robot     UTF8* target = *targetStart;
408*16467b97STreehugger Robot     while (source < sourceEnd) {
409*16467b97STreehugger Robot 	UTF32 ch;
410*16467b97STreehugger Robot 	unsigned short bytesToWrite = 0;
411*16467b97STreehugger Robot 	const UTF32 byteMask = 0xBF;
412*16467b97STreehugger Robot 	const UTF32 byteMark = 0x80;
413*16467b97STreehugger Robot 	ch = *source++;
414*16467b97STreehugger Robot 	if (flags == strictConversion ) {
415*16467b97STreehugger Robot 	    /* UTF-16 surrogate values are illegal in UTF-32 */
416*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
417*16467b97STreehugger Robot 		--source; /* return to the illegal value itself */
418*16467b97STreehugger Robot 		result = sourceIllegal;
419*16467b97STreehugger Robot 		break;
420*16467b97STreehugger Robot 	    }
421*16467b97STreehugger Robot 	}
422*16467b97STreehugger Robot 	/*
423*16467b97STreehugger Robot 	 * Figure out how many bytes the result will require. Turn any
424*16467b97STreehugger Robot 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
425*16467b97STreehugger Robot 	 */
426*16467b97STreehugger Robot 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
427*16467b97STreehugger Robot 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
428*16467b97STreehugger Robot 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
429*16467b97STreehugger Robot 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
430*16467b97STreehugger Robot 	} else {			    bytesToWrite = 3;
431*16467b97STreehugger Robot 					    ch = UNI_REPLACEMENT_CHAR;
432*16467b97STreehugger Robot 					    result = sourceIllegal;
433*16467b97STreehugger Robot 	}
434*16467b97STreehugger Robot 
435*16467b97STreehugger Robot 	target += bytesToWrite;
436*16467b97STreehugger Robot 	if (target > targetEnd) {
437*16467b97STreehugger Robot 	    --source; /* Back up source pointer! */
438*16467b97STreehugger Robot 	    target -= bytesToWrite; result = targetExhausted; break;
439*16467b97STreehugger Robot 	}
440*16467b97STreehugger Robot 	switch (bytesToWrite) { /* note: everything falls through. */
441*16467b97STreehugger Robot 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
442*16467b97STreehugger Robot 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
443*16467b97STreehugger Robot 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
444*16467b97STreehugger Robot 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
445*16467b97STreehugger Robot 	}
446*16467b97STreehugger Robot 	target += bytesToWrite;
447*16467b97STreehugger Robot     }
448*16467b97STreehugger Robot     *sourceStart = source;
449*16467b97STreehugger Robot     *targetStart = target;
450*16467b97STreehugger Robot     return result;
451*16467b97STreehugger Robot }
452*16467b97STreehugger Robot 
453*16467b97STreehugger Robot /* --------------------------------------------------------------------- */
454*16467b97STreehugger Robot 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)455*16467b97STreehugger Robot ConversionResult ConvertUTF8toUTF32 (
456*16467b97STreehugger Robot 	const UTF8** sourceStart, const UTF8* sourceEnd,
457*16467b97STreehugger Robot 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
458*16467b97STreehugger Robot     ConversionResult result = conversionOK;
459*16467b97STreehugger Robot     const UTF8* source = *sourceStart;
460*16467b97STreehugger Robot     UTF32* target = *targetStart;
461*16467b97STreehugger Robot     while (source < sourceEnd) {
462*16467b97STreehugger Robot 	UTF32 ch = 0;
463*16467b97STreehugger Robot 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
464*16467b97STreehugger Robot 	if (source + extraBytesToRead >= sourceEnd) {
465*16467b97STreehugger Robot 	    result = sourceExhausted; break;
466*16467b97STreehugger Robot 	}
467*16467b97STreehugger Robot 	/* Do this check whether lenient or strict */
468*16467b97STreehugger Robot 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
469*16467b97STreehugger Robot 	    result = sourceIllegal;
470*16467b97STreehugger Robot 	    break;
471*16467b97STreehugger Robot 	}
472*16467b97STreehugger Robot 	/*
473*16467b97STreehugger Robot 	 * The cases all fall through. See "Note A" below.
474*16467b97STreehugger Robot 	 */
475*16467b97STreehugger Robot 	switch (extraBytesToRead) {
476*16467b97STreehugger Robot 	    case 5: ch += *source++; ch <<= 6;
477*16467b97STreehugger Robot 	    case 4: ch += *source++; ch <<= 6;
478*16467b97STreehugger Robot 	    case 3: ch += *source++; ch <<= 6;
479*16467b97STreehugger Robot 	    case 2: ch += *source++; ch <<= 6;
480*16467b97STreehugger Robot 	    case 1: ch += *source++; ch <<= 6;
481*16467b97STreehugger Robot 	    case 0: ch += *source++;
482*16467b97STreehugger Robot 	}
483*16467b97STreehugger Robot 	ch -= offsetsFromUTF8[extraBytesToRead];
484*16467b97STreehugger Robot 
485*16467b97STreehugger Robot 	if (target >= targetEnd) {
486*16467b97STreehugger Robot 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
487*16467b97STreehugger Robot 	    result = targetExhausted; break;
488*16467b97STreehugger Robot 	}
489*16467b97STreehugger Robot 	if (ch <= UNI_MAX_LEGAL_UTF32) {
490*16467b97STreehugger Robot 	    /*
491*16467b97STreehugger Robot 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
492*16467b97STreehugger Robot 	     * over Plane 17 (> 0x10FFFF) is illegal.
493*16467b97STreehugger Robot 	     */
494*16467b97STreehugger Robot 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
495*16467b97STreehugger Robot 		if (flags == strictConversion) {
496*16467b97STreehugger Robot 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
497*16467b97STreehugger Robot 		    result = sourceIllegal;
498*16467b97STreehugger Robot 		    break;
499*16467b97STreehugger Robot 		} else {
500*16467b97STreehugger Robot 		    *target++ = UNI_REPLACEMENT_CHAR;
501*16467b97STreehugger Robot 		}
502*16467b97STreehugger Robot 	    } else {
503*16467b97STreehugger Robot 		*target++ = ch;
504*16467b97STreehugger Robot 	    }
505*16467b97STreehugger Robot 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
506*16467b97STreehugger Robot 	    result = sourceIllegal;
507*16467b97STreehugger Robot 	    *target++ = UNI_REPLACEMENT_CHAR;
508*16467b97STreehugger Robot 	}
509*16467b97STreehugger Robot     }
510*16467b97STreehugger Robot     *sourceStart = source;
511*16467b97STreehugger Robot     *targetStart = target;
512*16467b97STreehugger Robot     return result;
513*16467b97STreehugger Robot }
514*16467b97STreehugger Robot 
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
	{
	    int tmpBytesToRead = extraBytesToRead+1;
	    do {
		ch += *source++;
		--tmpBytesToRead;
		if (tmpBytesToRead) ch <<= 6;
	    } while (tmpBytesToRead > 0);
	}
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
533