1*16467b97STreehugger Robot /* 2*16467b97STreehugger Robot * Copyright 2001-2004 Unicode, Inc. 3*16467b97STreehugger Robot * 4*16467b97STreehugger Robot * Disclaimer 5*16467b97STreehugger Robot * 6*16467b97STreehugger Robot * This source code is provided as is by Unicode, Inc. No claims are 7*16467b97STreehugger Robot * made as to fitness for any particular purpose. No warranties of any 8*16467b97STreehugger Robot * kind are expressed or implied. The recipient agrees to determine 9*16467b97STreehugger Robot * applicability of information provided. If this file has been 10*16467b97STreehugger Robot * purchased on magnetic or optical media from Unicode, Inc., the 11*16467b97STreehugger Robot * sole remedy for any claim will be exchange of defective media 12*16467b97STreehugger Robot * within 90 days of receipt. 13*16467b97STreehugger Robot * 14*16467b97STreehugger Robot * Limitations on Rights to Redistribute This Code 15*16467b97STreehugger Robot * 16*16467b97STreehugger Robot * Unicode, Inc. hereby grants the right to freely use the information 17*16467b97STreehugger Robot * supplied in this file in the creation of products supporting the 18*16467b97STreehugger Robot * Unicode Standard, and to make copies of this file in any form 19*16467b97STreehugger Robot * for internal or external distribution as long as this notice 20*16467b97STreehugger Robot * remains attached. 21*16467b97STreehugger Robot */ 22*16467b97STreehugger Robot 23*16467b97STreehugger Robot /* --------------------------------------------------------------------- 24*16467b97STreehugger Robot 25*16467b97STreehugger Robot Conversions between UTF32, UTF-16, and UTF-8. Header file. 26*16467b97STreehugger Robot 27*16467b97STreehugger Robot Several functions are included here, forming a complete set of 28*16467b97STreehugger Robot conversions between the three formats. UTF-7 is not included 29*16467b97STreehugger Robot here, but is handled in a separate source file. 30*16467b97STreehugger Robot 31*16467b97STreehugger Robot Each of these routines takes pointers to input buffers and output 32*16467b97STreehugger Robot buffers. The input buffers are const. 33*16467b97STreehugger Robot 34*16467b97STreehugger Robot Each routine converts the text between *sourceStart and sourceEnd, 35*16467b97STreehugger Robot putting the result into the buffer between *targetStart and 36*16467b97STreehugger Robot targetEnd. Note: the end pointers are *after* the last item: e.g. 37*16467b97STreehugger Robot *(sourceEnd - 1) is the last item. 38*16467b97STreehugger Robot 39*16467b97STreehugger Robot The return result indicates whether the conversion was successful, 40*16467b97STreehugger Robot and if not, whether the problem was in the source or target buffers. 41*16467b97STreehugger Robot (Only the first encountered problem is indicated.) 42*16467b97STreehugger Robot 43*16467b97STreehugger Robot After the conversion, *sourceStart and *targetStart are both 44*16467b97STreehugger Robot updated to point to the end of last text successfully converted in 45*16467b97STreehugger Robot the respective buffers. 46*16467b97STreehugger Robot 47*16467b97STreehugger Robot Input parameters: 48*16467b97STreehugger Robot sourceStart - pointer to a pointer to the source buffer. 49*16467b97STreehugger Robot The contents of this are modified on return so that 50*16467b97STreehugger Robot it points at the next thing to be converted. 51*16467b97STreehugger Robot targetStart - similarly, pointer to pointer to the target buffer. 52*16467b97STreehugger Robot sourceEnd, targetEnd - respectively pointers to the ends of the 53*16467b97STreehugger Robot two buffers, for overflow checking only. 54*16467b97STreehugger Robot 55*16467b97STreehugger Robot These conversion functions take a ConversionFlags argument. When this 56*16467b97STreehugger Robot flag is set to strict, both irregular sequences and isolated surrogates 57*16467b97STreehugger Robot will cause an error. When the flag is set to lenient, both irregular 58*16467b97STreehugger Robot sequences and isolated surrogates are converted. 59*16467b97STreehugger Robot 60*16467b97STreehugger Robot Whether the flag is strict or lenient, all illegal sequences will cause 61*16467b97STreehugger Robot an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, 62*16467b97STreehugger Robot or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code 63*16467b97STreehugger Robot must check for illegal sequences. 64*16467b97STreehugger Robot 65*16467b97STreehugger Robot When the flag is set to lenient, characters over 0x10FFFF are converted 66*16467b97STreehugger Robot to the replacement character; otherwise (when the flag is set to strict) 67*16467b97STreehugger Robot they constitute an error. 68*16467b97STreehugger Robot 69*16467b97STreehugger Robot Output parameters: 70*16467b97STreehugger Robot The value "sourceIllegal" is returned from some routines if the input 71*16467b97STreehugger Robot sequence is malformed. When "sourceIllegal" is returned, the source 72*16467b97STreehugger Robot value will point to the illegal value that caused the problem. E.g., 73*16467b97STreehugger Robot in UTF-8 when a sequence is malformed, it points to the start of the 74*16467b97STreehugger Robot malformed sequence. 75*16467b97STreehugger Robot 76*16467b97STreehugger Robot Author: Mark E. Davis, 1994. 77*16467b97STreehugger Robot Rev History: Rick McGowan, fixes & updates May 2001. 78*16467b97STreehugger Robot Fixes & updates, Sept 2001. 79*16467b97STreehugger Robot 80*16467b97STreehugger Robot ------------------------------------------------------------------------ */ 81*16467b97STreehugger Robot 82*16467b97STreehugger Robot /* --------------------------------------------------------------------- 83*16467b97STreehugger Robot The following 4 definitions are compiler-specific. 84*16467b97STreehugger Robot The C standard does not guarantee that wchar_t has at least 85*16467b97STreehugger Robot 16 bits, so wchar_t is no less portable than unsigned short! 86*16467b97STreehugger Robot All should be unsigned values to avoid sign extension during 87*16467b97STreehugger Robot bit mask & shift operations. 88*16467b97STreehugger Robot ------------------------------------------------------------------------ */ 89*16467b97STreehugger Robot 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot // Changes for ANTLR3 - Jim Idle, January 2008. 92*16467b97STreehugger Robot // builtin types defined for Unicode types changed to 93*16467b97STreehugger Robot // aliases for the types that are system determined by 94*16467b97STreehugger Robot // ANTLR at compile time. 95*16467b97STreehugger Robot // 96*16467b97STreehugger Robot // typedef unsigned long UTF32; /* at least 32 bits */ 97*16467b97STreehugger Robot // typedef unsigned short UTF16; /* at least 16 bits */ 98*16467b97STreehugger Robot // typedef unsigned char UTF8; /* typically 8 bits */ 99*16467b97STreehugger Robot // typedef unsigned char Boolean; /* 0 or 1 */ 100*16467b97STreehugger Robot 101*16467b97STreehugger Robot #ifndef _ANTLR3_CONVERTUTF_H 102*16467b97STreehugger Robot #define _ANTLR3_CONVERTUTF_H 103*16467b97STreehugger Robot 104*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE() 105*16467b97STreehugger Robot 106*16467b97STreehugger Robot typedef ANTLR_UINT32 UTF32; /* at least 32 bits */ 107*16467b97STreehugger Robot typedef ANTLR_UINT16 UTF16; /* at least 16 bits */ 108*16467b97STreehugger Robot typedef ANTLR_UINT8 UTF8; /* typically 8 bits */ 109*16467b97STreehugger Robot 110*16467b97STreehugger Robot /* Some fundamental constants */ 111*16467b97STreehugger Robot #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD 112*16467b97STreehugger Robot #define UNI_MAX_BMP (UTF32)0x0000FFFF 113*16467b97STreehugger Robot #define UNI_MAX_UTF16 (UTF32)0x0010FFFF 114*16467b97STreehugger Robot #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF 115*16467b97STreehugger Robot #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF 116*16467b97STreehugger Robot 117*16467b97STreehugger Robot #define UNI_SUR_HIGH_START (UTF32)0xD800 118*16467b97STreehugger Robot #define UNI_SUR_HIGH_END (UTF32)0xDBFF 119*16467b97STreehugger Robot #define UNI_SUR_LOW_START (UTF32)0xDC00 120*16467b97STreehugger Robot #define UNI_SUR_LOW_END (UTF32)0xDFFF 121*16467b97STreehugger Robot #define halfShift ((UTF32)10) 122*16467b97STreehugger Robot #define halfBase ((UTF32)0x0010000UL) 123*16467b97STreehugger Robot #define halfMask ((UTF32)0x3FFUL) 124*16467b97STreehugger Robot 125*16467b97STreehugger Robot enum ConversionResult { 126*16467b97STreehugger Robot conversionOK, /* conversion successful */ 127*16467b97STreehugger Robot sourceExhausted, /* partial character in source, but hit end */ 128*16467b97STreehugger Robot targetExhausted, /* insuff. room in target for conversion */ 129*16467b97STreehugger Robot sourceIllegal /* source sequence is illegal/malformed */ 130*16467b97STreehugger Robot }; 131*16467b97STreehugger Robot 132*16467b97STreehugger Robot enum ConversionFlags { 133*16467b97STreehugger Robot strictConversion = 0, 134*16467b97STreehugger Robot lenientConversion 135*16467b97STreehugger Robot } ; 136*16467b97STreehugger Robot 137*16467b97STreehugger Robot 138*16467b97STreehugger Robot 139*16467b97STreehugger Robot ANTLR_END_NAMESPACE() 140*16467b97STreehugger Robot 141*16467b97STreehugger Robot #endif 142*16467b97STreehugger Robot 143*16467b97STreehugger Robot /* --------------------------------------------------------------------- */ 144