/*
 * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
 * Distributed under the Terms of Use in
 * http://www.unicode.org/copyright.html.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of the Unicode data files and any associated documentation
 * (the "Data Files") or Unicode software and any associated documentation
 * (the "Software") to deal in the Data Files or Software
 * without restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, and/or sell copies of
 * the Data Files or Software, and to permit persons to whom the Data Files
 * or Software are furnished to do so, provided that
 * (a) this copyright and permission notice appear with all copies
 * of the Data Files or Software,
 * (b) this copyright and permission notice appear in associated
 * documentation, and
 * (c) there is clear notice in each modified Data File or in the Software
 * as well as in the documentation associated with the Data File(s) or
 * Software that the data or software has been modified.
 *
 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 *
 * Except as contained in this notice, the name of a copyright holder
 * shall not be used in advertising or otherwise to promote the sale,
 * use or other dealings in these Data Files or Software without prior
 * written authorization of the copyright holder.
 */

#ifndef COMMON_CONVERT_UTF_H_
#define COMMON_CONVERT_UTF_H_

/* ---------------------------------------------------------------------

    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers.  The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd.  Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument.  When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error.  When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return.  This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32.  Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    NOTE(review): the two preceding paragraphs disagree about how values
    above 0x10FFFF are treated in lenient mode; the implementation is not
    part of this header, so confirm the actual behaviour there.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed.  When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem.  E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
                 Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

namespace google_breakpad {

/* NOTE(review): unsigned long is wider than 32 bits on LP64 targets, so a
   UTF32 buffer is not necessarily an array of 32-bit units there.  The type
   is part of every signature below and is therefore left unchanged. */
typedef unsigned long   UTF32;    /* at least 32 bits */
typedef unsigned short  UTF16;    /* at least 16 bits */
typedef unsigned char   UTF8;     /* typically 8 bits */
typedef unsigned char   Boolean;  /* 0 or 1 */

/* Some fundamental constants.
   Each replacement list is fully parenthesized and names the type with its
   namespace, so the macros expand to a single operand in any expression
   (e.g. as the operand of sizeof) and also work outside google_breakpad. */
#define UNI_REPLACEMENT_CHAR ((::google_breakpad::UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((::google_breakpad::UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((::google_breakpad::UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((::google_breakpad::UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((::google_breakpad::UTF32)0x0010FFFF)

/* Outcome of a conversion; only the first problem encountered is reported. */
typedef enum {
  conversionOK,     /* conversion successful */
  sourceExhausted,  /* partial character in source, but hit end */
  targetExhausted,  /* insuff. room in target for conversion */
  sourceIllegal     /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects strict or lenient handling, as described in the comment at the
   top of this file. */
typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* The six converters below share one contract (see the comment at the top
   of this file): text in [*sourceStart, sourceEnd) is converted into
   [*targetStart, targetEnd), and both start pointers are advanced past the
   text that was successfully converted. */

/* UTF-8 -> UTF-16. */
ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
                                    const UTF8* sourceEnd,
                                    UTF16** targetStart,
                                    UTF16* targetEnd,
                                    ConversionFlags flags);

/* UTF-16 -> UTF-8. */
ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
                                    const UTF16* sourceEnd,
                                    UTF8** targetStart,
                                    UTF8* targetEnd,
                                    ConversionFlags flags);

/* UTF-8 -> UTF-32. */
ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
                                    const UTF8* sourceEnd,
                                    UTF32** targetStart,
                                    UTF32* targetEnd,
                                    ConversionFlags flags);

/* UTF-32 -> UTF-8. */
ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
                                    const UTF32* sourceEnd,
                                    UTF8** targetStart,
                                    UTF8* targetEnd,
                                    ConversionFlags flags);

/* UTF-16 -> UTF-32. */
ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
                                     const UTF16* sourceEnd,
                                     UTF32** targetStart,
                                     UTF32* targetEnd,
                                     ConversionFlags flags);

/* UTF-32 -> UTF-16. */
ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
                                     const UTF32* sourceEnd,
                                     UTF16** targetStart,
                                     UTF16* targetEnd,
                                     ConversionFlags flags);

/* Presumably reports (as 0 or 1) whether the bytes starting at source form a
   legal UTF-8 sequence, with sourceEnd bounding the readable input; the
   definition is not visible from this header -- TODO confirm. */
Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);

}  // namespace google_breakpad

/* --------------------------------------------------------------------- */

#endif  // COMMON_CONVERT_UTF_H_