1 /* 2 * Summary: interface for the encoding conversion functions 3 * Description: interface for the encoding conversion functions needed for 4 * XML basic encoding and iconv() support. 5 * 6 * Related specs are 7 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies 8 * [ISO-10646] UTF-8 and UTF-16 in Annexes 9 * [ISO-8859-1] ISO Latin-1 characters codes. 10 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- 11 * Worldwide Character Encoding -- Version 1.0", Addison- 12 * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is 13 * described in Unicode Technical Report #4. 14 * [US-ASCII] Coded Character Set--7-bit American Standard Code for 15 * Information Interchange, ANSI X3.4-1986. 16 * 17 * Copy: See Copyright for the status of this software. 18 * 19 * Author: Daniel Veillard 20 */ 21 22 #ifndef __XML_CHAR_ENCODING_H__ 23 #define __XML_CHAR_ENCODING_H__ 24 25 #include <libxml/xmlversion.h> 26 27 #ifdef LIBXML_ICONV_ENABLED 28 #include <iconv.h> 29 #endif 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 typedef enum { 36 XML_ENC_ERR_SUCCESS = 0, 37 XML_ENC_ERR_SPACE = -1, 38 XML_ENC_ERR_INPUT = -2, 39 XML_ENC_ERR_PARTIAL = -3, 40 XML_ENC_ERR_INTERNAL = -4, 41 XML_ENC_ERR_MEMORY = -5 42 } xmlCharEncError; 43 44 /* 45 * xmlCharEncoding: 46 * 47 * Predefined values for some standard encodings. 48 * Libxml does not do beforehand translation on UTF8 and ISOLatinX. 49 * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default. 50 * 51 * Anything else would have to be translated to UTF8 before being 52 * given to the parser itself. The BOM for UTF16 and the encoding 53 * declaration are looked at and a converter is looked for at that 54 * point. If not found the parser stops here as asked by the XML REC. A 55 * converter can be registered by the user using xmlRegisterCharEncodingHandler 56 * but the current form doesn't allow stateful transcoding (a serious 57 * problem agreed !). If iconv has been found it will be used 58 * automatically and allow stateful transcoding, the simplest is then 59 * to be sure to enable iconv and to provide iconv libs for the encoding 60 * support needed. 61 * 62 * Note that the generic "UTF-16" is not a predefined value. Instead, only 63 * the specific UTF-16LE and UTF-16BE are present. 64 */ 65 typedef enum { 66 XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ 67 XML_CHAR_ENCODING_NONE= 0, /* No char encoding detected */ 68 XML_CHAR_ENCODING_UTF8= 1, /* UTF-8 */ 69 XML_CHAR_ENCODING_UTF16LE= 2, /* UTF-16 little endian */ 70 XML_CHAR_ENCODING_UTF16BE= 3, /* UTF-16 big endian */ 71 XML_CHAR_ENCODING_UCS4LE= 4, /* UCS-4 little endian */ 72 XML_CHAR_ENCODING_UCS4BE= 5, /* UCS-4 big endian */ 73 XML_CHAR_ENCODING_EBCDIC= 6, /* EBCDIC uh! */ 74 XML_CHAR_ENCODING_UCS4_2143=7, /* UCS-4 unusual ordering */ 75 XML_CHAR_ENCODING_UCS4_3412=8, /* UCS-4 unusual ordering */ 76 XML_CHAR_ENCODING_UCS2= 9, /* UCS-2 */ 77 XML_CHAR_ENCODING_8859_1= 10,/* ISO-8859-1 ISO Latin 1 */ 78 XML_CHAR_ENCODING_8859_2= 11,/* ISO-8859-2 ISO Latin 2 */ 79 XML_CHAR_ENCODING_8859_3= 12,/* ISO-8859-3 */ 80 XML_CHAR_ENCODING_8859_4= 13,/* ISO-8859-4 */ 81 XML_CHAR_ENCODING_8859_5= 14,/* ISO-8859-5 */ 82 XML_CHAR_ENCODING_8859_6= 15,/* ISO-8859-6 */ 83 XML_CHAR_ENCODING_8859_7= 16,/* ISO-8859-7 */ 84 XML_CHAR_ENCODING_8859_8= 17,/* ISO-8859-8 */ 85 XML_CHAR_ENCODING_8859_9= 18,/* ISO-8859-9 */ 86 XML_CHAR_ENCODING_2022_JP= 19,/* ISO-2022-JP */ 87 XML_CHAR_ENCODING_SHIFT_JIS=20,/* Shift_JIS */ 88 XML_CHAR_ENCODING_EUC_JP= 21,/* EUC-JP */ 89 XML_CHAR_ENCODING_ASCII= 22 /* pure ASCII */ 90 } xmlCharEncoding; 91 92 /** 93 * xmlCharEncodingInputFunc: 94 * @out: a pointer to an array of bytes to store the UTF-8 result 95 * @outlen: the length of @out 96 * @in: a pointer to an array of chars in the original encoding 97 * @inlen: the length of @in 98 * 99 * Take a block of chars in the original encoding and try to convert 100 * it to an UTF-8 block of chars out. 101 * 102 * Returns the number of bytes written, -1 if lack of space, or -2 103 * if the transcoding failed. 104 * The value of @inlen after return is the number of octets consumed 105 * if the return value is positive, else unpredictiable. 106 * The value of @outlen after return is the number of octets consumed. 107 */ 108 typedef int (* xmlCharEncodingInputFunc)(unsigned char *out, int *outlen, 109 const unsigned char *in, int *inlen); 110 111 112 /** 113 * xmlCharEncodingOutputFunc: 114 * @out: a pointer to an array of bytes to store the result 115 * @outlen: the length of @out 116 * @in: a pointer to an array of UTF-8 chars 117 * @inlen: the length of @in 118 * 119 * Take a block of UTF-8 chars in and try to convert it to another 120 * encoding. 121 * Note: a first call designed to produce heading info is called with 122 * in = NULL. If stateful this should also initialize the encoder state. 123 * 124 * Returns the number of bytes written, -1 if lack of space, or -2 125 * if the transcoding failed. 126 * The value of @inlen after return is the number of octets consumed 127 * if the return value is positive, else unpredictiable. 128 * The value of @outlen after return is the number of octets produced. 129 */ 130 typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, 131 const unsigned char *in, int *inlen); 132 133 134 /* 135 * Block defining the handlers for non UTF-8 encodings. 136 * If iconv is supported, there are two extra fields. 137 */ 138 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; 139 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; 140 struct _xmlCharEncodingHandler { 141 char *name; 142 xmlCharEncodingInputFunc input; 143 xmlCharEncodingOutputFunc output; 144 #ifdef LIBXML_ICONV_ENABLED 145 iconv_t iconv_in; 146 iconv_t iconv_out; 147 #endif /* LIBXML_ICONV_ENABLED */ 148 #ifdef LIBXML_ICU_ENABLED 149 struct _uconv_t *uconv_in; 150 struct _uconv_t *uconv_out; 151 #endif /* LIBXML_ICU_ENABLED */ 152 }; 153 154 /* 155 * Interfaces for encoding handlers. 156 */ 157 XML_DEPRECATED 158 XMLPUBFUN void 159 xmlInitCharEncodingHandlers (void); 160 XML_DEPRECATED 161 XMLPUBFUN void 162 xmlCleanupCharEncodingHandlers (void); 163 XMLPUBFUN void 164 xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler); 165 XMLPUBFUN int 166 xmlLookupCharEncodingHandler (xmlCharEncoding enc, 167 xmlCharEncodingHandlerPtr *out); 168 XMLPUBFUN int 169 xmlOpenCharEncodingHandler (const char *name, 170 int output, 171 xmlCharEncodingHandlerPtr *out); 172 XMLPUBFUN xmlCharEncodingHandlerPtr 173 xmlGetCharEncodingHandler (xmlCharEncoding enc); 174 XMLPUBFUN xmlCharEncodingHandlerPtr 175 xmlFindCharEncodingHandler (const char *name); 176 XMLPUBFUN xmlCharEncodingHandlerPtr 177 xmlNewCharEncodingHandler (const char *name, 178 xmlCharEncodingInputFunc input, 179 xmlCharEncodingOutputFunc output); 180 181 /* 182 * Interfaces for encoding names and aliases. 183 */ 184 XMLPUBFUN int 185 xmlAddEncodingAlias (const char *name, 186 const char *alias); 187 XMLPUBFUN int 188 xmlDelEncodingAlias (const char *alias); 189 XMLPUBFUN const char * 190 xmlGetEncodingAlias (const char *alias); 191 XMLPUBFUN void 192 xmlCleanupEncodingAliases (void); 193 XMLPUBFUN xmlCharEncoding 194 xmlParseCharEncoding (const char *name); 195 XMLPUBFUN const char * 196 xmlGetCharEncodingName (xmlCharEncoding enc); 197 198 /* 199 * Interfaces directly used by the parsers. 200 */ 201 XMLPUBFUN xmlCharEncoding 202 xmlDetectCharEncoding (const unsigned char *in, 203 int len); 204 205 struct _xmlBuffer; 206 XMLPUBFUN int 207 xmlCharEncOutFunc (xmlCharEncodingHandler *handler, 208 struct _xmlBuffer *out, 209 struct _xmlBuffer *in); 210 211 XMLPUBFUN int 212 xmlCharEncInFunc (xmlCharEncodingHandler *handler, 213 struct _xmlBuffer *out, 214 struct _xmlBuffer *in); 215 XML_DEPRECATED 216 XMLPUBFUN int 217 xmlCharEncFirstLine (xmlCharEncodingHandler *handler, 218 struct _xmlBuffer *out, 219 struct _xmlBuffer *in); 220 XMLPUBFUN int 221 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); 222 223 /* 224 * Export a few useful functions 225 */ 226 #ifdef LIBXML_OUTPUT_ENABLED 227 XMLPUBFUN int 228 UTF8Toisolat1 (unsigned char *out, 229 int *outlen, 230 const unsigned char *in, 231 int *inlen); 232 #endif /* LIBXML_OUTPUT_ENABLED */ 233 XMLPUBFUN int 234 isolat1ToUTF8 (unsigned char *out, 235 int *outlen, 236 const unsigned char *in, 237 int *inlen); 238 #ifdef __cplusplus 239 } 240 #endif 241 242 #endif /* __XML_CHAR_ENCODING_H__ */ 243