xref: /aosp_15_r20/external/cronet/third_party/icu/source/common/ucnv2022.cpp (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2000-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv2022.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000feb03
14 *   created by: Markus W. Scherer
15 *
16 *   Change history:
17 *
18 *   06/29/2000  helena  Major rewrite of the callback APIs.
19 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
20 *                       Changed implementation of toUnicode
21 *                       function
22 *   08/21/2000  Ram     Added support for ISO-2022-KR
23 *   08/29/2000  Ram     Separated implementation of EBCDIC to
24 *                       ucnvebdc.c
25 *   09/20/2000  Ram     Added support for ISO-2022-CN
26 *                       Added implementations for getNextUChar()
27 *                       for specific 2022 country variants.
28 *   10/31/2000  Ram     Implemented offsets logic functions
29 */
30 
31 #include "unicode/utypes.h"
32 
33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34 
35 #include "unicode/ucnv.h"
36 #include "unicode/uset.h"
37 #include "unicode/ucnv_err.h"
38 #include "unicode/ucnv_cb.h"
39 #include "unicode/utf16.h"
40 #include "ucnv_imp.h"
41 #include "ucnv_bld.h"
42 #include "ucnv_cnv.h"
43 #include "ucnvmbcs.h"
44 #include "cstring.h"
45 #include "cmemory.h"
46 #include "uassert.h"
47 
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50  * I am disabling the generic ISO-2022 converter after proposing to do so on
51  * the icu mailing list two days ago.
52  *
53  * Reasons:
54  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55  *    its designation sequences, single shifts with return to the previous state,
56  *    switch-with-no-return to UTF-16BE or similar, etc.
57  *    This is unlike the language-specific variants like ISO-2022-JP which
58  *    require a much smaller repertoire of ISO-2022 features.
59  *    These variants continue to be supported.
60  * 2. I believe that no one is really using the generic ISO-2022 converter
61  *    but rather always one of the language-specific variants.
62  *    Note that ICU's generic ISO-2022 converter has always output one escape
63  *    sequence followed by UTF-8 for the whole stream.
64  * 3. Switching between subcharsets is extremely slow, because each time
65  *    the previous converter is closed and a new one opened,
66  *    without any kind of caching, least-recently-used list, etc.
67  * 4. The code is currently buggy, and given the above it does not seem
68  *    reasonable to spend the time on maintenance.
69  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70  *    This means, for example, that when ISO-8859-7 is designated, the following
71  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72  *    The ICU ISO-2022 converter does not handle this - and has no information
73  *    about which subconverter would have to be shifted vs. which is designed
74  *    for 7-bit ISO-2022.
75  *
76  * Markus Scherer 2003-dec-03
77  */
78 #endif
79 
80 #if !UCONFIG_ONLY_HTML_CONVERSION
81 static const char SHIFT_IN_STR[]  = "\x0F";
82 // static const char SHIFT_OUT_STR[] = "\x0E";
83 #endif
84 
/* ASCII code values: carriage return, line feed, horizontal tab, vertical tab, space. */
85 #define CR      0x0D
86 #define LF      0x0A
87 #define H_TAB   0x09
88 #define V_TAB   0x0B
89 #define SPACE   0x20
90 
/* First and last code point (inclusive) of the halfwidth Katakana range U+FF61..U+FF9F. */
91 enum {
92     HWKANA_START=0xff61,
93     HWKANA_END=0xff9f
94 };
95 
96 /*
97  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98  * as bytes 21..7E. (Subtract 0x80.)
99  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100  * as bytes 20..7F. (Subtract 0x80.)
101  * Do not encode C1 control codes with native bytes 80..9F
102  * as bytes 00..1F (C0 control codes).
103  */
104 enum {
105     GR94_START=0xa1,
106     GR94_END=0xfe,
107     GR96_START=0xa0,
108     GR96_END=0xff
109 };
110 
111 /*
112  * ISO 2022 control codes must not be converted from Unicode
113  * because they would mess up the byte stream.
114  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115  * corresponding to SO, SI, and ESC.
116  */
/*
 * Non-zero if c is SO (0x0e), SI (0x0f) or ESC (0x1b).
 * The (c)<0x20 test runs first and keeps the shift count below 32.
 * Caveats: c is expanded twice, so the argument must not have side effects;
 * c is presumably never negative (a code point) -- confirm at the call sites,
 * since a negative value would pass the first test and then be used as a shift count.
 */
117 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118 
119 /* for ISO-2022-JP and -CN implementations */
/*
 * Charset identifiers shared by the JP and CN variants.
 *
 * The JP and CN groups deliberately reuse the same small integers
 * (e.g. ISO8859_1 and GB2312_1 are both 1, JISX201 and CNS_11643 are both 3),
 * so a value is only meaningful together with the variant it belongs to.
 * The small values double as indexes into myConverterArray[]; the JP values
 * (0..8) are also used as bit positions by CSM() for jpCharsetMasks[].
 */
120 typedef enum  {
121         /* shared values */
122         INVALID_STATE=-1,
123         ASCII = 0,
124 
125         SS2_STATE=0x10,
126         SS3_STATE,
127 
128         /* JP */
129         ISO8859_1 = 1 ,
130         ISO8859_7 = 2 ,
131         JISX201  = 3,
132         JISX208 = 4,
133         JISX212 = 5,
134         GB2312  =6,
135         KSC5601 =7,
136         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
137 
138         /* CN */
139         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
140         GB2312_1=1,
141         ISO_IR_165=2,
142         CNS_11643=3,
143 
144         /*
145          * these are used in StateEnum and ISO2022State variables,
146          * but CNS_11643 must be used to index into myConverterArray[]
147          */
148         CNS_11643_0=0x20,
149         CNS_11643_1,
150         CNS_11643_2,
151         CNS_11643_3,
152         CNS_11643_4,
153         CNS_11643_5,
154         CNS_11643_6,
155         CNS_11643_7
156 } StateEnum;
157 
158 /* is the StateEnum charset value for a DBCS charset? */
/*
 * HTML-only builds recognize JISX208 as the only double-byte JP charset.
 * Otherwise the range test relies on JISX208, JISX212, GB2312 and KSC5601
 * having consecutive StateEnum values (4..7). The argument is expanded twice
 * in that form, so it must not have side effects.
 */
159 #if UCONFIG_ONLY_HTML_CONVERSION
160 #define IS_JP_DBCS(cs) (JISX208==(cs))
161 #else
162 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163 #endif
164 
/* Charset mask: the single bit at position cs (a StateEnum value); combined with | to build the entries of jpCharsetMasks[]. */
165 #define CSM(cs) ((uint16_t)1<<(cs))
166 
167 /*
168  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170  *
171  * Note: The converter uses some leniency:
172  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173  *   all versions, not just JIS7 and JIS8.
174  * - ICU does not distinguish between different versions of JIS X 0208.
175  */
/*
 * MAX_JA_VERSION is the highest supported "version" option for locale=ja;
 * jpCharsetMasks[] is indexed by that version (0..MAX_JA_VERSION).
 * HTML-only builds provide version 0 only.
 * Versions 2, 3 and 4 carry identical masks here; whatever distinguishes them
 * is not expressed in this table.
 */
176 #if UCONFIG_ONLY_HTML_CONVERSION
177 enum { MAX_JA_VERSION=0 };
178 #else
179 enum { MAX_JA_VERSION=4 };
180 #endif
181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183 #if !UCONFIG_ONLY_HTML_CONVERSION
184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188 #endif
189 };
190 
/*
 * Type of UConverterDataISO2022::currentType, which _ISO2022Open() initializes to ASCII1.
 * NOTE(review): judging by the names, the values classify the active charset as
 * ASCII, Latin-1, single-byte, double-byte, multi-byte or halfwidth Katakana --
 * confirm against the conversion functions that read currentType.
 */
191 typedef enum {
192         ASCII1=0,
193         LATIN1,
194         SBCS,
195         DBCS,
196         MBCS,
197         HWKANA
198 }Cnv2022Type;
199 
/*
 * Shift/designation state for one conversion direction; UConverterDataISO2022
 * holds one instance for to-Unicode and one for from-Unicode.
 * The cs[] entries hold charset numbers (StateEnum values).
 */
200 typedef struct ISO2022State {
201     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203     int8_t prevG;       /* g before single shift (SS2 or SS3) */
204 } ISO2022State;
205 
/* Mask that _ISO2022Open() applies to UConverterLoadArgs::options to extract the version (low 4 bits). */
206 #define UCNV_OPTIONS_VERSION_MASK 0xf
/* Number of slots in UConverterDataISO2022::myConverterArray[]. */
207 #define UCNV_2022_MAX_CONVERTERS 10
208 
/*
 * Per-converter data for the ISO-2022 variants. _ISO2022Open() allocates it,
 * zero-fills it and stores it in UConverter::extraInfo; _ISO2022Close()
 * releases it.
 */
209 typedef struct{
    /* shared data of the sub-charsets, indexed by StateEnum value (JP and CN variants); unused slots are nullptr */
210     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter opened with ucnv_open() by the KR variant; closed in _ISO2022Close() */
211     UConverter *currentConverter;
212     Cnv2022Type currentType;
    /* independent shift/designation state for each conversion direction */
213     ISO2022State toU2022State, fromU2022State;
    /* NOTE(review): presumably the accumulated escape-sequence key described with the state tables -- confirm */
214     uint32_t key;
    /* "version" option: pArgs->options & UCNV_OPTIONS_VERSION_MASK */
215     uint32_t version;
216 #ifdef U_ENABLE_GENERIC_ISO_2022
217     UBool isFirstBuffer;
218 #endif
219     UBool isEmptySegment;
    /* converter name, e.g. "ISO_2022,locale=ja,version=0" */
220     char name[30];
    /* two-letter tag set by _ISO2022Open(): "ja", "ko" or "cn" */
221     char locale[3];
222 }UConverterDataISO2022;
223 
224 /* Protos */
225 /* ISO-2022 ----------------------------------------------------------------- */
226 
227 /*Forward declaration */
228 U_CFUNC void U_CALLCONV
229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230                       UErrorCode * err);
231 U_CFUNC void U_CALLCONV
232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233                                     UErrorCode * err);
234 
235 #define ESC_2022 0x1B /*ESC*/
236 
237 typedef enum
238 {
239         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
240         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
241         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
242         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
243 } UCNV_TableStates_2022;
244 
245 /*
246 * The way these state transition arrays work is:
247 * ex : ESC$B is the sequence for JISX208
248 *      a) First Iteration: char is ESC
249 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250 *             int x = normalize_esq_chars_2022[27] which is equal to 1
251 *         ii) Search for this value in escSeqStateTable_Key_2022[]
252 *             value of x is stored at escSeqStateTable_Key_2022[0]
253 *        iii) Save this index as offset
254 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256 *     b) Switch on this state and continue to next char
257 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258 *             which is normalize_esq_chars_2022[36] == 4
259 *         ii) x is currently 1(from above)
260 *               x<<=5 -- x is now 32
261 *               x+=normalize_esq_chars_2022[36]
262 *               now x is 36
263 *        iii) Search for this value in escSeqStateTable_Key_2022[]
264 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267 *     c) Switch on this state and continue to next char
268 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269 *        ii) x is currently 36 (from above)
270 *            x<<=5 -- x is now 1152
271 *            x+=normalize_esq_chars_2022[66]
272 *            now x is 1161
273 *       iii) Search for this value in escSeqStateTable_Key_2022[]
274 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278 */
279 
280 
281 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value (the index, 0..255) to a small code used to build
 * escape-sequence keys. 0 means the byte takes part in no known escape
 * sequence. All non-zero codes are at most 29, so each fits in the 5 bits by
 * which the key is shifted per byte (see the description above).
 * Examples: ESC (27) -> 1, '$' (36) -> 4, 'B' (66) -> 9.
 * Each row below holds ten consecutive entries.
 */
282 static const int8_t normalize_esq_chars_2022[256] = {
283 /*       0      1       2       3       4      5       6        7       8       9           */
284 
285          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
288         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
289         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
290         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
292         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
293         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
294         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
295         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
296         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310         ,0     ,0      ,0      ,0      ,0      ,0
311 };
312 
313 #ifdef U_ENABLE_GENERIC_ISO_2022
314 /*
315  * When the generic ISO-2022 converter is completely removed, not just disabled
316  * per #ifdef, then the following state table and the associated tables that are
317  * dimensioned with MAX_STATES_2022 should be trimmed.
318  *
319  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320  * the associated escape sequences starting with ESC ( B should be removed.
321  * This includes the ones with key values 1097 and all of the ones above 1000000.
322  *
323  * For the latter, the tables can simply be truncated.
324  * For the former, since the tables must be kept parallel, it is probably best
325  * to simply duplicate an adjacent table cell, parallel in all tables.
326  *
327  * It may make sense to restructure the tables, especially by using small search
328  * tables for the variants instead of indexing them parallel to the table here.
329  */
330 #endif
331 
/*
 * MAX_STATES_2022 is the common length of the parallel escape-sequence tables.
 *
 * escSeqStateTable_Key_2022[] lists, in ascending order, every key that
 * corresponds to a recognized escape-sequence prefix or complete sequence.
 * The position of a key in this table is the index to use in the parallel
 * escSeqStateTable_Value_2022[] (and, when compiled in,
 * escSeqStateTable_Result_2022[]) tables.
 * Example: ESC $ B yields key 1161, which is entry 24.
 */
332 #define MAX_STATES_2022 74
333 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334 /*   0           1           2           3           4           5           6           7           8           9           */
335 
336      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
337     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
338     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
339     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
340     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
341     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
342     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
343     ,35947631   ,35947635   ,35947636   ,35947638
344 };
345 
346 #ifdef U_ENABLE_GENERIC_ISO_2022
347 
/*
 * Converter names parallel to escSeqStateTable_Key_2022[] (same index);
 * only compiled when the generic ISO-2022 converter is enabled.
 * An entry is nullptr where no converter name is associated with the key.
 * Example: entry 24 (key 1161, ESC $ B) is "JISX-208".
 */
348 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350 
351      nullptr                   ,nullptr                   ,nullptr                   ,nullptr               ,nullptr               ,nullptr                   ,nullptr                   ,nullptr                   ,"latin1"               ,"latin1"
352     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353     ,"latin1"               ,nullptr                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,nullptr                   ,nullptr                   ,nullptr                   ,nullptr                   ,"UTF8"
354     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,nullptr               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,nullptr               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359 };
360 
361 #endif
362 
/*
 * UCNV_TableStates_2022 classification for each key, parallel to
 * escSeqStateTable_Key_2022[] (same index): whether the bytes seen so far form
 * a prefix (VALID_NON_TERMINAL_2022), a complete sequence (VALID_TERMINAL_2022),
 * or a complete sequence that may also be extended (VALID_MAYBE_TERMINAL_2022,
 * used only by entry 10, key 1097).
 */
363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373 };
374 
375 /* Type def for refactoring changeState_2022 code*/
/*
 * Identifies the ISO-2022 variant. The numeric values are fixed regardless of
 * build configuration: ISO_2022 (0) exists only when the generic converter is
 * compiled in, and ISO_2022_KR/ISO_2022_CN are omitted from HTML-only builds.
 */
376 typedef enum{
377 #ifdef U_ENABLE_GENERIC_ISO_2022
378     ISO_2022=0,
379 #endif
380     ISO_2022_JP=1,
381 #if !UCONFIG_ONLY_HTML_CONVERSION
382     ISO_2022_KR=2,
383     ISO_2022_CN=3
384 #endif
385 } Variant2022;
386 
387 /*********** ISO 2022 Converter Protos ***********/
388 static void U_CALLCONV
389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390 
391 static void U_CALLCONV
392  _ISO2022Close(UConverter *converter);
393 
394 static void U_CALLCONV
395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396 
397 U_CDECL_BEGIN
398 static const char * U_CALLCONV
399 _ISO2022getName(const UConverter* cnv);
400 U_CDECL_END
401 
402 static void  U_CALLCONV
403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404 
405 U_CDECL_BEGIN
406 static UConverter * U_CALLCONV
407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408 
409 U_CDECL_END
410 
411 #ifdef U_ENABLE_GENERIC_ISO_2022
412 static void U_CALLCONV
413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414 #endif
415 
416 namespace {
417 
418 /*const UConverterSharedData _ISO2022Data;*/
419 extern const UConverterSharedData _ISO2022JPData;
420 
421 #if !UCONFIG_ONLY_HTML_CONVERSION
422 extern const UConverterSharedData _ISO2022KRData;
423 extern const UConverterSharedData _ISO2022CNData;
424 #endif
425 
426 }  // namespace
427 
428 /*************** Converter implementations ******************/
429 
430 /* The purpose of this function is to get around gcc compiler warnings. */
431 static inline void
fromUWriteUInt8(UConverter * cnv,const char * bytes,int32_t length,uint8_t ** target,const char * targetLimit,int32_t ** offsets,int32_t sourceIndex,UErrorCode * pErrorCode)432 fromUWriteUInt8(UConverter *cnv,
433                  const char *bytes, int32_t length,
434                  uint8_t **target, const char *targetLimit,
435                  int32_t **offsets,
436                  int32_t sourceIndex,
437                  UErrorCode *pErrorCode)
438 {
439     char *targetChars = (char *)*target;
440     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441                          offsets, sourceIndex, pErrorCode);
442     *target = (uint8_t*)targetChars;
443 
444 }
445 
446 static inline void
setInitialStateToUnicodeKR(UConverter *,UConverterDataISO2022 * myConverterData)447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448     if(myConverterData->version == 1) {
449         UConverter *cnv = myConverterData->currentConverter;
450 
451         cnv->toUnicodeStatus=0;     /* offset */
452         cnv->mode=0;                /* state */
453         cnv->toULength=0;           /* byteIndex */
454     }
455 }
456 
457 static inline void
setInitialStateFromUnicodeKR(UConverter * converter,UConverterDataISO2022 * myConverterData)458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459    /* in ISO-2022-KR the designator sequence appears only once
460     * in a file so we append it only once
461     */
462     if( converter->charErrorBufferLength==0){
463 
464         converter->charErrorBufferLength = 4;
465         converter->charErrorBuffer[0] = 0x1b;
466         converter->charErrorBuffer[1] = 0x24;
467         converter->charErrorBuffer[2] = 0x29;
468         converter->charErrorBuffer[3] = 0x43;
469     }
470     if(myConverterData->version == 1) {
471         UConverter *cnv = myConverterData->currentConverter;
472 
473         cnv->fromUChar32=0;
474         cnv->fromUnicodeStatus=1;   /* prevLength */
475     }
476 }
477 
478 static void U_CALLCONV
_ISO2022Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * errorCode)479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480 
481     char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'};
482 
483     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484     if(cnv->extraInfo != nullptr) {
485         UConverterNamePieces stackPieces;
486         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488         uint32_t version;
489 
490         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491 
492         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493         myConverterData->currentType = ASCII1;
494         cnv->fromUnicodeStatus =false;
495         if(pArgs->locale){
496             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
497         }
498         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499         myConverterData->version = version;
500         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501             (myLocale[2]=='_' || myLocale[2]=='\0'))
502         {
503             /* open the required converters and cache them */
504             if(version>MAX_JA_VERSION) {
505                 // ICU 55 fails to open a converter for an unsupported version.
506                 // Previously, it fell back to version 0, but that would yield
507                 // unexpected behavior.
508                 *errorCode = U_MISSING_RESOURCE_ERROR;
509                 return;
510             }
511             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512                 myConverterData->myConverterArray[ISO8859_7] =
513                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514             }
515             myConverterData->myConverterArray[JISX208] =
516                 ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode);
517             if(jpCharsetMasks[version]&CSM(JISX212)) {
518                 myConverterData->myConverterArray[JISX212] =
519                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520             }
521             if(jpCharsetMasks[version]&CSM(GB2312)) {
522                 myConverterData->myConverterArray[GB2312] =
523                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524             }
525             if(jpCharsetMasks[version]&CSM(KSC5601)) {
526                 myConverterData->myConverterArray[KSC5601] =
527                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528             }
529 
530             /* set the function pointers to appropriate functions */
531             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532             uprv_strcpy(myConverterData->locale,"ja");
533 
534             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535             size_t len = uprv_strlen(myConverterData->name);
536             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537             myConverterData->name[len+1]='\0';
538         }
539 #if !UCONFIG_ONLY_HTML_CONVERSION
540         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541             (myLocale[2]=='_' || myLocale[2]=='\0'))
542         {
543             if(version>1) {
544                 // ICU 55 fails to open a converter for an unsupported version.
545                 // Previously, it fell back to version 0, but that would yield
546                 // unexpected behavior.
547                 *errorCode = U_MISSING_RESOURCE_ERROR;
548                 return;
549             }
550             const char *cnvName;
551             if(version==1) {
552                 cnvName="icu-internal-25546";
553             } else {
554                 cnvName="ibm-949";
555                 myConverterData->version=version=0;
556             }
557             if(pArgs->onlyTestIsLoadable) {
558                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559                 uprv_free(cnv->extraInfo);
560                 cnv->extraInfo=nullptr;
561                 return;
562             } else {
563                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564                 if (U_FAILURE(*errorCode)) {
565                     _ISO2022Close(cnv);
566                     return;
567                 }
568 
569                 if(version==1) {
570                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573                 }else{
574                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575                 }
576 
577                 /* initialize the state variables */
578                 setInitialStateToUnicodeKR(cnv, myConverterData);
579                 setInitialStateFromUnicodeKR(cnv, myConverterData);
580 
581                 /* set the function pointers to appropriate functions */
582                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583                 uprv_strcpy(myConverterData->locale,"ko");
584             }
585         }
586         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587             (myLocale[2]=='_' || myLocale[2]=='\0'))
588         {
589             if(version>2) {
590                 // ICU 55 fails to open a converter for an unsupported version.
591                 // Previously, it fell back to version 0, but that would yield
592                 // unexpected behavior.
593                 *errorCode = U_MISSING_RESOURCE_ERROR;
594                 return;
595             }
596 
597             /* open the required converters and cache them */
598             myConverterData->myConverterArray[GB2312_1] =
599                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600             if(version==1) {
601                 myConverterData->myConverterArray[ISO_IR_165] =
602                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603             }
604             myConverterData->myConverterArray[CNS_11643] =
605                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606 
607 
608             /* set the function pointers to appropriate functions */
609             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610             uprv_strcpy(myConverterData->locale,"cn");
611 
612             if (version==0){
613                 myConverterData->version = 0;
614                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615             }else if (version==1){
616                 myConverterData->version = 1;
617                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618             }else {
619                 myConverterData->version = 2;
620                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621             }
622         }
623 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
624         else{
625 #ifdef U_ENABLE_GENERIC_ISO_2022
626             myConverterData->isFirstBuffer = true;
627 
628             /* append the UTF-8 escape sequence */
629             cnv->charErrorBufferLength = 3;
630             cnv->charErrorBuffer[0] = 0x1b;
631             cnv->charErrorBuffer[1] = 0x25;
632             cnv->charErrorBuffer[2] = 0x42;
633 
634             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635             /* initialize the state variables */
636             uprv_strcpy(myConverterData->name,"ISO_2022");
637 #else
638             *errorCode = U_MISSING_RESOURCE_ERROR;
639             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640             // data loading error code.
641             return;
642 #endif
643         }
644 
645         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646 
647         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648             _ISO2022Close(cnv);
649         }
650     } else {
651         *errorCode = U_MEMORY_ALLOCATION_ERROR;
652     }
653 }
654 
655 
656 static void U_CALLCONV
_ISO2022Close(UConverter * converter)657 _ISO2022Close(UConverter *converter) {
658     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659     UConverterSharedData **array = myData->myConverterArray;
660     int32_t i;
661 
662     if (converter->extraInfo != nullptr) {
663         /*close the array of converter pointers and free the memory*/
664         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665             if(array[i]!=nullptr) {
666                 ucnv_unloadSharedDataIfReady(array[i]);
667             }
668         }
669 
670         ucnv_close(myData->currentConverter);
671 
672         if(!converter->isExtraLocal){
673             uprv_free (converter->extraInfo);
674             converter->extraInfo = nullptr;
675         }
676     }
677 }
678 
679 static void U_CALLCONV
_ISO2022Reset(UConverter * converter,UConverterResetChoice choice)680 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682     if(choice<=UCNV_RESET_TO_UNICODE) {
683         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684         myConverterData->key = 0;
685         myConverterData->isEmptySegment = false;
686     }
687     if(choice!=UCNV_RESET_TO_UNICODE) {
688         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689     }
690 #ifdef U_ENABLE_GENERIC_ISO_2022
691     if(myConverterData->locale[0] == 0){
692         if(choice<=UCNV_RESET_TO_UNICODE) {
693             myConverterData->isFirstBuffer = true;
694             myConverterData->key = 0;
695             if (converter->mode == UCNV_SO){
696                 ucnv_close (myConverterData->currentConverter);
697                 myConverterData->currentConverter=nullptr;
698             }
699             converter->mode = UCNV_SI;
700         }
701         if(choice!=UCNV_RESET_TO_UNICODE) {
702             /* re-append UTF-8 escape sequence */
703             converter->charErrorBufferLength = 3;
704             converter->charErrorBuffer[0] = 0x1b;
705             converter->charErrorBuffer[1] = 0x28;
706             converter->charErrorBuffer[2] = 0x42;
707         }
708     }
709     else
710 #endif
711     {
712         /* reset the state variables */
713         if(myConverterData->locale[0] == 'k'){
714             if(choice<=UCNV_RESET_TO_UNICODE) {
715                 setInitialStateToUnicodeKR(converter, myConverterData);
716             }
717             if(choice!=UCNV_RESET_TO_UNICODE) {
718                 setInitialStateFromUnicodeKR(converter, myConverterData);
719             }
720         }
721     }
722 }
723 
724 U_CDECL_BEGIN
725 
726 static const char * U_CALLCONV
_ISO2022getName(const UConverter * cnv)727 _ISO2022getName(const UConverter* cnv){
728     if(cnv->extraInfo){
729         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730         return myData->name;
731     }
732     return nullptr;
733 }
734 
735 U_CDECL_END
736 
737 
738 /*************** to unicode *******************/
/****************************************************************************
 * Recognized escape sequences are
 * <ESC>(B  ASCII
 * <ESC>.A  ISO-8859-1
 * <ESC>.F  ISO-8859-7
 * <ESC>(J  JIS X 0201 Roman
 * <ESC>(I  JIS X 0201 Katakana (7-bit half-width)
 * <ESC>$B  JIS X 0208
 * <ESC>$@  JIS X 0208
 * <ESC>$(D JIS X 0212
 * <ESC>$A  GB2312
 * <ESC>$(C KSC5601
 */
752 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753 /*      0                1               2               3               4               5               6               7               8               9    */
754     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
755     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
756     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
757     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
758     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
759     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
760     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
761     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
762 };
763 
764 #if !UCONFIG_ONLY_HTML_CONVERSION
765 /*************** to unicode *******************/
766 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767 /*      0                1               2               3               4               5               6               7               8               9    */
768      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
769     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
770     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
771     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
772     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
773     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
774     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
775     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
776 };
777 #endif
778 
779 
780 static UCNV_TableStates_2022
getKey_2022(char c,int32_t * key,int32_t * offset)781 getKey_2022(char c,int32_t* key,int32_t* offset){
782     int32_t togo;
783     int32_t low = 0;
784     int32_t hi = MAX_STATES_2022;
785     int32_t oldmid=0;
786 
787     togo = normalize_esq_chars_2022[(uint8_t)c];
788     if(togo == 0) {
789         /* not a valid character anywhere in an escape sequence */
790         *key = 0;
791         *offset = 0;
792         return INVALID_2022;
793     }
794     togo = (*key << 5) + togo;
795 
796     while (hi != low)  /*binary search*/{
797 
798         int32_t mid = (hi+low) >> 1; /*Finds median*/
799 
800         if (mid == oldmid)
801             break;
802 
803         if (escSeqStateTable_Key_2022[mid] > togo){
804             hi = mid;
805         }
806         else if (escSeqStateTable_Key_2022[mid] < togo){
807             low = mid;
808         }
809         else /*we found it*/{
810             *key = togo;
811             *offset = mid;
812             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
813         }
814         oldmid = mid;
815 
816     }
817 
818     *key = 0;
819     *offset = 0;
820     return INVALID_2022;
821 }
822 
823 /*runs through a state machine to determine the escape sequence - codepage correspondence
824  */
825 static void
changeState_2022(UConverter * _this,const char ** source,const char * sourceLimit,Variant2022 var,UErrorCode * err)826 changeState_2022(UConverter* _this,
827                 const char** source,
828                 const char* sourceLimit,
829                 Variant2022 var,
830                 UErrorCode* err){
831     UCNV_TableStates_2022 value;
832     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833     uint32_t key = myData2022->key;
834     int32_t offset = 0;
835     int8_t initialToULength = _this->toULength;
836     char c;
837 
838     value = VALID_NON_TERMINAL_2022;
839     while (*source < sourceLimit) {
840         c = *(*source)++;
841         _this->toUBytes[_this->toULength++]=(uint8_t)c;
842         value = getKey_2022(c,(int32_t *) &key, &offset);
843 
844         switch (value){
845 
846         case VALID_NON_TERMINAL_2022 :
847             /* continue with the loop */
848             break;
849 
850         case VALID_TERMINAL_2022:
851             key = 0;
852             goto DONE;
853 
854         case INVALID_2022:
855             goto DONE;
856 
857         case VALID_MAYBE_TERMINAL_2022:
858 #ifdef U_ENABLE_GENERIC_ISO_2022
859             /* ESC ( B is ambiguous only for ISO_2022 itself */
860             if(var == ISO_2022) {
861                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862                 _this->toULength = 0;
863 
864                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865 
866                 /* continue with the loop */
867                 value = VALID_NON_TERMINAL_2022;
868                 break;
869             } else
870 #endif
871             {
872                 /* not ISO_2022 itself, finish here */
873                 value = VALID_TERMINAL_2022;
874                 key = 0;
875                 goto DONE;
876             }
877         }
878     }
879 
880 DONE:
881     myData2022->key = key;
882 
883     if (value == VALID_NON_TERMINAL_2022) {
884         /* indicate that the escape sequence is incomplete: key!=0 */
885         return;
886     } else if (value == INVALID_2022 ) {
887         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888     } else /* value == VALID_TERMINAL_2022 */ {
889         switch(var){
890 #ifdef U_ENABLE_GENERIC_ISO_2022
891         case ISO_2022:
892         {
893             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894             if(chosenConverterName == nullptr) {
895                 /* SS2 or SS3 */
896                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897                 _this->toUCallbackReason = UCNV_UNASSIGNED;
898                 return;
899             }
900 
901             _this->mode = UCNV_SI;
902             ucnv_close(myData2022->currentConverter);
903             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904             if(U_SUCCESS(*err)) {
905                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906                 _this->mode = UCNV_SO;
907             }
908             break;
909         }
910 #endif
911         case ISO_2022_JP:
912             {
913                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914                 switch(tempState) {
915                 case INVALID_STATE:
916                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917                     break;
918                 case SS2_STATE:
919                     if(myData2022->toU2022State.cs[2]!=0) {
920                         if(myData2022->toU2022State.g<2) {
921                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922                         }
923                         myData2022->toU2022State.g=2;
924                     } else {
925                         /* illegal to have SS2 before a matching designator */
926                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927                     }
928                     break;
929                 /* case SS3_STATE: not used in ISO-2022-JP-x */
930                 case ISO8859_1:
931                 case ISO8859_7:
932                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934                     } else {
935                         /* G2 charset for SS2 */
936                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
937                     }
938                     break;
939                 default:
940                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942                     } else {
943                         /* G0 charset */
944                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
945                     }
946                     break;
947                 }
948             }
949             break;
950 #if !UCONFIG_ONLY_HTML_CONVERSION
951         case ISO_2022_CN:
952             {
953                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954                 switch(tempState) {
955                 case INVALID_STATE:
956                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957                     break;
958                 case SS2_STATE:
959                     if(myData2022->toU2022State.cs[2]!=0) {
960                         if(myData2022->toU2022State.g<2) {
961                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962                         }
963                         myData2022->toU2022State.g=2;
964                     } else {
965                         /* illegal to have SS2 before a matching designator */
966                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967                     }
968                     break;
969                 case SS3_STATE:
970                     if(myData2022->toU2022State.cs[3]!=0) {
971                         if(myData2022->toU2022State.g<2) {
972                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973                         }
974                         myData2022->toU2022State.g=3;
975                     } else {
976                         /* illegal to have SS3 before a matching designator */
977                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978                     }
979                     break;
980                 case ISO_IR_165:
981                     if(myData2022->version==0) {
982                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983                         break;
984                     }
985                     U_FALLTHROUGH;
986                 case GB2312_1:
987                     U_FALLTHROUGH;
988                 case CNS_11643_1:
989                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
990                     break;
991                 case CNS_11643_2:
992                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
993                     break;
994                 default:
995                     /* other CNS 11643 planes */
996                     if(myData2022->version==0) {
997                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998                     } else {
999                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000                     }
1001                     break;
1002                 }
1003             }
1004             break;
1005         case ISO_2022_KR:
1006             if(offset==0x30){
1007                 /* nothing to be done, just accept this one escape sequence */
1008             } else {
1009                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010             }
1011             break;
1012 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013 
1014         default:
1015             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016             break;
1017         }
1018     }
1019     if(U_SUCCESS(*err)) {
1020         _this->toULength = 0;
1021     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022         if(_this->toULength>1) {
1023             /*
1024              * Ticket 5691: consistent illegal sequences:
1025              * - We include at least the first byte (ESC) in the illegal sequence.
1026              * - If any of the non-initial bytes could be the start of a character,
1027              *   we stop the illegal sequence before the first one of those.
1028              *   In escape sequences, all following bytes are "printable", that is,
1029              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030              *   they are valid single/lead bytes.
1031              *   For simplicity, we always only report the initial ESC byte as the
1032              *   illegal sequence and back out all other bytes we looked at.
1033              */
1034             /* Back out some bytes. */
1035             int8_t backOutDistance=_this->toULength-1;
1036             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037             if(backOutDistance<=bytesFromThisBuffer) {
1038                 /* same as initialToULength<=1 */
1039                 *source-=backOutDistance;
1040             } else {
1041                 /* Back out bytes from the previous buffer: Need to replay them. */
1042                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043                 /* same as -(initialToULength-1) */
1044                 /* preToULength is negative! */
1045                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046                 *source-=bytesFromThisBuffer;
1047             }
1048             _this->toULength=1;
1049         }
1050     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051         _this->toUCallbackReason = UCNV_UNASSIGNED;
1052     }
1053 }
1054 
1055 #if !UCONFIG_ONLY_HTML_CONVERSION
1056 /*Checks the characters of the buffer against valid 2022 escape sequences
1057 *if the match we return a pointer to the initial start of the sequence otherwise
1058 *we return sourceLimit
1059 */
1060 /*for 2022 looks ahead in the stream
1061  *to determine the longest possible convertible
1062  *data stream
1063  */
1064 static inline const char*
getEndOfBuffer_2022(const char ** source,const char * sourceLimit,UBool)1065 getEndOfBuffer_2022(const char** source,
1066                    const char* sourceLimit,
1067                    UBool /*flush*/){
1068 
1069     const char* mySource = *source;
1070 
1071 #ifdef U_ENABLE_GENERIC_ISO_2022
1072     if (*source >= sourceLimit)
1073         return sourceLimit;
1074 
1075     do{
1076 
1077         if (*mySource == ESC_2022){
1078             int8_t i;
1079             int32_t key = 0;
1080             int32_t offset;
1081             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082 
1083             /* Kludge: I could not
1084             * figure out the reason for validating an escape sequence
1085             * twice - once here and once in changeState_2022().
1086             * is it possible to have an ESC character in a ISO2022
1087             * byte stream which is valid in a code page? Is it legal?
1088             */
1089             for (i=0;
1090             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091             i++) {
1092                 value =  getKey_2022(*(mySource+i), &key, &offset);
1093             }
1094             if (value > 0 || *mySource==ESC_2022)
1095                 return mySource;
1096 
1097             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098                 return sourceLimit;
1099         }
1100     }while (++mySource < sourceLimit);
1101 
1102     return sourceLimit;
1103 #else
1104     while(mySource < sourceLimit && *mySource != ESC_2022) {
1105         ++mySource;
1106     }
1107     return mySource;
1108 #endif
1109 }
1110 #endif
1111 
1112 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113  * any future change in _MBCSFromUChar32() function should be reflected here.
1114  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115  */
1116 static inline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData * sharedData,UChar32 c,uint32_t * value,UBool useFallback,int outputType)1117 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118                                          UChar32 c,
1119                                          uint32_t* value,
1120                                          UBool useFallback,
1121                                          int outputType)
1122 {
1123     const int32_t *cx;
1124     const uint16_t *table;
1125     uint32_t stage2Entry;
1126     uint32_t myValue;
1127     int32_t length;
1128     const uint8_t *p;
1129     /*
1130      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131      * Use internal version of ucnv_open() that verifies that the new structures are available,
1132      * else U_INTERNAL_PROGRAM_ERROR.
1133      */
1134     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136         table=sharedData->mbcs.fromUnicodeTable;
1137         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138         /* get the bytes and the length for the output */
1139         if(outputType==MBCS_OUTPUT_2){
1140             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141             if(myValue<=0xff) {
1142                 length=1;
1143             } else {
1144                 length=2;
1145             }
1146         } else /* outputType==MBCS_OUTPUT_3 */ {
1147             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149             if(myValue<=0xff) {
1150                 length=1;
1151             } else if(myValue<=0xffff) {
1152                 length=2;
1153             } else {
1154                 length=3;
1155             }
1156         }
1157         /* is this code point assigned, or do we use fallbacks? */
1158         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159             /* assigned */
1160             *value=myValue;
1161             return length;
1162         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163             /*
1164              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165              * There is no way with this data structure for fallback output
1166              * to be a zero byte.
1167              */
1168             *value=myValue;
1169             return -length;
1170         }
1171     }
1172 
1173     cx=sharedData->mbcs.extIndexes;
1174     if(cx!=nullptr) {
1175         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176     }
1177 
1178     /* unassigned */
1179     return 0;
1180 }
1181 
1182 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184  * @param retval pointer to output byte
1185  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186  */
1187 static inline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData * sharedData,UChar32 c,uint32_t * retval,UBool useFallback)1188 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189                                        UChar32 c,
1190                                        uint32_t* retval,
1191                                        UBool useFallback)
1192 {
1193     const uint16_t *table;
1194     int32_t value;
1195     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197         return 0;
1198     }
1199     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200     table=sharedData->mbcs.fromUnicodeTable;
1201     /* get the byte for the output */
1202     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203     /* is this code point assigned, or do we use fallbacks? */
1204     *retval=(uint32_t)(value&0xff);
1205     if(value>=0xf00) {
1206         return 1;  /* roundtrip */
1207     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208         return -1;  /* fallback taken */
1209     } else {
1210         return 0;  /* no mapping */
1211     }
1212 }
1213 
/*
 * Accepts a 2-byte value only if each byte is in the range A1..FE
 * (strict EUC DBCS) and then moves it to the ISO 2022 range 21..7E by
 * subtracting 0x80 from each byte.
 * Returns 0 if either byte is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* distance of the byte pair from A1A1, and of the trail byte from A1 */
    const uint16_t pairDistance = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(value - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    if(trailDistance > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230 
1231 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code
 * point and returns the result if each byte then lies in A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const int pairInRange = (uint16_t)(shifted - 0xa1a1) <= (0xfefe - 0xa1a1);
    const int trailInRange = (uint8_t)(shifted - 0xa1) <= (0xfe - 0xa1);

    return (pairInRange && trailInRange) ? shifted : value;
}
1247 #endif
1248 
1249 #ifdef U_ENABLE_GENERIC_ISO_2022
1250 
1251 /**********************************************************************************
1252 *  ISO-2022 Converter
1253 *
1254 *
1255 */
1256 
1257 static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)1258 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259                                                            UErrorCode* err){
1260     const char* mySourceLimit, *realSourceLimit;
1261     const char* sourceStart;
1262     const char16_t* myTargetStart;
1263     UConverter* saveThis;
1264     UConverterDataISO2022* myData;
1265     int8_t length;
1266 
1267     saveThis = args->converter;
1268     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269 
1270     realSourceLimit = args->sourceLimit;
1271     while (args->source < realSourceLimit) {
1272         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275 
1276             if(args->source < mySourceLimit) {
1277                 if(myData->currentConverter==nullptr) {
1278                     myData->currentConverter = ucnv_open("ASCII",err);
1279                     if(U_FAILURE(*err)){
1280                         return;
1281                     }
1282 
1283                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284                     saveThis->mode = UCNV_SO;
1285                 }
1286 
1287                 /* convert to before the ESC or until the end of the buffer */
1288                 myData->isFirstBuffer=false;
1289                 sourceStart = args->source;
1290                 myTargetStart = args->target;
1291                 args->converter = myData->currentConverter;
1292                 ucnv_toUnicode(args->converter,
1293                     &args->target,
1294                     args->targetLimit,
1295                     &args->source,
1296                     mySourceLimit,
1297                     args->offsets,
1298                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299                     err);
1300                 args->converter = saveThis;
1301 
1302                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303                     /* move the overflow buffer */
1304                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305                     myData->currentConverter->UCharErrorBufferLength = 0;
1306                     if(length > 0) {
1307                         uprv_memcpy(saveThis->UCharErrorBuffer,
1308                                     myData->currentConverter->UCharErrorBuffer,
1309                                     length*U_SIZEOF_UCHAR);
1310                     }
1311                     return;
1312                 }
1313 
1314                 /*
1315                  * At least one of:
1316                  * -Error while converting
1317                  * -Done with entire buffer
1318                  * -Need to write offsets or update the current offset
1319                  *  (leave that up to the code in ucnv.c)
1320                  *
1321                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1322                  */
1323                 if (U_FAILURE(*err) ||
1324                     (args->source == realSourceLimit) ||
1325                     (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
1326                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327                 ) {
1328                     /* copy partial or error input for truncated detection and error handling */
1329                     if(U_FAILURE(*err)) {
1330                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331                         if(length > 0) {
1332                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333                         }
1334                     } else {
1335                         length = saveThis->toULength = myData->currentConverter->toULength;
1336                         if(length > 0) {
1337                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338                             if(args->source < mySourceLimit) {
1339                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340                             }
1341                         }
1342                     }
1343                     return;
1344                 }
1345             }
1346         }
1347 
1348         sourceStart = args->source;
1349         changeState_2022(args->converter,
1350                &(args->source),
1351                realSourceLimit,
1352                ISO_2022,
1353                err);
1354         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
1355             /* let the ucnv.c code update its current offset */
1356             return;
1357         }
1358     }
1359 }
1360 
1361 #endif
1362 
1363 /*
1364  * To Unicode Callback helper function
1365  */
1366 static void
toUnicodeCallback(UConverter * cnv,const uint32_t sourceChar,const uint32_t targetUniChar,UErrorCode * err)1367 toUnicodeCallback(UConverter *cnv,
1368                   const uint32_t sourceChar, const uint32_t targetUniChar,
1369                   UErrorCode* err){
1370     if(sourceChar>0xff){
1371         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372         cnv->toUBytes[1] = (uint8_t)sourceChar;
1373         cnv->toULength = 2;
1374     }
1375     else{
1376         cnv->toUBytes[0] =(char) sourceChar;
1377         cnv->toULength = 1;
1378     }
1379 
1380     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381         *err = U_INVALID_CHAR_FOUND;
1382     }
1383     else{
1384         *err = U_ILLEGAL_CHAR_FOUND;
1385     }
1386 }
1387 
1388 /**************************************ISO-2022-JP*************************************************/
1389 
1390 /************************************** IMPORTANT **************************************************
1391 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393 * The converter iterates over each Unicode codepoint
1394 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1396 * would do as far as possible.
1397 *
1398 * If the implementation of these macros or structure of sharedData struct change in the future, make
1399 * sure that ISO-2022 is also changed.
1400 ***************************************************************************************************
1401 */
1402 
1403 /***************************************************************************************************
1404 * Rules for ISO-2022-jp encoding
1405 * (i)   Escape sequences must be fully contained within a line; they should not
1406 *       span new lines or CRs
1407 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1408 *       JIS-Roman character escape sequence should follow before the line terminates
1409 * (iii) If the first character on the line is represented by two bytes then a two
1410 *       byte character escape sequence should precede it
1411 * (iv)  If no escape sequence is encountered then the characters are ASCII
1412 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413 *       and invoked with SS2 (ESC N).
1414 * (vi)  If there is any G0 designation in text, there must be a switch to
1415 *       ASCII or to JIS X 0201-Roman before a space character (but not
1416 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417 *       characters such as tab or CRLF.
1418 * (vii) Supported encodings:
1419 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420 *
1421 *  source : RFC-1554
1422 *
1423 *          JISX201, JISX208,JISX212 : new .cnv data files created
1424 *          KSC5601 : alias to ibm-949 mapping table
1425 *          GB2312 : alias to ibm-1386 mapping table
1426 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427 *          ISO-8859-7 : alias to ibm-9409 mapping table
1428 */
1429 
1430 /* preference order of JP charsets */
1431 static const StateEnum jpCharsetPref[]={
1432     ASCII,
1433     JISX201,
1434     ISO8859_1,
1435     JISX208,
1436     ISO8859_7,
1437     JISX212,
1438     GB2312,
1439     KSC5601,
1440     HWKANA_7BIT
1441 };
1442 
/*
 * Designation escape sequence for each charset.
 * The rows must be in the order of the charset enum constants
 * (like JISX201 = 3), not in the order of jpCharsetPref[]!
 * Bytes are spelled numerically; unused trailing bytes of a row are zero.
 * The number of bytes to emit per row is given by escSeqCharsLen[].
 */
static const char escSeqChars[][6] = {
    { 0x1B, 0x28, 0x42 },           /* ASCII        <ESC>(B  */
    { 0x1B, 0x2E, 0x41 },           /* ISO-8859-1   <ESC>.A  */
    { 0x1B, 0x2E, 0x46 },           /* ISO-8859-7   <ESC>.F  */
    { 0x1B, 0x28, 0x4A },           /* JISX-201     <ESC>(J  */
    { 0x1B, 0x24, 0x42 },           /* JISX-208     <ESC>$B  */
    { 0x1B, 0x24, 0x28, 0x44 },     /* JISX-212     <ESC>$(D */
    { 0x1B, 0x24, 0x41 },           /* GB2312       <ESC>$A  */
    { 0x1B, 0x24, 0x28, 0x43 },     /* KSC5601      <ESC>$(C */
    { 0x1B, 0x28, 0x49 }            /* HWKANA_7BIT  <ESC>(I  */
};
/*
 * Number of bytes in each escape sequence of escSeqChars[],
 * in the same row order.
 */
static const int8_t escSeqCharsLen[] = {
    /* ASCII       <ESC>(B  */ 3,
    /* ISO-8859-1  <ESC>.A  */ 3,
    /* ISO-8859-7  <ESC>.F  */ 3,
    /* JISX-201    <ESC>(J  */ 3,
    /* JISX-208    <ESC>$B  */ 3,
    /* JISX-212    <ESC>$(D */ 4,
    /* GB2312      <ESC>$A  */ 3,
    /* KSC5601     <ESC>$(C */ 4,
    /* HWKANA_7BIT <ESC>(I  */ 3
};
1470 
1471 /*
1472 * The iteration over various code pages works this way:
1473 * i)   Get the currentState from myConverterData->currentState
1474 * ii)  Check if the character is mapped to a valid character in the currentState
1475 *      Yes ->  a) set the initIterState to currentState
1476 *       b) remain in this state until an invalid character is found
1477 *      No  ->  a) go to the next code page and find the character
1478 * iii) Before changing the state, increment the current state and check if the current state
1479 *      is equal to the initIterState
1480 *      Yes ->  A character that cannot be represented in any of the supported encodings
1481 *       break and return a U_INVALID_CHAR_FOUND error
1482 *      No  ->  Continue and find the character in next code page
1483 *
1484 *
1485 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486 */
1487 
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the identity mapping:
 * 0x5c becomes U+00A5 and 0x7e becomes U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501 
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * Returns U+FFFE if the code point is unmappable: that is the case for
 * everything above 0x7f except U+00A5 and U+203E, and for 0x5c and 0x7e,
 * whose byte values are taken by those two characters.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value == 0xa5) {
        return 0x5c;
    }
    if(value == 0x203e) {
        return 0x7e;
    }
    if(value > 0x7f || value == 0x5c || value == 0x7e) {
        return 0xfffe;
    }
    return value;
}
1516 
1517 /*
1518  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1519  * Katakana.
1520  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1521  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1522  * These were the only fallbacks in ICU's jisx-208.ucm file.
1523  */
1524 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1525     0x2123,  /* U+FF61 */
1526     0x2156,
1527     0x2157,
1528     0x2122,
1529     0x2126,
1530     0x2572,
1531     0x2521,
1532     0x2523,
1533     0x2525,
1534     0x2527,
1535     0x2529,
1536     0x2563,
1537     0x2565,
1538     0x2567,
1539     0x2543,
1540     0x213C,  /* U+FF70 */
1541     0x2522,
1542     0x2524,
1543     0x2526,
1544     0x2528,
1545     0x252A,
1546     0x252B,
1547     0x252D,
1548     0x252F,
1549     0x2531,
1550     0x2533,
1551     0x2535,
1552     0x2537,
1553     0x2539,
1554     0x253B,
1555     0x253D,
1556     0x253F,  /* U+FF80 */
1557     0x2541,
1558     0x2544,
1559     0x2546,
1560     0x2548,
1561     0x254A,
1562     0x254B,
1563     0x254C,
1564     0x254D,
1565     0x254E,
1566     0x254F,
1567     0x2552,
1568     0x2555,
1569     0x2558,
1570     0x255B,
1571     0x255E,
1572     0x255F,  /* U+FF90 */
1573     0x2560,
1574     0x2561,
1575     0x2562,
1576     0x2564,
1577     0x2566,
1578     0x2568,
1579     0x2569,
1580     0x256A,
1581     0x256B,
1582     0x256C,
1583     0x256D,
1584     0x256F,
1585     0x2573,
1586     0x212B,
1587     0x212C   /* U+FF9F */
1588 };
1589 
/*
 * Converts UTF-16 text to ISO-2022-JP bytes, optionally recording offsets.
 *
 * Reads code units from args->source up to args->sourceLimit and writes bytes
 * to args->target up to args->targetLimit. When args->offsets is not null, a
 * source index is stored for each output byte. Before returning, the consumed
 * and produced positions are written back to args->source and args->target.
 *
 * For each code point the function tries the charsets listed in choices[]
 * (built on demand from the converter version, the current G0/G2 designations
 * and jpCharsetPref[]), preferring a roundtrip mapping over a fallback. It
 * then emits SI, a designation escape sequence and/or a shift sequence as
 * needed, followed by the one or two character bytes.
 *
 * Error codes set directly by this function:
 *   U_ILLEGAL_CHAR_FOUND     unmatched surrogate, or a character for which
 *                            IS_2022_CONTROL() is true; the code point is
 *                            stored in cnv->fromUChar32
 *   U_INVALID_CHAR_FOUND     no charset in choices[] maps the code point;
 *                            the code point is stored in cnv->fromUChar32
 *   U_BUFFER_OVERFLOW_ERROR  the target is full while source input remains
 * fromUWriteUInt8() may also set *err; its behavior is not visible here.
 *
 * A lead surrogate that is the last code unit of the input is stored in
 * cnv->fromUChar32 without an error; the next call continues with it.
 * When args->flush is set, all input was consumed, no code point is pending
 * and no error occurred, SI and/or the ASCII designation are appended so
 * that the output ends in ASCII mode.
 */
1590 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)1591 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1592     UConverter *cnv = args->converter;
1593     UConverterDataISO2022 *converterData;
1594     ISO2022State *pFromU2022State;
1595     uint8_t *target = (uint8_t *) args->target;
1596     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1597     const char16_t* source = args->source;
1598     const char16_t* sourceLimit = args->sourceLimit;
1599     int32_t* offsets = args->offsets;
1600     UChar32 sourceChar;
         /* staging area for the SI/escape/shift bytes and character bytes of one code point */
1601     char buffer[8];
1602     int32_t len, outLen;
         /* charsets to try, in order; rebuilt whenever choiceCount is 0 */
1603     int8_t choices[10];
1604     int32_t choiceCount;
1605     uint32_t targetValue = 0;
1606     UBool useFallback;
1607
1608     int32_t i;
1609     int8_t cs, g;
1610
1611     /* set up the state */
1612     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1613     pFromU2022State   = &converterData->fromU2022State;
1614
1615     choiceCount = 0;
1616
1617     /* check if the last codepoint of previous buffer was a lead surrogate*/
1618     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1619         goto getTrail;
1620     }
1621
1622     while(source < sourceLimit) {
1623         if(target < targetLimit) {
1624
1625             sourceChar  = *(source++);
1626             /*check if the char is a First surrogate*/
1627             if(U16_IS_SURROGATE(sourceChar)) {
1628                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1629 getTrail:
1630                     /*look ahead to find the trail surrogate*/
1631                     if(source < sourceLimit) {
1632                         /* test the following code unit */
1633                         char16_t trail=(char16_t) *source;
1634                         if(U16_IS_TRAIL(trail)) {
1635                             source++;
1636                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1637                             cnv->fromUChar32=0x00;
1638                             /* convert this supplementary code point */
1639                             /* exit this condition tree */
1640                         } else {
1641                             /* this is an unmatched lead code unit (1st surrogate) */
1642                             /* callback(illegal) */
1643                             *err=U_ILLEGAL_CHAR_FOUND;
1644                             cnv->fromUChar32=sourceChar;
1645                             break;
1646                         }
1647                     } else {
1648                         /* no more input */
1649                         cnv->fromUChar32=sourceChar;
1650                         break;
1651                     }
1652                 } else {
1653                     /* this is an unmatched trail code unit (2nd surrogate) */
1654                     /* callback(illegal) */
1655                     *err=U_ILLEGAL_CHAR_FOUND;
1656                     cnv->fromUChar32=sourceChar;
1657                     break;
1658                 }
1659             }
1660
1661             /* do not convert SO/SI/ESC */
1662             if(IS_2022_CONTROL(sourceChar)) {
1663                 /* callback(illegal) */
1664                 *err=U_ILLEGAL_CHAR_FOUND;
1665                 cnv->fromUChar32=sourceChar;
1666                 break;
1667             }
1668
1669             /* do the conversion */
1670
                 /* (re)build the list of charsets to try; it stays valid until it is invalidated below */
1671             if(choiceCount == 0) {
1672                 uint16_t csm;
1673
1674                 /*
1675                  * The csm variable keeps track of which charsets are allowed
1676                  * and not used yet while building the choices[].
1677                  */
1678                 csm = jpCharsetMasks[converterData->version];
1679                 choiceCount = 0;
1680
1681                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1682                 if(converterData->version == 3 || converterData->version == 4) {
1683                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1684                 }
1685                 /* Do not try single-byte half-width Katakana for other versions. */
1686                 csm &= ~CSM(HWKANA_7BIT);
1687
1688                 /* try the current G0 charset */
1689                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1690                 csm &= ~CSM(cs);
1691
1692                 /* try the current G2 charset */
1693                 if((cs = pFromU2022State->cs[2]) != 0) {
1694                     choices[choiceCount++] = cs;
1695                     csm &= ~CSM(cs);
1696                 }
1697
1698                 /* try all the other possible charsets */
1699                 for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1700                     cs = (int8_t)jpCharsetPref[i];
1701                     if(CSM(cs) & csm) {
1702                         choices[choiceCount++] = cs;
1703                         csm &= ~CSM(cs);
1704                     }
1705                 }
1706             }
1707
1708             cs = g = 0;
1709             /*
1710              * len==0: no mapping found yet
1711              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1712              * len>0: found a roundtrip result, done
1713              */
1714             len = 0;
1715             /*
1716              * We will turn off useFallback after finding a fallback,
1717              * but we still get fallbacks from PUA code points as usual.
1718              * Therefore, we will also need to check that we don't overwrite
1719              * an early fallback with a later one.
1720              */
1721             useFallback = cnv->useFallback;
1722
                 /* the loop ends at the first roundtrip (len>0); after a fallback (len<0) it keeps searching */
1723             for(i = 0; i < choiceCount && len <= 0; ++i) {
1724                 uint32_t value;
1725                 int32_t len2;
1726                 int8_t cs0 = choices[i];
1727                 switch(cs0) {
1728                 case ASCII:
1729                     if(sourceChar <= 0x7f) {
1730                         targetValue = (uint32_t)sourceChar;
1731                         len = 1;
1732                         cs = cs0;
1733                         g = 0;
1734                     }
1735                     break;
1736                 case ISO8859_1:
1737                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1738                         targetValue = (uint32_t)sourceChar - 0x80;
1739                         len = 1;
1740                         cs = cs0;
1741                         g = 2;
1742                     }
1743                     break;
1744                 case HWKANA_7BIT:
1745                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1746                         if(converterData->version==3) {
1747                             /* JIS7: use G1 (SO) */
1748                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1749                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1750                             len = 1;
1751                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1752                             g = 1;
1753                         } else if(converterData->version==4) {
1754                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1755                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1756                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1757                             len = 1;
1758
1759                             cs = pFromU2022State->cs[0];
1760                             if(IS_JP_DBCS(cs)) {
1761                                 /* switch from a DBCS charset to JISX201 */
1762                                 cs = (int8_t)JISX201;
1763                             }
1764                             /* else stay in the current G0 charset */
1765                             g = 0;
1766                         }
1767                         /* else do not use HWKANA_7BIT with other versions */
1768                     }
1769                     break;
1770                 case JISX201:
1771                     /* G0 SBCS */
1772                     value = jisx201FromU(sourceChar);
1773                     if(value <= 0x7f) {
1774                         targetValue = value;
1775                         len = 1;
1776                         cs = cs0;
1777                         g = 0;
1778                         useFallback = false;
1779                     }
1780                     break;
1781                 case JISX208:
1782                     /* G0 DBCS from Shift-JIS table */
1783                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1784                                 converterData->myConverterArray[cs0],
1785                                 sourceChar, &value,
1786                                 useFallback, MBCS_OUTPUT_2);
1787                     // Only accept DBCS char (abs(len2) == 2).
1788                     // With EUC-JP table for JIS X 208, half-width Kana
1789                     // represented with DBCS starting with 0x8E has to be
1790                     // filtered out so that they can be converted with
1791                     // hwkana_fb table.
1792                     if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) {
                             /* clear the high bit of both bytes to get 7-bit output */
1793                         value &= 0x7F7F;
1794                         if(value != 0) {
1795                             targetValue = value;
1796                             len = len2;
1797                             cs = cs0;
1798                             g = 0;
1799                             useFallback = false;
1800                         }
1801                     } else if(len == 0 && useFallback &&
1802                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1803                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1804                         len = -2;
1805                         cs = cs0;
1806                         g = 0;
1807                         useFallback = false;
1808                     }
1809                     break;
1810                 case ISO8859_7:
1811                     /* G0 SBCS forced to 7-bit output */
1812                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1813                                 converterData->myConverterArray[cs0],
1814                                 sourceChar, &value,
1815                                 useFallback);
1816                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1817                         targetValue = value - 0x80;
1818                         len = len2;
1819                         cs = cs0;
1820                         g = 2;
1821                         useFallback = false;
1822                     }
1823                     break;
1824                 default:
1825                     /* G0 DBCS */
1826                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1827                                 converterData->myConverterArray[cs0],
1828                                 sourceChar, &value,
1829                                 useFallback, MBCS_OUTPUT_2);
1830                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1831                         if(cs0 == KSC5601) {
1832                             /*
1833                              * Check for valid bytes for the encoding scheme.
1834                              * This is necessary because the sub-converter (windows-949)
1835                              * has a broader encoding scheme than is valid for 2022.
1836                              */
1837                             value = _2022FromGR94DBCS(value);
1838                             if(value == 0) {
1839                                 break;
1840                             }
1841                         }
1842                         targetValue = value;
1843                         len = len2;
1844                         cs = cs0;
1845                         g = 0;
1846                         useFallback = false;
1847                     }
1848                     break;
1849                 }
1850             }
1851
1852             if(len != 0) {
1853                 if(len < 0) {
1854                     len = -len;  /* fallback */
1855                 }
1856                 outLen = 0; /* count output bytes */
1857
1858                 /* write SI if necessary (only for JIS7) */
1859                 if(pFromU2022State->g == 1 && g == 0) {
1860                     buffer[outLen++] = UCNV_SI;
1861                     pFromU2022State->g = 0;
1862                 }
1863
1864                 /* write the designation sequence if necessary */
1865                 if(cs != pFromU2022State->cs[g]) {
1866                     int32_t escLen = escSeqCharsLen[cs];
1867                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1868                     outLen += escLen;
1869                     pFromU2022State->cs[g] = cs;
1870
1871                     /* invalidate the choices[] */
1872                     choiceCount = 0;
1873                 }
1874
1875                 /* write the shift sequence if necessary */
1876                 if(g != pFromU2022State->g) {
1877                     switch(g) {
1878                     /* case 0 handled before writing escapes */
1879                     case 1:
1880                         buffer[outLen++] = UCNV_SO;
1881                         pFromU2022State->g = 1;
1882                         break;
1883                     default: /* case 2 */
                             /* bytes 1B 4E (ESC N); pFromU2022State->g is not changed in this branch */
1884                         buffer[outLen++] = 0x1b;
1885                         buffer[outLen++] = 0x4e;
1886                         break;
1887                     /* no case 3: no SS3 in ISO-2022-JP-x */
1888                     }
1889                 }
1890
1891                 /* write the output bytes */
1892                 if(len == 1) {
1893                     buffer[outLen++] = (char)targetValue;
1894                 } else /* len == 2 */ {
1895                     buffer[outLen++] = (char)(targetValue >> 8);
1896                     buffer[outLen++] = (char)targetValue;
1897                 }
1898             } else {
1899                 /*
1900                  * if we cannot find the character after checking all codepages
1901                  * then this is an error
1902                  */
1903                 *err = U_INVALID_CHAR_FOUND;
1904                 cnv->fromUChar32=sourceChar;
1905                 break;
1906             }
1907
1908             if(sourceChar == CR || sourceChar == LF) {
1909                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1910                 pFromU2022State->cs[2] = 0;
1911                 choiceCount = 0;
1912             }
1913
1914             /* output outLen>0 bytes in buffer[] */
                 /* one byte always fits here because target < targetLimit was checked above */
1915             if(outLen == 1) {
1916                 *target++ = buffer[0];
1917                 if(offsets) {
1918                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1919                 }
1920             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1921                 *target++ = buffer[0];
1922                 *target++ = buffer[1];
1923                 if(offsets) {
1924                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1925                     *offsets++ = sourceIndex;
1926                     *offsets++ = sourceIndex;
1927                 }
1928             } else {
                     /* all other cases (longer output, or two bytes that do not fit) go through the helper */
1929                 fromUWriteUInt8(
1930                     cnv,
1931                     buffer, outLen,
1932                     &target, (const char *)targetLimit,
1933                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1934                     err);
1935                 if(U_FAILURE(*err)) {
1936                     break;
1937                 }
1938             }
1939         } /* end if(target < targetLimit) */
1940         else{
1941             *err =U_BUFFER_OVERFLOW_ERROR;
1942             break;
1943         }
1944
1945     }/* end while(source < sourceLimit) */
1946
1947     /*
1948      * the end of the input stream and detection of truncated input
1949      * are handled by the framework, but for ISO-2022-JP conversion
1950      * we need to be in ASCII mode at the very end
1951      *
1952      * conditions:
1953      *   successful
1954      *   in SO mode or not in ASCII mode
1955      *   end of input and no truncated input
1956      */
1957     if( U_SUCCESS(*err) &&
1958         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1959         args->flush && source>=sourceLimit && cnv->fromUChar32==0
1960     ) {
1961         int32_t sourceIndex;
1962
1963         outLen = 0;
1964
1965         if(pFromU2022State->g != 0) {
1966             buffer[outLen++] = UCNV_SI;
1967             pFromU2022State->g = 0;
1968         }
1969
1970         if(pFromU2022State->cs[0] != ASCII) {
1971             int32_t escLen = escSeqCharsLen[ASCII];
1972             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1973             outLen += escLen;
1974             pFromU2022State->cs[0] = (int8_t)ASCII;
1975         }
1976
1977         /* get the source index of the last input character */
1978         /*
1979          * TODO this would be simpler and more reliable if we used a pair
1980          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1981          * so that we could simply use the prevSourceIndex here;
1982          * this code gives an incorrect result for the rare case of an unmatched
1983          * trail surrogate that is alone in the last buffer of the text stream
1984          */
1985         sourceIndex=(int32_t)(source-args->source);
1986         if(sourceIndex>0) {
1987             --sourceIndex;
1988             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1989                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1990             ) {
1991                 --sourceIndex;
1992             }
1993         } else {
1994             sourceIndex=-1;
1995         }
1996
1997         fromUWriteUInt8(
1998             cnv,
1999             buffer, outLen,
2000             &target, (const char *)targetLimit,
2001             &offsets, sourceIndex,
2002             err);
2003     }
2004
2005     /*save the state and return */
2006     args->source = source;
2007     args->target = (char*)target;
2008 }
2009 
2010 /*************** to unicode *******************/
2011 
2012 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2013 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2014                                                UErrorCode* err){
2015     char tempBuf[2];
2016     const char *mySource = (char *) args->source;
2017     char16_t *myTarget = args->target;
2018     const char *mySourceLimit = args->sourceLimit;
2019     uint32_t targetUniChar = 0x0000;
2020     uint32_t mySourceChar = 0x0000;
2021     uint32_t tmpSourceChar = 0x0000;
2022     UConverterDataISO2022* myData;
2023     ISO2022State *pToU2022State;
2024     StateEnum cs;
2025 
2026     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2027     pToU2022State = &myData->toU2022State;
2028 
2029     if(myData->key != 0) {
2030         /* continue with a partial escape sequence */
2031         goto escape;
2032     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2033         /* continue with a partial double-byte character */
2034         mySourceChar = args->converter->toUBytes[0];
2035         args->converter->toULength = 0;
2036         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2037         targetUniChar = missingCharMarker;
2038         goto getTrailByte;
2039     }
2040 
2041     while(mySource < mySourceLimit){
2042 
2043         targetUniChar =missingCharMarker;
2044 
2045         if(myTarget < args->targetLimit){
2046 
2047             mySourceChar= (unsigned char) *mySource++;
2048 
2049             switch(mySourceChar) {
2050             case UCNV_SI:
2051                 if(myData->version==3) {
2052                     pToU2022State->g=0;
2053                     continue;
2054                 } else {
2055                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2056                     myData->isEmptySegment = false;	/* reset this, we have a different error */
2057                     break;
2058                 }
2059 
2060             case UCNV_SO:
2061                 if(myData->version==3) {
2062                     /* JIS7: switch to G1 half-width Katakana */
2063                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2064                     pToU2022State->g=1;
2065                     continue;
2066                 } else {
2067                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2068                     myData->isEmptySegment = false;	/* reset this, we have a different error */
2069                     break;
2070                 }
2071 
2072             case ESC_2022:
2073                 mySource--;
2074 escape:
2075                 {
2076                     const char * mySourceBefore = mySource;
2077                     int8_t toULengthBefore = args->converter->toULength;
2078 
2079                     changeState_2022(args->converter,&(mySource),
2080                         mySourceLimit, ISO_2022_JP,err);
2081 
2082                     /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2083                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2084                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2085                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2086                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2087                     }
2088                 }
2089 
2090                 /* invalid or illegal escape sequence */
2091                 if(U_FAILURE(*err)){
2092                     args->target = myTarget;
2093                     args->source = mySource;
2094                     myData->isEmptySegment = false;	/* Reset to avoid future spurious errors */
2095                     return;
2096                 }
2097                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2098                 if(myData->key==0) {
2099                     myData->isEmptySegment = true;
2100                 }
2101                 continue;
2102 
2103             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2104 
2105             case CR:
2106             case LF:
2107                 /* automatically reset to single-byte mode */
2108                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2109                     pToU2022State->cs[0] = (int8_t)ASCII;
2110                 }
2111                 pToU2022State->cs[2] = 0;
2112                 pToU2022State->g = 0;
2113                 U_FALLTHROUGH;
2114             default:
2115                 /* convert one or two bytes */
2116                 myData->isEmptySegment = false;
2117                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2118                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2119                     !IS_JP_DBCS(cs)
2120                 ) {
2121                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2122                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2123 
2124                     /* return from a single-shift state to the previous one */
2125                     if(pToU2022State->g >= 2) {
2126                         pToU2022State->g=pToU2022State->prevG;
2127                     }
2128                 } else switch(cs) {
2129                 case ASCII:
2130                     if(mySourceChar <= 0x7f) {
2131                         targetUniChar = mySourceChar;
2132                     }
2133                     break;
2134                 case ISO8859_1:
2135                     if(mySourceChar <= 0x7f) {
2136                         targetUniChar = mySourceChar + 0x80;
2137                     }
2138                     /* return from a single-shift state to the previous one */
2139                     pToU2022State->g=pToU2022State->prevG;
2140                     break;
2141                 case ISO8859_7:
2142                     if(mySourceChar <= 0x7f) {
2143                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2144                         targetUniChar =
2145                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2146                                 myData->myConverterArray[cs],
2147                                 mySourceChar + 0x80);
2148                     }
2149                     /* return from a single-shift state to the previous one */
2150                     pToU2022State->g=pToU2022State->prevG;
2151                     break;
2152                 case JISX201:
2153                     if(mySourceChar <= 0x7f) {
2154                         targetUniChar = jisx201ToU(mySourceChar);
2155                     }
2156                     break;
2157                 case HWKANA_7BIT:
2158                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2159                         /* 7-bit halfwidth Katakana */
2160                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2161                     }
2162                     break;
2163                 default:
2164                     /* G0 DBCS */
2165                     if(mySource < mySourceLimit) {
2166                         int leadIsOk, trailIsOk;
2167                         uint8_t trailByte;
2168 getTrailByte:
2169                         trailByte = (uint8_t)*mySource;
2170                         /*
2171                          * Ticket 5691: consistent illegal sequences:
2172                          * - We include at least the first byte in the illegal sequence.
2173                          * - If any of the non-initial bytes could be the start of a character,
2174                          *   we stop the illegal sequence before the first one of those.
2175                          *
2176                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2177                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2178                          * Otherwise we convert or report the pair of bytes.
2179                          */
2180                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2181                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2182                         if (leadIsOk && trailIsOk) {
2183                             ++mySource;
2184                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2185                             /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2186                             mySourceChar = tmpSourceChar;
2187                             if (cs == JISX208 || cs == KSC5601) {
2188                                 tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2189                             }
2190                             tempBuf[0] = (char)(tmpSourceChar >> 8);
2191                             tempBuf[1] = (char)(tmpSourceChar);
2192                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
2193                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2194                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2195                             ++mySource;
2196                             /* add another bit so that the code below writes 2 bytes in case of error */
2197                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2198                         }
2199                     } else {
2200                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2201                         args->converter->toULength = 1;
2202                         goto endloop;
2203                     }
2204                 }  /* End of inner switch */
2205                 break;
2206             }  /* End of outer switch */
2207             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2208                 if(args->offsets){
2209                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2210                 }
2211                 *(myTarget++)=(char16_t)targetUniChar;
2212             }
2213             else if(targetUniChar > missingCharMarker){
2214                 /* disassemble the surrogate pair and write to output*/
2215                 targetUniChar-=0x0010000;
2216                 *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
2217                 if(args->offsets){
2218                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2219                 }
2220                 ++myTarget;
2221                 if(myTarget< args->targetLimit){
2222                     *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
2223                     if(args->offsets){
2224                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2225                     }
2226                     ++myTarget;
2227                 }else{
2228                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2229                                     (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
2230                 }
2231 
2232             }
2233             else{
2234                 /* Call the callback function*/
2235                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2236                 break;
2237             }
2238         }
2239         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2240             *err =U_BUFFER_OVERFLOW_ERROR;
2241             break;
2242         }
2243     }
2244 endloop:
2245     args->target = myTarget;
2246     args->source = mySource;
2247 }
2248 
2249 
2250 #if !UCONFIG_ONLY_HTML_CONVERSION
2251 /***************************************************************
2252 *   Rules for ISO-2022-KR encoding
2253 *   i) The KSC5601 designator sequence should appear only once in a file,
2254 *      at the beginning of a line before any KSC5601 characters. This usually
2255 *      means that it appears by itself on the first line of the file
2256 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2257 *      and SI to shift into single byte mode
2258 */
2259 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs * args,UErrorCode * err)2260 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2261 
2262     UConverter* saveConv = args->converter;
2263     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2264     args->converter=myConverterData->currentConverter;
2265 
2266     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2267     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2268     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2269 
2270     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2271         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2272             uprv_memcpy(
2273                 saveConv->charErrorBuffer,
2274                 myConverterData->currentConverter->charErrorBuffer,
2275                 myConverterData->currentConverter->charErrorBufferLength);
2276         }
2277         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2278         myConverterData->currentConverter->charErrorBufferLength = 0;
2279     }
2280     args->converter=saveConv;
2281 }
2282 
2283 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2284 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2285 
2286     const char16_t *source = args->source;
2287     const char16_t *sourceLimit = args->sourceLimit;
2288     unsigned char *target = (unsigned char *) args->target;
2289     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2290     int32_t* offsets = args->offsets;
2291     uint32_t targetByteUnit = 0x0000;
2292     UChar32 sourceChar = 0x0000;
2293     UBool isTargetByteDBCS;
2294     UBool oldIsTargetByteDBCS;
2295     UConverterDataISO2022 *converterData;
2296     UConverterSharedData* sharedData;
2297     UBool useFallback;
2298     int32_t length =0;
2299 
2300     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2301     /* if the version is 1 then the user is requesting
2302      * conversion with ibm-25546 pass the arguments to
2303      * MBCS converter and return
2304      */
2305     if(converterData->version==1){
2306         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2307         return;
2308     }
2309 
2310     /* initialize data */
2311     sharedData = converterData->currentConverter->sharedData;
2312     useFallback = args->converter->useFallback;
2313     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2314     oldIsTargetByteDBCS = isTargetByteDBCS;
2315 
2316     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2317     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2318         goto getTrail;
2319     }
2320     while(source < sourceLimit){
2321 
2322         targetByteUnit = missingCharMarker;
2323 
2324         if(target < (unsigned char*) args->targetLimit){
2325             sourceChar = *source++;
2326 
2327             /* do not convert SO/SI/ESC */
2328             if(IS_2022_CONTROL(sourceChar)) {
2329                 /* callback(illegal) */
2330                 *err=U_ILLEGAL_CHAR_FOUND;
2331                 args->converter->fromUChar32=sourceChar;
2332                 break;
2333             }
2334 
2335             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2336             if(length < 0) {
2337                 length = -length;  /* fallback */
2338             }
2339             /* only DBCS or SBCS characters are expected*/
2340             /* DB characters with high bit set to 1 are expected */
2341             if( length > 2 || length==0 ||
2342                 (length == 1 && targetByteUnit > 0x7f) ||
2343                 (length == 2 &&
2344                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2345                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2346             ) {
2347                 targetByteUnit=missingCharMarker;
2348             }
2349             if (targetByteUnit != missingCharMarker){
2350 
2351                 oldIsTargetByteDBCS = isTargetByteDBCS;
2352                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2353                   /* append the shift sequence */
2354                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2355 
2356                     if (isTargetByteDBCS)
2357                         *target++ = UCNV_SO;
2358                     else
2359                         *target++ = UCNV_SI;
2360                     if(offsets)
2361                         *(offsets++) = (int32_t)(source - args->source-1);
2362                 }
2363                 /* write the targetUniChar  to target */
2364                 if(targetByteUnit <= 0x00FF){
2365                     if( target < targetLimit){
2366                         *(target++) = (unsigned char) targetByteUnit;
2367                         if(offsets){
2368                             *(offsets++) = (int32_t)(source - args->source-1);
2369                         }
2370 
2371                     }else{
2372                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2373                         *err = U_BUFFER_OVERFLOW_ERROR;
2374                     }
2375                 }else{
2376                     if(target < targetLimit){
2377                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2378                         if(offsets){
2379                             *(offsets++) = (int32_t)(source - args->source-1);
2380                         }
2381                         if(target < targetLimit){
2382                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2383                             if(offsets){
2384                                 *(offsets++) = (int32_t)(source - args->source-1);
2385                             }
2386                         }else{
2387                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2388                             *err = U_BUFFER_OVERFLOW_ERROR;
2389                         }
2390                     }else{
2391                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2392                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2393                         *err = U_BUFFER_OVERFLOW_ERROR;
2394                     }
2395                 }
2396 
2397             }
2398             else{
2399                 /* oops.. the code point is unassingned
2400                  * set the error and reason
2401                  */
2402 
2403                 /*check if the char is a First surrogate*/
2404                 if(U16_IS_SURROGATE(sourceChar)) {
2405                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2406 getTrail:
2407                         /*look ahead to find the trail surrogate*/
2408                         if(source <  sourceLimit) {
2409                             /* test the following code unit */
2410                             char16_t trail=(char16_t) *source;
2411                             if(U16_IS_TRAIL(trail)) {
2412                                 source++;
2413                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2414                                 *err = U_INVALID_CHAR_FOUND;
2415                                 /* convert this surrogate code point */
2416                                 /* exit this condition tree */
2417                             } else {
2418                                 /* this is an unmatched lead code unit (1st surrogate) */
2419                                 /* callback(illegal) */
2420                                 *err=U_ILLEGAL_CHAR_FOUND;
2421                             }
2422                         } else {
2423                             /* no more input */
2424                             *err = U_ZERO_ERROR;
2425                         }
2426                     } else {
2427                         /* this is an unmatched trail code unit (2nd surrogate) */
2428                         /* callback(illegal) */
2429                         *err=U_ILLEGAL_CHAR_FOUND;
2430                     }
2431                 } else {
2432                     /* callback(unassigned) for a BMP code point */
2433                     *err = U_INVALID_CHAR_FOUND;
2434                 }
2435 
2436                 args->converter->fromUChar32=sourceChar;
2437                 break;
2438             }
2439         } /* end if(myTargetIndex<myTargetLength) */
2440         else{
2441             *err =U_BUFFER_OVERFLOW_ERROR;
2442             break;
2443         }
2444 
2445     }/* end while(mySourceIndex<mySourceLength) */
2446 
2447     /*
2448      * the end of the input stream and detection of truncated input
2449      * are handled by the framework, but for ISO-2022-KR conversion
2450      * we need to be in ASCII mode at the very end
2451      *
2452      * conditions:
2453      *   successful
2454      *   not in ASCII mode
2455      *   end of input and no truncated input
2456      */
2457     if( U_SUCCESS(*err) &&
2458         isTargetByteDBCS &&
2459         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2460     ) {
2461         int32_t sourceIndex;
2462 
2463         /* we are switching to ASCII */
2464         isTargetByteDBCS=false;
2465 
2466         /* get the source index of the last input character */
2467         /*
2468          * TODO this would be simpler and more reliable if we used a pair
2469          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2470          * so that we could simply use the prevSourceIndex here;
2471          * this code gives an incorrect result for the rare case of an unmatched
2472          * trail surrogate that is alone in the last buffer of the text stream
2473          */
2474         sourceIndex=(int32_t)(source-args->source);
2475         if(sourceIndex>0) {
2476             --sourceIndex;
2477             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2478                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2479             ) {
2480                 --sourceIndex;
2481             }
2482         } else {
2483             sourceIndex=-1;
2484         }
2485 
2486         fromUWriteUInt8(
2487             args->converter,
2488             SHIFT_IN_STR, 1,
2489             &target, (const char *)targetLimit,
2490             &offsets, sourceIndex,
2491             err);
2492     }
2493 
2494     /*save the state and return */
2495     args->source = source;
2496     args->target = (char*)target;
2497     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2498 }
2499 
2500 /************************ To Unicode ***************************************/
2501 
2502 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs * args,UErrorCode * err)2503 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2504                                                             UErrorCode* err){
2505     char const* sourceStart;
2506     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2507 
2508     UConverterToUnicodeArgs subArgs;
2509     int32_t minArgsSize;
2510 
2511     /* set up the subconverter arguments */
2512     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2513         minArgsSize = args->size;
2514     } else {
2515         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2516     }
2517 
2518     uprv_memcpy(&subArgs, args, minArgsSize);
2519     subArgs.size = (uint16_t)minArgsSize;
2520     subArgs.converter = myData->currentConverter;
2521 
2522     /* remember the original start of the input for offsets */
2523     sourceStart = args->source;
2524 
2525     if(myData->key != 0) {
2526         /* continue with a partial escape sequence */
2527         goto escape;
2528     }
2529 
2530     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2531         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2532         subArgs.source = args->source;
2533         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2534         if(subArgs.source != subArgs.sourceLimit) {
2535             /*
2536              * get the current partial byte sequence
2537              *
2538              * it needs to be moved between the public and the subconverter
2539              * so that the conversion framework, which only sees the public
2540              * converter, can handle truncated and illegal input etc.
2541              */
2542             if(args->converter->toULength > 0) {
2543                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2544             }
2545             subArgs.converter->toULength = args->converter->toULength;
2546 
2547             /*
2548              * Convert up to the end of the input, or to before the next escape character.
2549              * Does not handle conversion extensions because the preToU[] state etc.
2550              * is not copied.
2551              */
2552             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2553 
2554             if(args->offsets != nullptr && sourceStart != args->source) {
2555                 /* update offsets to base them on the actual start of the input */
2556                 int32_t *offsets = args->offsets;
2557                 char16_t *target = args->target;
2558                 int32_t delta = (int32_t)(args->source - sourceStart);
2559                 while(target < subArgs.target) {
2560                     if(*offsets >= 0) {
2561                         *offsets += delta;
2562                     }
2563                     ++offsets;
2564                     ++target;
2565                 }
2566             }
2567             args->source = subArgs.source;
2568             args->target = subArgs.target;
2569             args->offsets = subArgs.offsets;
2570 
2571             /* copy input/error/overflow buffers */
2572             if(subArgs.converter->toULength > 0) {
2573                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2574             }
2575             args->converter->toULength = subArgs.converter->toULength;
2576 
2577             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2578                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2579                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2580                                 subArgs.converter->UCharErrorBufferLength);
2581                 }
2582                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2583                 subArgs.converter->UCharErrorBufferLength = 0;
2584             }
2585         }
2586 
2587         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2588             return;
2589         }
2590 
2591 escape:
2592         changeState_2022(args->converter,
2593                &(args->source),
2594                args->sourceLimit,
2595                ISO_2022_KR,
2596                err);
2597     }
2598 }
2599 
2600 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)2601 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2602                                                             UErrorCode* err){
2603     char tempBuf[2];
2604     const char *mySource = ( char *) args->source;
2605     char16_t *myTarget = args->target;
2606     const char *mySourceLimit = args->sourceLimit;
2607     UChar32 targetUniChar = 0x0000;
2608     char16_t mySourceChar = 0x0000;
2609     UConverterDataISO2022* myData;
2610     UConverterSharedData* sharedData ;
2611     UBool useFallback;
2612 
2613     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2614     if(myData->version==1){
2615         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2616         return;
2617     }
2618 
2619     /* initialize state */
2620     sharedData = myData->currentConverter->sharedData;
2621     useFallback = args->converter->useFallback;
2622 
2623     if(myData->key != 0) {
2624         /* continue with a partial escape sequence */
2625         goto escape;
2626     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2627         /* continue with a partial double-byte character */
2628         mySourceChar = args->converter->toUBytes[0];
2629         args->converter->toULength = 0;
2630         goto getTrailByte;
2631     }
2632 
2633     while(mySource< mySourceLimit){
2634 
2635         if(myTarget < args->targetLimit){
2636 
2637             mySourceChar= (unsigned char) *mySource++;
2638 
2639             if(mySourceChar==UCNV_SI){
2640                 myData->toU2022State.g = 0;
2641                 if (myData->isEmptySegment) {
2642                     myData->isEmptySegment = false;	/* we are handling it, reset to avoid future spurious errors */
2643                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2644                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2645                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2646                     args->converter->toULength = 1;
2647                     args->target = myTarget;
2648                     args->source = mySource;
2649                     return;
2650                 }
2651                 /*consume the source */
2652                 continue;
2653             }else if(mySourceChar==UCNV_SO){
2654                 myData->toU2022State.g = 1;
2655                 myData->isEmptySegment = true;	/* Begin a new segment, empty so far */
2656                 /*consume the source */
2657                 continue;
2658             }else if(mySourceChar==ESC_2022){
2659                 mySource--;
2660 escape:
2661                 myData->isEmptySegment = false;	/* Any invalid ESC sequences will be detected separately, so just reset this */
2662                 changeState_2022(args->converter,&(mySource),
2663                                 mySourceLimit, ISO_2022_KR, err);
2664                 if(U_FAILURE(*err)){
2665                     args->target = myTarget;
2666                     args->source = mySource;
2667                     return;
2668                 }
2669                 continue;
2670             }
2671 
2672             myData->isEmptySegment = false;	/* Any invalid char errors will be detected separately, so just reset this */
2673             if(myData->toU2022State.g == 1) {
2674                 if(mySource < mySourceLimit) {
2675                     int leadIsOk, trailIsOk;
2676                     uint8_t trailByte;
2677 getTrailByte:
2678                     targetUniChar = missingCharMarker;
2679                     trailByte = (uint8_t)*mySource;
2680                     /*
2681                      * Ticket 5691: consistent illegal sequences:
2682                      * - We include at least the first byte in the illegal sequence.
2683                      * - If any of the non-initial bytes could be the start of a character,
2684                      *   we stop the illegal sequence before the first one of those.
2685                      *
2686                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2687                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2688                      * Otherwise we convert or report the pair of bytes.
2689                      */
2690                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2691                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2692                     if (leadIsOk && trailIsOk) {
2693                         ++mySource;
2694                         tempBuf[0] = (char)(mySourceChar + 0x80);
2695                         tempBuf[1] = (char)(trailByte + 0x80);
2696                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2697                         mySourceChar = (mySourceChar << 8) | trailByte;
2698                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2699                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2700                         ++mySource;
2701                         /* add another bit so that the code below writes 2 bytes in case of error */
2702                         mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
2703                     }
2704                 } else {
2705                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2706                     args->converter->toULength = 1;
2707                     break;
2708                 }
2709             }
2710             else if(mySourceChar <= 0x7f) {
2711                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2712             } else {
2713                 targetUniChar = 0xffff;
2714             }
2715             if(targetUniChar < 0xfffe){
2716                 if(args->offsets) {
2717                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2718                 }
2719                 *(myTarget++)=(char16_t)targetUniChar;
2720             }
2721             else {
2722                 /* Call the callback function*/
2723                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2724                 break;
2725             }
2726         }
2727         else{
2728             *err =U_BUFFER_OVERFLOW_ERROR;
2729             break;
2730         }
2731     }
2732     args->target = myTarget;
2733     args->source = mySource;
2734 }
2735 
2736 /*************************** END ISO2022-KR *********************************/
2737 
2738 /*************************** ISO-2022-CN *********************************
2739 *
2740 * Rules for ISO-2022-CN Encoding:
2741 * i)   The designator sequence must appear once on a line before any instance
2742 *      of character set it designates.
2743 * ii)  If two lines contain characters from the same character set, both lines
2744 *      must include the designator sequence.
2745 * iii) Once the designator sequence is known, a shifting sequence has to be found
2746 *      to invoke the  shifting
2747 * iv)  All lines start in ASCII and end in ASCII.
2748 * v)   Four shifting sequences are employed for this purpose:
2749 *
2750 *      Sequence    ASCII Eq    Charsets
2751 *      ----------  -------    ---------
2752 *      SI           <SI>        US-ASCII
2753 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2754 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2755 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2756 *
2757 * vi)
2758 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2759 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2760 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2761 *
2762 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2763 *       characters as defined in GB 2312-80, until
2764 *       another SOdesignation appears
2765 *
2766 *
2767 *      ESC $ ) E       Indicates the bytes following SO are as defined
2768 *       in ISO-IR-165 (for details, see section 2.1),
2769 *       until another SOdesignation appears
2770 *
2771 *      ESC $ ) G       Indicates the bytes following SO are as defined
2772 *       in CNS 11643-plane-1, until another
2773 *       SOdesignation appears
2774 *
2775 *      ESC $ * H       Indicates the two bytes immediately following
2776 *       SS2 is a Chinese character as defined in CNS
2777 *       11643-plane-2, until another SS2designation
2778 *       appears
2779 *       (Meaning <ESC>N must precede every 2 byte
2780 *        sequence.)
2781 *
2782 *      ESC $ + I       Indicates the immediate two bytes following SS3
2783 *       is a Chinese character as defined in CNS
2784 *       11643-plane-3, until another SS3designation
2785 *       appears
2786 *       (Meaning <ESC>O must precede every 2 byte
2787 *        sequence.)
2788 *
2789 *      ESC $ + J       Indicates the immediate two bytes following SS3
2790 *       is a Chinese character as defined in CNS
2791 *       11643-plane-4, until another SS3designation
2792 *       appears
2793 *       (In English: <ESC>O must precede every 2 byte
2794 *        sequence.)
2795 *
2796 *      ESC $ + K       Indicates the immediate two bytes following SS3
2797 *       is a Chinese character as defined in CNS
2798 *       11643-plane-5, until another SS3designation
2799 *       appears
2800 *
2801 *      ESC $ + L       Indicates the immediate two bytes following SS3
2802 *       is a Chinese character as defined in CNS
2803 *       11643-plane-6, until another SS3designation
2804 *       appears
2805 *
2806 *      ESC $ + M       Indicates the immediate two bytes following SS3
2807 *       is a Chinese character as defined in CNS
2808 *       11643-plane-7, until another SS3designation
2809 *       appears
2810 *
2811 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2812 *       has its own designation information before any Chinese characters
2813 *       appear
2814 *
2815 */
2816 
2817 /* The following are defined this way to make the strings truly readonly */
2818 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2819 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2820 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2821 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2822 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2823 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2824 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2825 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2826 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2827 
2828 /********************** ISO2022-CN Data **************************/
2829 static const char* const escSeqCharsCN[10] ={
2830         SHIFT_IN_STR,                   /* 0 ASCII */
2831         GB_2312_80_STR,                 /* 1 GB2312_1 */
2832         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2833         CNS_11643_1992_Plane_1_STR,
2834         CNS_11643_1992_Plane_2_STR,
2835         CNS_11643_1992_Plane_3_STR,
2836         CNS_11643_1992_Plane_4_STR,
2837         CNS_11643_1992_Plane_5_STR,
2838         CNS_11643_1992_Plane_6_STR,
2839         CNS_11643_1992_Plane_7_STR
2840 };
2841 
2842 static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,UErrorCode * err)2843 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2844     UConverter *cnv = args->converter;
2845     UConverterDataISO2022 *converterData;
2846     ISO2022State *pFromU2022State;
2847     uint8_t *target = (uint8_t *) args->target;
2848     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2849     const char16_t* source = args->source;
2850     const char16_t* sourceLimit = args->sourceLimit;
2851     int32_t* offsets = args->offsets;
2852     UChar32 sourceChar;
2853     char buffer[8];
2854     int32_t len;
2855     int8_t choices[3];
2856     int32_t choiceCount;
2857     uint32_t targetValue = 0;
2858     UBool useFallback;
2859 
2860     /* set up the state */
2861     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2862     pFromU2022State   = &converterData->fromU2022State;
2863 
2864     choiceCount = 0;
2865 
2866     /* check if the last codepoint of previous buffer was a lead surrogate*/
2867     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2868         goto getTrail;
2869     }
2870 
2871     while( source < sourceLimit){
2872         if(target < targetLimit){
2873 
2874             sourceChar  = *(source++);
2875             /*check if the char is a First surrogate*/
2876              if(U16_IS_SURROGATE(sourceChar)) {
2877                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2878 getTrail:
2879                     /*look ahead to find the trail surrogate*/
2880                     if(source < sourceLimit) {
2881                         /* test the following code unit */
2882                         char16_t trail=(char16_t) *source;
2883                         if(U16_IS_TRAIL(trail)) {
2884                             source++;
2885                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2886                             cnv->fromUChar32=0x00;
2887                             /* convert this supplementary code point */
2888                             /* exit this condition tree */
2889                         } else {
2890                             /* this is an unmatched lead code unit (1st surrogate) */
2891                             /* callback(illegal) */
2892                             *err=U_ILLEGAL_CHAR_FOUND;
2893                             cnv->fromUChar32=sourceChar;
2894                             break;
2895                         }
2896                     } else {
2897                         /* no more input */
2898                         cnv->fromUChar32=sourceChar;
2899                         break;
2900                     }
2901                 } else {
2902                     /* this is an unmatched trail code unit (2nd surrogate) */
2903                     /* callback(illegal) */
2904                     *err=U_ILLEGAL_CHAR_FOUND;
2905                     cnv->fromUChar32=sourceChar;
2906                     break;
2907                 }
2908             }
2909 
2910             /* do the conversion */
2911             if(sourceChar <= 0x007f ){
2912                 /* do not convert SO/SI/ESC */
2913                 if(IS_2022_CONTROL(sourceChar)) {
2914                     /* callback(illegal) */
2915                     *err=U_ILLEGAL_CHAR_FOUND;
2916                     cnv->fromUChar32=sourceChar;
2917                     break;
2918                 }
2919 
2920                 /* US-ASCII */
2921                 if(pFromU2022State->g == 0) {
2922                     buffer[0] = (char)sourceChar;
2923                     len = 1;
2924                 } else {
2925                     buffer[0] = UCNV_SI;
2926                     buffer[1] = (char)sourceChar;
2927                     len = 2;
2928                     pFromU2022State->g = 0;
2929                     choiceCount = 0;
2930                 }
2931                 if(sourceChar == CR || sourceChar == LF) {
2932                     /* reset the state at the end of a line */
2933                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2934                     choiceCount = 0;
2935                 }
2936             }
2937             else{
2938                 /* convert U+0080..U+10ffff */
2939                 int32_t i;
2940                 int8_t cs, g;
2941 
2942                 if(choiceCount == 0) {
2943                     /* try the current SO/G1 converter first */
2944                     choices[0] = pFromU2022State->cs[1];
2945 
2946                     /* default to GB2312_1 if none is designated yet */
2947                     if(choices[0] == 0) {
2948                         choices[0] = GB2312_1;
2949                     }
2950 
2951                     if(converterData->version == 0) {
2952                         /* ISO-2022-CN */
2953 
2954                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2955                         if(choices[0] == GB2312_1) {
2956                             choices[1] = (int8_t)CNS_11643_1;
2957                         } else {
2958                             choices[1] = (int8_t)GB2312_1;
2959                         }
2960 
2961                         choiceCount = 2;
2962                     } else if (converterData->version == 1) {
2963                         /* ISO-2022-CN-EXT */
2964 
2965                         /* try one of the other converters */
2966                         switch(choices[0]) {
2967                         case GB2312_1:
2968                             choices[1] = (int8_t)CNS_11643_1;
2969                             choices[2] = (int8_t)ISO_IR_165;
2970                             break;
2971                         case ISO_IR_165:
2972                             choices[1] = (int8_t)GB2312_1;
2973                             choices[2] = (int8_t)CNS_11643_1;
2974                             break;
2975                         default: /* CNS_11643_x */
2976                             choices[1] = (int8_t)GB2312_1;
2977                             choices[2] = (int8_t)ISO_IR_165;
2978                             break;
2979                         }
2980 
2981                         choiceCount = 3;
2982                     } else {
2983                         choices[0] = (int8_t)CNS_11643_1;
2984                         choices[1] = (int8_t)GB2312_1;
2985                     }
2986                 }
2987 
2988                 cs = g = 0;
2989                 /*
2990                  * len==0: no mapping found yet
2991                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2992                  * len>0: found a roundtrip result, done
2993                  */
2994                 len = 0;
2995                 /*
2996                  * We will turn off useFallback after finding a fallback,
2997                  * but we still get fallbacks from PUA code points as usual.
2998                  * Therefore, we will also need to check that we don't overwrite
2999                  * an early fallback with a later one.
3000                  */
3001                 useFallback = cnv->useFallback;
3002 
3003                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3004                     int8_t cs0 = choices[i];
3005                     if(cs0 > 0) {
3006                         uint32_t value;
3007                         int32_t len2;
3008                         if(cs0 >= CNS_11643_0) {
3009                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3010                                         converterData->myConverterArray[CNS_11643],
3011                                         sourceChar,
3012                                         &value,
3013                                         useFallback,
3014                                         MBCS_OUTPUT_3);
3015                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3016                                 targetValue = value;
3017                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3018                                 if(len2 >= 0) {
3019                                     len = 2;
3020                                 } else {
3021                                     len = -2;
3022                                     useFallback = false;
3023                                 }
3024                                 if(cs == CNS_11643_1) {
3025                                     g = 1;
3026                                 } else if(cs == CNS_11643_2) {
3027                                     g = 2;
3028                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3029                                     g = 3;
3030                                 } else {
3031                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3032                                     len = 0;
3033                                 }
3034                             }
3035                         } else {
3036                             /* GB2312_1 or ISO-IR-165 */
3037                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3038                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3039                                         converterData->myConverterArray[cs0],
3040                                         sourceChar,
3041                                         &value,
3042                                         useFallback,
3043                                         MBCS_OUTPUT_2);
3044                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3045                                 targetValue = value;
3046                                 len = len2;
3047                                 cs = cs0;
3048                                 g = 1;
3049                                 useFallback = false;
3050                             }
3051                         }
3052                     }
3053                 }
3054 
3055                 if(len != 0) {
3056                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3057 
3058                     /* write the designation sequence if necessary */
3059                     if(cs != pFromU2022State->cs[g]) {
3060                         if(cs < CNS_11643) {
3061                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3062                         } else {
3063                             U_ASSERT(cs >= CNS_11643_1);
3064                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3065                         }
3066                         len = 4;
3067                         pFromU2022State->cs[g] = cs;
3068                         if(g == 1) {
3069                             /* changing the SO/G1 charset invalidates the choices[] */
3070                             choiceCount = 0;
3071                         }
3072                     }
3073 
3074                     /* write the shift sequence if necessary */
3075                     if(g != pFromU2022State->g) {
3076                         switch(g) {
3077                         case 1:
3078                             buffer[len++] = UCNV_SO;
3079 
3080                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3081                             pFromU2022State->g = 1;
3082                             break;
3083                         case 2:
3084                             buffer[len++] = 0x1b;
3085                             buffer[len++] = 0x4e;
3086                             break;
3087                         default: /* case 3 */
3088                             buffer[len++] = 0x1b;
3089                             buffer[len++] = 0x4f;
3090                             break;
3091                         }
3092                     }
3093 
3094                     /* write the two output bytes */
3095                     buffer[len++] = (char)(targetValue >> 8);
3096                     buffer[len++] = (char)targetValue;
3097                 } else {
3098                     /* if we cannot find the character after checking all codepages
3099                      * then this is an error
3100                      */
3101                     *err = U_INVALID_CHAR_FOUND;
3102                     cnv->fromUChar32=sourceChar;
3103                     break;
3104                 }
3105             }
3106 
3107             /* output len>0 bytes in buffer[] */
3108             if(len == 1) {
3109                 *target++ = buffer[0];
3110                 if(offsets) {
3111                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3112                 }
3113             } else if(len == 2 && (target + 2) <= targetLimit) {
3114                 *target++ = buffer[0];
3115                 *target++ = buffer[1];
3116                 if(offsets) {
3117                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3118                     *offsets++ = sourceIndex;
3119                     *offsets++ = sourceIndex;
3120                 }
3121             } else {
3122                 fromUWriteUInt8(
3123                     cnv,
3124                     buffer, len,
3125                     &target, (const char *)targetLimit,
3126                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3127                     err);
3128                 if(U_FAILURE(*err)) {
3129                     break;
3130                 }
3131             }
3132         } /* end if(myTargetIndex<myTargetLength) */
3133         else{
3134             *err =U_BUFFER_OVERFLOW_ERROR;
3135             break;
3136         }
3137 
3138     }/* end while(mySourceIndex<mySourceLength) */
3139 
3140     /*
3141      * the end of the input stream and detection of truncated input
3142      * are handled by the framework, but for ISO-2022-CN conversion
3143      * we need to be in ASCII mode at the very end
3144      *
3145      * conditions:
3146      *   successful
3147      *   not in ASCII mode
3148      *   end of input and no truncated input
3149      */
3150     if( U_SUCCESS(*err) &&
3151         pFromU2022State->g!=0 &&
3152         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3153     ) {
3154         int32_t sourceIndex;
3155 
3156         /* we are switching to ASCII */
3157         pFromU2022State->g=0;
3158 
3159         /* get the source index of the last input character */
3160         /*
3161          * TODO this would be simpler and more reliable if we used a pair
3162          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3163          * so that we could simply use the prevSourceIndex here;
3164          * this code gives an incorrect result for the rare case of an unmatched
3165          * trail surrogate that is alone in the last buffer of the text stream
3166          */
3167         sourceIndex=(int32_t)(source-args->source);
3168         if(sourceIndex>0) {
3169             --sourceIndex;
3170             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3171                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3172             ) {
3173                 --sourceIndex;
3174             }
3175         } else {
3176             sourceIndex=-1;
3177         }
3178 
3179         fromUWriteUInt8(
3180             cnv,
3181             SHIFT_IN_STR, 1,
3182             &target, (const char *)targetLimit,
3183             &offsets, sourceIndex,
3184             err);
3185     }
3186 
3187     /*save the state and return */
3188     args->source = source;
3189     args->target = (char*)target;
3190 }
3191 
3192 
3193 static void U_CALLCONV
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs * args,UErrorCode * err)3194 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3195                                                UErrorCode* err){
3196     char tempBuf[3];
3197     const char *mySource = (char *) args->source;
3198     char16_t *myTarget = args->target;
3199     const char *mySourceLimit = args->sourceLimit;
3200     uint32_t targetUniChar = 0x0000;
3201     uint32_t mySourceChar = 0x0000;
3202     UConverterDataISO2022* myData;
3203     ISO2022State *pToU2022State;
3204 
3205     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3206     pToU2022State = &myData->toU2022State;
3207 
3208     if(myData->key != 0) {
3209         /* continue with a partial escape sequence */
3210         goto escape;
3211     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3212         /* continue with a partial double-byte character */
3213         mySourceChar = args->converter->toUBytes[0];
3214         args->converter->toULength = 0;
3215         targetUniChar = missingCharMarker;
3216         goto getTrailByte;
3217     }
3218 
3219     while(mySource < mySourceLimit){
3220 
3221         targetUniChar =missingCharMarker;
3222 
3223         if(myTarget < args->targetLimit){
3224 
3225             mySourceChar= (unsigned char) *mySource++;
3226 
3227             switch(mySourceChar){
3228             case UCNV_SI:
3229                 pToU2022State->g=0;
3230                 if (myData->isEmptySegment) {
3231                     myData->isEmptySegment = false;	/* we are handling it, reset to avoid future spurious errors */
3232                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3233                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3234                     args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3235                     args->converter->toULength = 1;
3236                     args->target = myTarget;
3237                     args->source = mySource;
3238                     return;
3239                 }
3240                 continue;
3241 
3242             case UCNV_SO:
3243                 if(pToU2022State->cs[1] != 0) {
3244                     pToU2022State->g=1;
3245                     myData->isEmptySegment = true;	/* Begin a new segment, empty so far */
3246                     continue;
3247                 } else {
3248                     /* illegal to have SO before a matching designator */
3249                     myData->isEmptySegment = false;	/* Handling a different error, reset this to avoid future spurious errs */
3250                     break;
3251                 }
3252 
3253             case ESC_2022:
3254                 mySource--;
3255 escape:
3256                 {
3257                     const char * mySourceBefore = mySource;
3258                     int8_t toULengthBefore = args->converter->toULength;
3259 
3260                     changeState_2022(args->converter,&(mySource),
3261                         mySourceLimit, ISO_2022_CN,err);
3262 
3263                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3264                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3265                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3266                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
3267                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3268                     }
3269                 }
3270 
3271                 /* invalid or illegal escape sequence */
3272                 if(U_FAILURE(*err)){
3273                     args->target = myTarget;
3274                     args->source = mySource;
3275                     myData->isEmptySegment = false;	/* Reset to avoid future spurious errors */
3276                     return;
3277                 }
3278                 continue;
3279 
3280             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3281 
3282             case CR:
3283             case LF:
3284                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3285                 U_FALLTHROUGH;
3286             default:
3287                 /* convert one or two bytes */
3288                 myData->isEmptySegment = false;
3289                 if(pToU2022State->g != 0) {
3290                     if(mySource < mySourceLimit) {
3291                         UConverterSharedData *cnv;
3292                         StateEnum tempState;
3293                         int32_t tempBufLen;
3294                         int leadIsOk, trailIsOk;
3295                         uint8_t trailByte;
3296 getTrailByte:
3297                         trailByte = (uint8_t)*mySource;
3298                         /*
3299                          * Ticket 5691: consistent illegal sequences:
3300                          * - We include at least the first byte in the illegal sequence.
3301                          * - If any of the non-initial bytes could be the start of a character,
3302                          *   we stop the illegal sequence before the first one of those.
3303                          *
3304                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3305                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3306                          * Otherwise we convert or report the pair of bytes.
3307                          */
3308                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3309                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3310                         if (leadIsOk && trailIsOk) {
3311                             ++mySource;
3312                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3313                             if(tempState >= CNS_11643_0) {
3314                                 cnv = myData->myConverterArray[CNS_11643];
3315                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3316                                 tempBuf[1] = (char) (mySourceChar);
3317                                 tempBuf[2] = (char) trailByte;
3318                                 tempBufLen = 3;
3319 
3320                             }else{
3321                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3322                                 cnv = myData->myConverterArray[tempState];
3323                                 tempBuf[0] = (char) (mySourceChar);
3324                                 tempBuf[1] = (char) trailByte;
3325                                 tempBufLen = 2;
3326                             }
3327                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
3328                             mySourceChar = (mySourceChar << 8) | trailByte;
3329                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3330                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3331                             ++mySource;
3332                             /* add another bit so that the code below writes 2 bytes in case of error */
3333                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3334                         }
3335                         if(pToU2022State->g>=2) {
3336                             /* return from a single-shift state to the previous one */
3337                             pToU2022State->g=pToU2022State->prevG;
3338                         }
3339                     } else {
3340                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3341                         args->converter->toULength = 1;
3342                         goto endloop;
3343                     }
3344                 }
3345                 else{
3346                     if(mySourceChar <= 0x7f) {
3347                         targetUniChar = (char16_t) mySourceChar;
3348                     }
3349                 }
3350                 break;
3351             }
3352             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3353                 if(args->offsets){
3354                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3355                 }
3356                 *(myTarget++)=(char16_t)targetUniChar;
3357             }
3358             else if(targetUniChar > missingCharMarker){
3359                 /* disassemble the surrogate pair and write to output*/
3360                 targetUniChar-=0x0010000;
3361                 *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
3362                 if(args->offsets){
3363                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3364                 }
3365                 ++myTarget;
3366                 if(myTarget< args->targetLimit){
3367                     *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
3368                     if(args->offsets){
3369                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3370                     }
3371                     ++myTarget;
3372                 }else{
3373                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3374                                     (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
3375                 }
3376 
3377             }
3378             else{
3379                 /* Call the callback function*/
3380                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3381                 break;
3382             }
3383         }
3384         else{
3385             *err =U_BUFFER_OVERFLOW_ERROR;
3386             break;
3387         }
3388     }
3389 endloop:
3390     args->target = myTarget;
3391     args->source = mySource;
3392 }
3393 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3394 
3395 static void U_CALLCONV
_ISO_2022_WriteSub(UConverterFromUnicodeArgs * args,int32_t offsetIndex,UErrorCode * err)3396 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3397     UConverter *cnv = args->converter;
3398     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3399     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3400     char *p, *subchar;
3401     char buffer[8];
3402     int32_t length;
3403 
3404     subchar=(char *)cnv->subChars;
3405     length=cnv->subCharLen; /* assume length==1 for most variants */
3406 
3407     p = buffer;
3408     switch(myConverterData->locale[0]){
3409     case 'j':
3410         {
3411             int8_t cs;
3412 
3413             if(pFromU2022State->g == 1) {
3414                 /* JIS7: switch from G1 to G0 */
3415                 pFromU2022State->g = 0;
3416                 *p++ = UCNV_SI;
3417             }
3418 
3419             cs = pFromU2022State->cs[0];
3420             if(cs != ASCII && cs != JISX201) {
3421                 /* not in ASCII or JIS X 0201: switch to ASCII */
3422                 pFromU2022State->cs[0] = (int8_t)ASCII;
3423                 *p++ = '\x1b';
3424                 *p++ = '\x28';
3425                 *p++ = '\x42';
3426             }
3427 
3428             *p++ = subchar[0];
3429             break;
3430         }
3431     case 'c':
3432         if(pFromU2022State->g != 0) {
3433             /* not in ASCII mode: switch to ASCII */
3434             pFromU2022State->g = 0;
3435             *p++ = UCNV_SI;
3436         }
3437         *p++ = subchar[0];
3438         break;
3439     case 'k':
3440         if(myConverterData->version == 0) {
3441             if(length == 1) {
3442                 if(args->converter->fromUnicodeStatus) {
3443                     /* in DBCS mode: switch to SBCS */
3444                     args->converter->fromUnicodeStatus = 0;
3445                     *p++ = UCNV_SI;
3446                 }
3447                 *p++ = subchar[0];
3448             } else /* length == 2*/ {
3449                 if(!args->converter->fromUnicodeStatus) {
3450                     /* in SBCS mode: switch to DBCS */
3451                     args->converter->fromUnicodeStatus = 1;
3452                     *p++ = UCNV_SO;
3453                 }
3454                 *p++ = subchar[0];
3455                 *p++ = subchar[1];
3456             }
3457             break;
3458         } else {
3459             /* save the subconverter's substitution string */
3460             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3461             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3462 
3463             /* set our substitution string into the subconverter */
3464             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3465             myConverterData->currentConverter->subCharLen = (int8_t)length;
3466 
3467             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3468             args->converter = myConverterData->currentConverter;
3469             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3470             ucnv_cbFromUWriteSub(args, 0, err);
3471             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3472             args->converter = cnv;
3473 
3474             /* restore the subconverter's substitution string */
3475             myConverterData->currentConverter->subChars = currentSubChars;
3476             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3477 
3478             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3479                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3480                     uprv_memcpy(
3481                         cnv->charErrorBuffer,
3482                         myConverterData->currentConverter->charErrorBuffer,
3483                         myConverterData->currentConverter->charErrorBufferLength);
3484                 }
3485                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3486                 myConverterData->currentConverter->charErrorBufferLength = 0;
3487             }
3488             return;
3489         }
3490     default:
3491         /* not expected */
3492         break;
3493     }
3494     ucnv_cbFromUWriteBytes(args,
3495                            buffer, (int32_t)(p - buffer),
3496                            offsetIndex, err);
3497 }
3498 
3499 /*
3500  * Structure for cloning an ISO 2022 converter into a single memory block.
3501  */
3502 struct cloneStruct
3503 {
3504     UConverter cnv;
3505     UConverter currentConverter;
3506     UConverterDataISO2022 mydata;
3507 };
3508 
3509 
3510 U_CDECL_BEGIN
3511 
3512 static UConverter * U_CALLCONV
_ISO_2022_SafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)3513 _ISO_2022_SafeClone(
3514             const UConverter *cnv,
3515             void *stackBuffer,
3516             int32_t *pBufferSize,
3517             UErrorCode *status)
3518 {
3519     struct cloneStruct * localClone;
3520     UConverterDataISO2022 *cnvData;
3521     int32_t i, size;
3522 
3523     if (U_FAILURE(*status)){
3524         return nullptr;
3525     }
3526 
3527     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3528         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3529         return nullptr;
3530     }
3531 
3532     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3533     localClone = (struct cloneStruct *)stackBuffer;
3534 
3535     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3536 
3537     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3538     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3539     localClone->cnv.isExtraLocal = true;
3540 
3541     /* share the subconverters */
3542 
3543     if(cnvData->currentConverter != nullptr) {
3544         size = (int32_t)sizeof(UConverter);
3545         localClone->mydata.currentConverter =
3546             ucnv_safeClone(cnvData->currentConverter,
3547                             &localClone->currentConverter,
3548                             &size, status);
3549         if(U_FAILURE(*status)) {
3550             return nullptr;
3551         }
3552     }
3553 
3554     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3555         if(cnvData->myConverterArray[i] != nullptr) {
3556             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3557         }
3558     }
3559 
3560     return &localClone->cnv;
3561 }
3562 
3563 U_CDECL_END
3564 
3565 static void U_CALLCONV
_ISO_2022_GetUnicodeSet(const UConverter * cnv,const USetAdder * sa,UConverterUnicodeSet which,UErrorCode * pErrorCode)3566 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3567                     const USetAdder *sa,
3568                     UConverterUnicodeSet which,
3569                     UErrorCode *pErrorCode)
3570 {
3571     int32_t i;
3572     UConverterDataISO2022* cnvData;
3573 
3574     if (U_FAILURE(*pErrorCode)) {
3575         return;
3576     }
3577 #ifdef U_ENABLE_GENERIC_ISO_2022
3578     if (cnv->sharedData == &_ISO2022Data) {
3579         /* We use UTF-8 in this case */
3580         sa->addRange(sa->set, 0, 0xd7FF);
3581         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3582         return;
3583     }
3584 #endif
3585 
3586     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3587 
3588     /* open a set and initialize it with code points that are algorithmically round-tripped */
3589     switch(cnvData->locale[0]){
3590     case 'j':
3591         /* include JIS X 0201 which is hardcoded */
3592         sa->add(sa->set, 0xa5);
3593         sa->add(sa->set, 0x203e);
3594         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3595             /* include Latin-1 for some variants of JP */
3596             sa->addRange(sa->set, 0, 0xff);
3597         } else {
3598             /* include ASCII for JP */
3599             sa->addRange(sa->set, 0, 0x7f);
3600         }
3601         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3602             /*
3603              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3604              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3605              * use half-width Katakana.
3606              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3607              * half-width Katakana via the ESC ( I sequence.
3608              * However, we only emit (fromUnicode) half-width Katakana according to the
3609              * definition of each variant.
3610              *
3611              * When including fallbacks,
3612              * we need to include half-width Katakana Unicode code points for all JP variants because
3613              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3614              */
3615             /* include half-width Katakana for JP */
3616             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3617         }
3618         break;
3619 #if !UCONFIG_ONLY_HTML_CONVERSION
3620     case 'c':
3621     case 'z':
3622         /* include ASCII for CN */
3623         sa->addRange(sa->set, 0, 0x7f);
3624         break;
3625     case 'k':
3626         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3627         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3628                 cnvData->currentConverter, sa, which, pErrorCode);
3629         /* the loop over myConverterArray[] will simply not find another converter */
3630         break;
3631 #endif
3632     default:
3633         break;
3634     }
3635 
3636 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3637             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3638                 cnvData->version==0 && i==CNS_11643
3639             ) {
3640                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3641                 ucnv_MBCSGetUnicodeSetForBytes(
3642                         cnvData->myConverterArray[i],
3643                         sa, UCNV_ROUNDTRIP_SET,
3644                         0, 0x81, 0x82,
3645                         pErrorCode);
3646             }
3647 #endif
3648 
3649     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3650         UConverterSetFilter filter;
3651         if(cnvData->myConverterArray[i]!=nullptr) {
3652             if(cnvData->locale[0]=='j' && i==JISX208) {
3653                 /*
3654                  * Only add code points that map to Shift-JIS codes
3655                  * corresponding to JIS X 0208.
3656                  */
3657                 filter=UCNV_SET_FILTER_SJIS;
3658 #if !UCONFIG_ONLY_HTML_CONVERSION
3659             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3660                        cnvData->version==0 && i==CNS_11643) {
3661                 /*
3662                  * Version-specific for CN:
3663                  * CN version 0 does not map CNS planes 3..7 although
3664                  * they are all available in the CNS conversion table;
3665                  * CN version 1 (-EXT) does map them all.
3666                  * The two versions create different Unicode sets.
3667                  */
3668                 filter=UCNV_SET_FILTER_2022_CN;
3669             } else if(i==KSC5601) {
3670                 /*
3671                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3672                  * are broader than GR94.
3673                  */
3674                 filter=UCNV_SET_FILTER_GR94DBCS;
3675 #endif
3676             } else {
3677                 filter=UCNV_SET_FILTER_NONE;
3678             }
3679             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3680         }
3681     }
3682 
3683     /*
3684      * ISO 2022 converters must not convert SO/SI/ESC despite what
3685      * sub-converters do by themselves.
3686      * Remove these characters from the set.
3687      */
3688     sa->remove(sa->set, 0x0e);
3689     sa->remove(sa->set, 0x0f);
3690     sa->remove(sa->set, 0x1b);
3691 
3692     /* ISO 2022 converters do not convert C1 controls either */
3693     sa->removeRange(sa->set, 0x80, 0x9f);
3694 }
3695 
3696 static const UConverterImpl _ISO2022Impl={
3697     UCNV_ISO_2022,
3698 
3699     nullptr,
3700     nullptr,
3701 
3702     _ISO2022Open,
3703     _ISO2022Close,
3704     _ISO2022Reset,
3705 
3706 #ifdef U_ENABLE_GENERIC_ISO_2022
3707     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3708     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3709     ucnv_fromUnicode_UTF8,
3710     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3711 #else
3712     nullptr,
3713     nullptr,
3714     nullptr,
3715     nullptr,
3716 #endif
3717     nullptr,
3718 
3719     nullptr,
3720     _ISO2022getName,
3721     _ISO_2022_WriteSub,
3722     _ISO_2022_SafeClone,
3723     _ISO_2022_GetUnicodeSet,
3724 
3725     nullptr,
3726     nullptr
3727 };
3728 static const UConverterStaticData _ISO2022StaticData={
3729     sizeof(UConverterStaticData),
3730     "ISO_2022",
3731     2022,
3732     UCNV_IBM,
3733     UCNV_ISO_2022,
3734     1,
3735     3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
3736     { 0x1a, 0, 0, 0 },
3737     1,
3738     false,
3739     false,
3740     0,
3741     0,
3742     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3743 };
3744 const UConverterSharedData _ISO2022Data=
3745         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3746 
3747 /*************JP****************/
3748 static const UConverterImpl _ISO2022JPImpl={
3749     UCNV_ISO_2022,
3750 
3751     nullptr,
3752     nullptr,
3753 
3754     _ISO2022Open,
3755     _ISO2022Close,
3756     _ISO2022Reset,
3757 
3758     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3759     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3760     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3761     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3762     nullptr,
3763 
3764     nullptr,
3765     _ISO2022getName,
3766     _ISO_2022_WriteSub,
3767     _ISO_2022_SafeClone,
3768     _ISO_2022_GetUnicodeSet,
3769 
3770     nullptr,
3771     nullptr
3772 };
3773 static const UConverterStaticData _ISO2022JPStaticData={
3774     sizeof(UConverterStaticData),
3775     "ISO_2022_JP",
3776     0,
3777     UCNV_IBM,
3778     UCNV_ISO_2022,
3779     1,
3780     6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
3781     { 0x1a, 0, 0, 0 },
3782     1,
3783     false,
3784     false,
3785     0,
3786     0,
3787     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3788 };
3789 
3790 namespace {
3791 
3792 const UConverterSharedData _ISO2022JPData=
3793         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3794 
3795 }  // namespace
3796 
3797 #if !UCONFIG_ONLY_HTML_CONVERSION
3798 /************* KR ***************/
3799 static const UConverterImpl _ISO2022KRImpl={
3800     UCNV_ISO_2022,
3801 
3802     nullptr,
3803     nullptr,
3804 
3805     _ISO2022Open,
3806     _ISO2022Close,
3807     _ISO2022Reset,
3808 
3809     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3810     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3811     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3812     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3813     nullptr,
3814 
3815     nullptr,
3816     _ISO2022getName,
3817     _ISO_2022_WriteSub,
3818     _ISO_2022_SafeClone,
3819     _ISO_2022_GetUnicodeSet,
3820 
3821     nullptr,
3822     nullptr
3823 };
3824 static const UConverterStaticData _ISO2022KRStaticData={
3825     sizeof(UConverterStaticData),
3826     "ISO_2022_KR",
3827     0,
3828     UCNV_IBM,
3829     UCNV_ISO_2022,
3830     1,
3831     8, /* max 8 bytes per char16_t */
3832     { 0x1a, 0, 0, 0 },
3833     1,
3834     false,
3835     false,
3836     0,
3837     0,
3838     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3839 };
3840 
3841 namespace {
3842 
3843 const UConverterSharedData _ISO2022KRData=
3844         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3845 
3846 }  // namespace
3847 
3848 /*************** CN ***************/
3849 static const UConverterImpl _ISO2022CNImpl={
3850 
3851     UCNV_ISO_2022,
3852 
3853     nullptr,
3854     nullptr,
3855 
3856     _ISO2022Open,
3857     _ISO2022Close,
3858     _ISO2022Reset,
3859 
3860     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3861     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3862     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3863     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3864     nullptr,
3865 
3866     nullptr,
3867     _ISO2022getName,
3868     _ISO_2022_WriteSub,
3869     _ISO_2022_SafeClone,
3870     _ISO_2022_GetUnicodeSet,
3871 
3872     nullptr,
3873     nullptr
3874 };
3875 static const UConverterStaticData _ISO2022CNStaticData={
3876     sizeof(UConverterStaticData),
3877     "ISO_2022_CN",
3878     0,
3879     UCNV_IBM,
3880     UCNV_ISO_2022,
3881     1,
3882     8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3883     { 0x1a, 0, 0, 0 },
3884     1,
3885     false,
3886     false,
3887     0,
3888     0,
3889     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3890 };
3891 
3892 namespace {
3893 
3894 const UConverterSharedData _ISO2022CNData=
3895         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3896 
3897 }  // namespace
3898 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3899 
3900 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
3901