xref: /aosp_15_r20/external/lzma/CPP/Common/UTFConvert.h (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1*f6dc9357SAndroid Build Coastguard Worker // Common/UTFConvert.h
2*f6dc9357SAndroid Build Coastguard Worker 
3*f6dc9357SAndroid Build Coastguard Worker #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H
4*f6dc9357SAndroid Build Coastguard Worker #define ZIP7_INC_COMMON_UTF_CONVERT_H
5*f6dc9357SAndroid Build Coastguard Worker 
6*f6dc9357SAndroid Build Coastguard Worker #include "MyBuffer.h"
7*f6dc9357SAndroid Build Coastguard Worker #include "MyString.h"
8*f6dc9357SAndroid Build Coastguard Worker 
9*f6dc9357SAndroid Build Coastguard Worker struct CUtf8Check
10*f6dc9357SAndroid Build Coastguard Worker {
11*f6dc9357SAndroid Build Coastguard Worker   // Byte MaxByte;     // in original src stream
12*f6dc9357SAndroid Build Coastguard Worker   bool NonUtf;
13*f6dc9357SAndroid Build Coastguard Worker   bool ZeroChar;
14*f6dc9357SAndroid Build Coastguard Worker   bool SingleSurrogate;
15*f6dc9357SAndroid Build Coastguard Worker   bool Escape;
16*f6dc9357SAndroid Build Coastguard Worker   bool Truncated;
17*f6dc9357SAndroid Build Coastguard Worker   UInt32 MaxHighPoint;  // only for points >= 0x80
18*f6dc9357SAndroid Build Coastguard Worker 
CUtf8CheckCUtf8Check19*f6dc9357SAndroid Build Coastguard Worker   CUtf8Check() { Clear(); }
20*f6dc9357SAndroid Build Coastguard Worker 
ClearCUtf8Check21*f6dc9357SAndroid Build Coastguard Worker   void Clear()
22*f6dc9357SAndroid Build Coastguard Worker   {
23*f6dc9357SAndroid Build Coastguard Worker     // MaxByte = 0;
24*f6dc9357SAndroid Build Coastguard Worker     NonUtf = false;
25*f6dc9357SAndroid Build Coastguard Worker     ZeroChar = false;
26*f6dc9357SAndroid Build Coastguard Worker     SingleSurrogate = false;
27*f6dc9357SAndroid Build Coastguard Worker     Escape = false;
28*f6dc9357SAndroid Build Coastguard Worker     Truncated = false;
29*f6dc9357SAndroid Build Coastguard Worker     MaxHighPoint = 0;
30*f6dc9357SAndroid Build Coastguard Worker   }
31*f6dc9357SAndroid Build Coastguard Worker 
UpdateCUtf8Check32*f6dc9357SAndroid Build Coastguard Worker   void Update(const CUtf8Check &c)
33*f6dc9357SAndroid Build Coastguard Worker   {
34*f6dc9357SAndroid Build Coastguard Worker     if (c.NonUtf) NonUtf = true;
35*f6dc9357SAndroid Build Coastguard Worker     if (c.ZeroChar) ZeroChar = true;
36*f6dc9357SAndroid Build Coastguard Worker     if (c.SingleSurrogate) SingleSurrogate = true;
37*f6dc9357SAndroid Build Coastguard Worker     if (c.Escape) Escape = true;
38*f6dc9357SAndroid Build Coastguard Worker     if (c.Truncated) Truncated = true;
39*f6dc9357SAndroid Build Coastguard Worker     if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint;
40*f6dc9357SAndroid Build Coastguard Worker   }
41*f6dc9357SAndroid Build Coastguard Worker 
PrintStatusCUtf8Check42*f6dc9357SAndroid Build Coastguard Worker   void PrintStatus(AString &s) const
43*f6dc9357SAndroid Build Coastguard Worker   {
44*f6dc9357SAndroid Build Coastguard Worker     s.Empty();
45*f6dc9357SAndroid Build Coastguard Worker 
46*f6dc9357SAndroid Build Coastguard Worker     // s.Add_OptSpaced("MaxByte=");
47*f6dc9357SAndroid Build Coastguard Worker     // s.Add_UInt32(MaxByte);
48*f6dc9357SAndroid Build Coastguard Worker 
49*f6dc9357SAndroid Build Coastguard Worker     if (NonUtf)          s.Add_OptSpaced("non-UTF8");
50*f6dc9357SAndroid Build Coastguard Worker     if (ZeroChar)        s.Add_OptSpaced("ZeroChar");
51*f6dc9357SAndroid Build Coastguard Worker     if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate");
52*f6dc9357SAndroid Build Coastguard Worker     if (Escape)          s.Add_OptSpaced("Escape");
53*f6dc9357SAndroid Build Coastguard Worker     if (Truncated)       s.Add_OptSpaced("Truncated");
54*f6dc9357SAndroid Build Coastguard Worker 
55*f6dc9357SAndroid Build Coastguard Worker     if (MaxHighPoint != 0)
56*f6dc9357SAndroid Build Coastguard Worker     {
57*f6dc9357SAndroid Build Coastguard Worker       s.Add_OptSpaced("MaxUnicode=");
58*f6dc9357SAndroid Build Coastguard Worker       s.Add_UInt32(MaxHighPoint);
59*f6dc9357SAndroid Build Coastguard Worker     }
60*f6dc9357SAndroid Build Coastguard Worker   }
61*f6dc9357SAndroid Build Coastguard Worker 
62*f6dc9357SAndroid Build Coastguard Worker 
63*f6dc9357SAndroid Build Coastguard Worker   bool IsOK(bool allowReduced = false) const
64*f6dc9357SAndroid Build Coastguard Worker   {
65*f6dc9357SAndroid Build Coastguard Worker     if (NonUtf || SingleSurrogate || ZeroChar)
66*f6dc9357SAndroid Build Coastguard Worker       return false;
67*f6dc9357SAndroid Build Coastguard Worker     if (MaxHighPoint >= 0x110000)
68*f6dc9357SAndroid Build Coastguard Worker       return false;
69*f6dc9357SAndroid Build Coastguard Worker     if (Truncated && !allowReduced)
70*f6dc9357SAndroid Build Coastguard Worker       return false;
71*f6dc9357SAndroid Build Coastguard Worker     return true;
72*f6dc9357SAndroid Build Coastguard Worker   }
73*f6dc9357SAndroid Build Coastguard Worker 
74*f6dc9357SAndroid Build Coastguard Worker   // it checks full buffer as specified in (size) and it doesn't stop on zero char
75*f6dc9357SAndroid Build Coastguard Worker   void Check_Buf(const char *src, size_t size) throw();
76*f6dc9357SAndroid Build Coastguard Worker 
Check_AStringCUtf8Check77*f6dc9357SAndroid Build Coastguard Worker   void Check_AString(const AString &s) throw()
78*f6dc9357SAndroid Build Coastguard Worker   {
79*f6dc9357SAndroid Build Coastguard Worker     Check_Buf(s.Ptr(), s.Len());
80*f6dc9357SAndroid Build Coastguard Worker   }
81*f6dc9357SAndroid Build Coastguard Worker };
82*f6dc9357SAndroid Build Coastguard Worker 
83*f6dc9357SAndroid Build Coastguard Worker /*
84*f6dc9357SAndroid Build Coastguard Worker if (allowReduced == false) - all UTF-8 character sequences must be finished.
85*f6dc9357SAndroid Build Coastguard Worker if (allowReduced == true)  - it allows truncated last character-Utf8-sequence
86*f6dc9357SAndroid Build Coastguard Worker */
87*f6dc9357SAndroid Build Coastguard Worker 
88*f6dc9357SAndroid Build Coastguard Worker bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
89*f6dc9357SAndroid Build Coastguard Worker bool CheckUTF8_AString(const AString &s) throw();
90*f6dc9357SAndroid Build Coastguard Worker 
// Bit flags for UTF-8 -> UTF-16 (Unicode) conversion. They occupy bits 0..2
// and can be combined with bitwise OR.
//
// SURROGATE_ERROR:    if set, a SINGLE-SURROGATE-8 is not converted to
//                     SINGLE-SURROGATE-16; ESCAPE (with USE_ESCAPE) or U+fffd
//                     is generated for it instead.
// USE_ESCAPE:         if set, ESCAPE codes (instead of U+fffd) are generated
//                     for non-UTF-8 (invalid) characters, so the original raw
//                     bytes can be restored later.
// BMP_ESCAPE_CONVERT: if set, ESCAPE-16-21 is generated for ESCAPE-8 points.
#define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR    (1 << 0)
#define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE         (1 << 1)
#define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2)
94*f6dc9357SAndroid Build Coastguard Worker 
95*f6dc9357SAndroid Build Coastguard Worker /*
96*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
97*f6dc9357SAndroid Build Coastguard Worker 
98*f6dc9357SAndroid Build Coastguard Worker    if (flag is NOT set)
99*f6dc9357SAndroid Build Coastguard Worker    {
100*f6dc9357SAndroid Build Coastguard Worker      it processes SINGLE-SURROGATE-8 as valid Unicode point.
101*f6dc9357SAndroid Build Coastguard Worker      it converts  SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
     Note: some sequences of two SINGLE-SURROGATE-8 points
103*f6dc9357SAndroid Build Coastguard Worker            will generate correct SURROGATE-16-PAIR, and
104*f6dc9357SAndroid Build Coastguard Worker            that SURROGATE-16-PAIR later will be converted to correct
105*f6dc9357SAndroid Build Coastguard Worker            UTF8-SURROGATE-21 point. So we don't restore original
106*f6dc9357SAndroid Build Coastguard Worker            STR-8 sequence in that case.
107*f6dc9357SAndroid Build Coastguard Worker    }
108*f6dc9357SAndroid Build Coastguard Worker 
109*f6dc9357SAndroid Build Coastguard Worker    if (flag is set)
110*f6dc9357SAndroid Build Coastguard Worker    {
     if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
        it generates ESCAPE for SINGLE-SURROGATE-8,
     if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is NOT set)
        it generates U+fffd for SINGLE-SURROGATE-8.
115*f6dc9357SAndroid Build Coastguard Worker    }
116*f6dc9357SAndroid Build Coastguard Worker 
117*f6dc9357SAndroid Build Coastguard Worker 
118*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
119*f6dc9357SAndroid Build Coastguard Worker 
120*f6dc9357SAndroid Build Coastguard Worker    if (flag is NOT set)
121*f6dc9357SAndroid Build Coastguard Worker      it generates (U+fffd) code for non-UTF-8 (invalid) characters
122*f6dc9357SAndroid Build Coastguard Worker 
123*f6dc9357SAndroid Build Coastguard Worker    if (flag is set)
124*f6dc9357SAndroid Build Coastguard Worker    {
125*f6dc9357SAndroid Build Coastguard Worker      It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
126*f6dc9357SAndroid Build Coastguard Worker      And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
127*f6dc9357SAndroid Build Coastguard Worker    }
128*f6dc9357SAndroid Build Coastguard Worker 
129*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
130*f6dc9357SAndroid Build Coastguard Worker 
131*f6dc9357SAndroid Build Coastguard Worker    if (flag is NOT set)
132*f6dc9357SAndroid Build Coastguard Worker    {
     it processes ESCAPE-8 points like any other Unicode points.
     In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
135*f6dc9357SAndroid Build Coastguard Worker        so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
136*f6dc9357SAndroid Build Coastguard Worker    }
137*f6dc9357SAndroid Build Coastguard Worker 
138*f6dc9357SAndroid Build Coastguard Worker    if (flag is set)
139*f6dc9357SAndroid Build Coastguard Worker    {
140*f6dc9357SAndroid Build Coastguard Worker      it generates ESCAPE-16-21 for ESCAPE-8 points
141*f6dc9357SAndroid Build Coastguard Worker      so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
142*f6dc9357SAndroid Build Coastguard Worker    }
143*f6dc9357SAndroid Build Coastguard Worker 
144*f6dc9357SAndroid Build Coastguard Worker 
145*f6dc9357SAndroid Build Coastguard Worker Main USE CASES with UTF-8 <-> UTF-16 conversions:
146*f6dc9357SAndroid Build Coastguard Worker 
147*f6dc9357SAndroid Build Coastguard Worker  WIN32:   UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
148*f6dc9357SAndroid Build Coastguard Worker    {
149*f6dc9357SAndroid Build Coastguard Worker             set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
150*f6dc9357SAndroid Build Coastguard Worker      Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
151*f6dc9357SAndroid Build Coastguard Worker      Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
152*f6dc9357SAndroid Build Coastguard Worker 
153*f6dc9357SAndroid Build Coastguard Worker      So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
154*f6dc9357SAndroid Build Coastguard Worker    }
155*f6dc9357SAndroid Build Coastguard Worker 
156*f6dc9357SAndroid Build Coastguard Worker  Linux:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
157*f6dc9357SAndroid Build Coastguard Worker    {
     we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16.
159*f6dc9357SAndroid Build Coastguard Worker      Set the flags:
160*f6dc9357SAndroid Build Coastguard Worker        Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
161*f6dc9357SAndroid Build Coastguard Worker        Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
162*f6dc9357SAndroid Build Coastguard Worker        Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
163*f6dc9357SAndroid Build Coastguard Worker    }
164*f6dc9357SAndroid Build Coastguard Worker 
165*f6dc9357SAndroid Build Coastguard Worker  MacOS:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
166*f6dc9357SAndroid Build Coastguard Worker    {
167*f6dc9357SAndroid Build Coastguard Worker      we want to restore correct UTF-8 without any BMP processing:
168*f6dc9357SAndroid Build Coastguard Worker      Set the flags:
169*f6dc9357SAndroid Build Coastguard Worker        Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
170*f6dc9357SAndroid Build Coastguard Worker        Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
171*f6dc9357SAndroid Build Coastguard Worker    }
172*f6dc9357SAndroid Build Coastguard Worker 
173*f6dc9357SAndroid Build Coastguard Worker */
174*f6dc9357SAndroid Build Coastguard Worker 
175*f6dc9357SAndroid Build Coastguard Worker // zero char is not allowed in (src) buf
176*f6dc9357SAndroid Build Coastguard Worker bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
177*f6dc9357SAndroid Build Coastguard Worker 
178*f6dc9357SAndroid Build Coastguard Worker bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
179*f6dc9357SAndroid Build Coastguard Worker bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
180*f6dc9357SAndroid Build Coastguard Worker 
// Bit flags for UTF-16 (Unicode) -> UTF-8 conversion. They start at bit 8, so
// they do not collide with the Z7_UTF_FLAG_FROM_UTF8_* bits (bits 0..2).
//
// SURROGATE_ERROR:    if set, UTF_REPLACEMENT_CHAR (0xfffd) is generated for a
//                     SINGLE-SURROGATE; if not set, the SINGLE-SURROGATE is
//                     written out as normal UTF-8.
// EXTRACT_BMP_ESCAPE: if set, the raw 8-bit symbol is extracted from
//                     Escape-Plane-16.
// PARSE_HIGH_ESCAPE:  currently disabled (kept as a comment).
#define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR    (1 << 8)
#define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9)
// #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE  (1 << 10)
184*f6dc9357SAndroid Build Coastguard Worker 
185*f6dc9357SAndroid Build Coastguard Worker /*
186*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
187*f6dc9357SAndroid Build Coastguard Worker 
188*f6dc9357SAndroid Build Coastguard Worker   if (flag is NOT set)
189*f6dc9357SAndroid Build Coastguard Worker   {
190*f6dc9357SAndroid Build Coastguard Worker      we extract SINGLE-SURROGATE as normal UTF-8
191*f6dc9357SAndroid Build Coastguard Worker 
     In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW.
193*f6dc9357SAndroid Build Coastguard Worker 
194*f6dc9357SAndroid Build Coastguard Worker      In Linux :
195*f6dc9357SAndroid Build Coastguard Worker        use-case-1: UTF-8 -> UTF-16 -> UTF-8  doesn't generate UTF-16 SINGLE-SURROGATE,
196*f6dc9357SAndroid Build Coastguard Worker                    if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used.
197*f6dc9357SAndroid Build Coastguard Worker        use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
198*f6dc9357SAndroid Build Coastguard Worker                    will generate SINGLE-SURROGATE-UTF-8 here.
199*f6dc9357SAndroid Build Coastguard Worker   }
200*f6dc9357SAndroid Build Coastguard Worker 
201*f6dc9357SAndroid Build Coastguard Worker   if (flag is set)
202*f6dc9357SAndroid Build Coastguard Worker   {
203*f6dc9357SAndroid Build Coastguard Worker      we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
204*f6dc9357SAndroid Build Coastguard Worker      it can be used for compatibility mode with WIN32 UTF function
205*f6dc9357SAndroid Build Coastguard Worker      or if we want UTF-8 stream without any errors
206*f6dc9357SAndroid Build Coastguard Worker   }
207*f6dc9357SAndroid Build Coastguard Worker 
208*f6dc9357SAndroid Build Coastguard Worker 
209*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
210*f6dc9357SAndroid Build Coastguard Worker 
211*f6dc9357SAndroid Build Coastguard Worker   if (flag is NOT set) it doesn't extract  raw 8-bit symbol from Escape-Plane-16
212*f6dc9357SAndroid Build Coastguard Worker   if (flag is set)     it         extracts raw 8-bit symbol from Escape-Plane-16
213*f6dc9357SAndroid Build Coastguard Worker 
214*f6dc9357SAndroid Build Coastguard Worker   in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
215*f6dc9357SAndroid Build Coastguard Worker   if (we       use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
216*f6dc9357SAndroid Build Coastguard Worker   if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
217*f6dc9357SAndroid Build Coastguard Worker 
218*f6dc9357SAndroid Build Coastguard Worker 
219*f6dc9357SAndroid Build Coastguard Worker Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
220*f6dc9357SAndroid Build Coastguard Worker   // that flag affects the code only if (wchar_t is 32-bit)
221*f6dc9357SAndroid Build Coastguard Worker   // that mode with high-escape can be disabled now in UTFConvert.cpp
222*f6dc9357SAndroid Build Coastguard Worker   if (flag is NOT set)
223*f6dc9357SAndroid Build Coastguard Worker      it doesn't extract raw 8-bit symbol from High-Escape-Plane
224*f6dc9357SAndroid Build Coastguard Worker   if (flag is set)
225*f6dc9357SAndroid Build Coastguard Worker      it        extracts raw 8-bit symbol from High-Escape-Plane
226*f6dc9357SAndroid Build Coastguard Worker 
227*f6dc9357SAndroid Build Coastguard Worker Main use cases:
228*f6dc9357SAndroid Build Coastguard Worker 
229*f6dc9357SAndroid Build Coastguard Worker WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
230*f6dc9357SAndroid Build Coastguard Worker    {
231*f6dc9357SAndroid Build Coastguard Worker      Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
232*f6dc9357SAndroid Build Coastguard Worker      Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR.
233*f6dc9357SAndroid Build Coastguard Worker      So we restore original UTF-16-RAW.
234*f6dc9357SAndroid Build Coastguard Worker    }
235*f6dc9357SAndroid Build Coastguard Worker 
Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
     set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
     set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16.
     Note: high escape mode can be ignored now in UTFConvert.cpp
240*f6dc9357SAndroid Build Coastguard Worker 
241*f6dc9357SAndroid Build Coastguard Worker macOS:
242*f6dc9357SAndroid Build Coastguard Worker      the system doesn't support incorrect UTF-8 in file names.
243*f6dc9357SAndroid Build Coastguard Worker      set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
244*f6dc9357SAndroid Build Coastguard Worker */
245*f6dc9357SAndroid Build Coastguard Worker 
246*f6dc9357SAndroid Build Coastguard Worker extern unsigned g_Unicode_To_UTF8_Flags;
247*f6dc9357SAndroid Build Coastguard Worker 
248*f6dc9357SAndroid Build Coastguard Worker void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
249*f6dc9357SAndroid Build Coastguard Worker void ConvertUnicodeToUTF8(const UString &src, AString &dest);
250*f6dc9357SAndroid Build Coastguard Worker 
251*f6dc9357SAndroid Build Coastguard Worker void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
252*f6dc9357SAndroid Build Coastguard Worker 
253*f6dc9357SAndroid Build Coastguard Worker /*
254*f6dc9357SAndroid Build Coastguard Worker #ifndef _WIN32
255*f6dc9357SAndroid Build Coastguard Worker void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
256*f6dc9357SAndroid Build Coastguard Worker void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
257*f6dc9357SAndroid Build Coastguard Worker bool UTF32_IsThere_BigPoint(const UString &src);
258*f6dc9357SAndroid Build Coastguard Worker bool Unicode_IsThere_BmpEscape(const UString &src);
259*f6dc9357SAndroid Build Coastguard Worker #endif
260*f6dc9357SAndroid Build Coastguard Worker 
261*f6dc9357SAndroid Build Coastguard Worker bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
262*f6dc9357SAndroid Build Coastguard Worker */
263*f6dc9357SAndroid Build Coastguard Worker 
264*f6dc9357SAndroid Build Coastguard Worker #ifdef Z7_WCHART_IS_16BIT
265*f6dc9357SAndroid Build Coastguard Worker #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
266*f6dc9357SAndroid Build Coastguard Worker #else
267*f6dc9357SAndroid Build Coastguard Worker void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
268*f6dc9357SAndroid Build Coastguard Worker #endif
269*f6dc9357SAndroid Build Coastguard Worker 
270*f6dc9357SAndroid Build Coastguard Worker /*
271*f6dc9357SAndroid Build Coastguard Worker // #include "../../C/CpuArch.h"
272*f6dc9357SAndroid Build Coastguard Worker 
273*f6dc9357SAndroid Build Coastguard Worker // ---------- Utf16 Little endian functions ----------
274*f6dc9357SAndroid Build Coastguard Worker 
275*f6dc9357SAndroid Build Coastguard Worker // We store 16-bit surrogates even in 32-bit WCHARs in Linux.
276*f6dc9357SAndroid Build Coastguard Worker // So now we don't use the following code:
277*f6dc9357SAndroid Build Coastguard Worker 
278*f6dc9357SAndroid Build Coastguard Worker #if WCHAR_MAX > 0xffff
279*f6dc9357SAndroid Build Coastguard Worker 
280*f6dc9357SAndroid Build Coastguard Worker // void *p     : pointer to src bytes stream
281*f6dc9357SAndroid Build Coastguard Worker // size_t len  : num Utf16 characters : it can include or not include NULL character
282*f6dc9357SAndroid Build Coastguard Worker 
283*f6dc9357SAndroid Build Coastguard Worker inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
284*f6dc9357SAndroid Build Coastguard Worker {
285*f6dc9357SAndroid Build Coastguard Worker   #if WCHAR_MAX > 0xffff
286*f6dc9357SAndroid Build Coastguard Worker   size_t num_wchars = 0;
287*f6dc9357SAndroid Build Coastguard Worker   for (size_t i = 0; i < len; i++)
288*f6dc9357SAndroid Build Coastguard Worker   {
289*f6dc9357SAndroid Build Coastguard Worker     wchar_t c = GetUi16(p);
290*f6dc9357SAndroid Build Coastguard Worker     p = (const void *)((const Byte *)p + 2);
291*f6dc9357SAndroid Build Coastguard Worker     if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
292*f6dc9357SAndroid Build Coastguard Worker     {
293*f6dc9357SAndroid Build Coastguard Worker       wchar_t c2 = GetUi16(p);
294*f6dc9357SAndroid Build Coastguard Worker       if (c2 >= 0xdc00 && c2 < 0xe000)
295*f6dc9357SAndroid Build Coastguard Worker       {
296*f6dc9357SAndroid Build Coastguard Worker         c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
297*f6dc9357SAndroid Build Coastguard Worker         p = (const void *)((const Byte *)p + 2);
298*f6dc9357SAndroid Build Coastguard Worker         i++;
299*f6dc9357SAndroid Build Coastguard Worker       }
300*f6dc9357SAndroid Build Coastguard Worker     }
301*f6dc9357SAndroid Build Coastguard Worker     num_wchars++;
302*f6dc9357SAndroid Build Coastguard Worker   }
303*f6dc9357SAndroid Build Coastguard Worker   return num_wchars;
304*f6dc9357SAndroid Build Coastguard Worker   #else
305*f6dc9357SAndroid Build Coastguard Worker   UNUSED_VAR(p)
306*f6dc9357SAndroid Build Coastguard Worker   return len;
307*f6dc9357SAndroid Build Coastguard Worker   #endif
308*f6dc9357SAndroid Build Coastguard Worker }
309*f6dc9357SAndroid Build Coastguard Worker 
310*f6dc9357SAndroid Build Coastguard Worker // #include <stdio.h>
311*f6dc9357SAndroid Build Coastguard Worker 
312*f6dc9357SAndroid Build Coastguard Worker inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
313*f6dc9357SAndroid Build Coastguard Worker {
314*f6dc9357SAndroid Build Coastguard Worker   for (size_t i = 0; i < len; i++)
315*f6dc9357SAndroid Build Coastguard Worker   {
316*f6dc9357SAndroid Build Coastguard Worker     wchar_t c = GetUi16(p);
317*f6dc9357SAndroid Build Coastguard Worker     p = (const void *)((const Byte *)p + 2);
318*f6dc9357SAndroid Build Coastguard Worker 
319*f6dc9357SAndroid Build Coastguard Worker     #if WCHAR_PATH_SEPARATOR != L'/'
320*f6dc9357SAndroid Build Coastguard Worker     if (c == L'/')
321*f6dc9357SAndroid Build Coastguard Worker       c = WCHAR_PATH_SEPARATOR;
322*f6dc9357SAndroid Build Coastguard Worker     #endif
323*f6dc9357SAndroid Build Coastguard Worker 
324*f6dc9357SAndroid Build Coastguard Worker     #if WCHAR_MAX > 0xffff
325*f6dc9357SAndroid Build Coastguard Worker 
326*f6dc9357SAndroid Build Coastguard Worker     if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
327*f6dc9357SAndroid Build Coastguard Worker     {
328*f6dc9357SAndroid Build Coastguard Worker       wchar_t c2 = GetUi16(p);
329*f6dc9357SAndroid Build Coastguard Worker       if (c2 >= 0xdc00 && c2 < 0xe000)
330*f6dc9357SAndroid Build Coastguard Worker       {
331*f6dc9357SAndroid Build Coastguard Worker         // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
332*f6dc9357SAndroid Build Coastguard Worker         c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
333*f6dc9357SAndroid Build Coastguard Worker         p = (const void *)((const Byte *)p + 2);
334*f6dc9357SAndroid Build Coastguard Worker         i++;
335*f6dc9357SAndroid Build Coastguard Worker         // printf("%4x\n", (int)c);
336*f6dc9357SAndroid Build Coastguard Worker       }
337*f6dc9357SAndroid Build Coastguard Worker     }
338*f6dc9357SAndroid Build Coastguard Worker 
339*f6dc9357SAndroid Build Coastguard Worker     #endif
340*f6dc9357SAndroid Build Coastguard Worker 
341*f6dc9357SAndroid Build Coastguard Worker     *dest++ = c;
342*f6dc9357SAndroid Build Coastguard Worker   }
343*f6dc9357SAndroid Build Coastguard Worker   return dest;
344*f6dc9357SAndroid Build Coastguard Worker }
345*f6dc9357SAndroid Build Coastguard Worker 
346*f6dc9357SAndroid Build Coastguard Worker 
347*f6dc9357SAndroid Build Coastguard Worker inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
348*f6dc9357SAndroid Build Coastguard Worker {
349*f6dc9357SAndroid Build Coastguard Worker   size_t num = 0;
350*f6dc9357SAndroid Build Coastguard Worker   for (;;)
351*f6dc9357SAndroid Build Coastguard Worker   {
352*f6dc9357SAndroid Build Coastguard Worker     wchar_t c = *p++;
353*f6dc9357SAndroid Build Coastguard Worker     if (c == 0)
354*f6dc9357SAndroid Build Coastguard Worker       return num;
355*f6dc9357SAndroid Build Coastguard Worker     num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
356*f6dc9357SAndroid Build Coastguard Worker   }
357*f6dc9357SAndroid Build Coastguard Worker   return num;
358*f6dc9357SAndroid Build Coastguard Worker }
359*f6dc9357SAndroid Build Coastguard Worker 
360*f6dc9357SAndroid Build Coastguard Worker inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
361*f6dc9357SAndroid Build Coastguard Worker {
362*f6dc9357SAndroid Build Coastguard Worker   for (;;)
363*f6dc9357SAndroid Build Coastguard Worker   {
364*f6dc9357SAndroid Build Coastguard Worker     wchar_t c = *p++;
365*f6dc9357SAndroid Build Coastguard Worker     if (c == 0)
366*f6dc9357SAndroid Build Coastguard Worker       return dest;
367*f6dc9357SAndroid Build Coastguard Worker     if (c >= 0x10000 && c < 0x110000)
368*f6dc9357SAndroid Build Coastguard Worker     {
369*f6dc9357SAndroid Build Coastguard Worker       SetUi16(dest    , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
370*f6dc9357SAndroid Build Coastguard Worker       SetUi16(dest + 2, (UInt16)(0xdc00 + ( c        & 0x3FF)));
371*f6dc9357SAndroid Build Coastguard Worker       dest += 4;
372*f6dc9357SAndroid Build Coastguard Worker     }
373*f6dc9357SAndroid Build Coastguard Worker     else
374*f6dc9357SAndroid Build Coastguard Worker     {
375*f6dc9357SAndroid Build Coastguard Worker       SetUi16(dest, c);
376*f6dc9357SAndroid Build Coastguard Worker       dest += 2;
377*f6dc9357SAndroid Build Coastguard Worker     }
378*f6dc9357SAndroid Build Coastguard Worker   }
379*f6dc9357SAndroid Build Coastguard Worker }
380*f6dc9357SAndroid Build Coastguard Worker 
381*f6dc9357SAndroid Build Coastguard Worker #endif
382*f6dc9357SAndroid Build Coastguard Worker */
383*f6dc9357SAndroid Build Coastguard Worker 
384*f6dc9357SAndroid Build Coastguard Worker #endif
385