1 // Common/StringConvert.cpp
2
3 #include "StdAfx.h"
4
5 #include "StringConvert.h"
6
7 #ifndef _WIN32
8 // #include <stdio.h>
9 #include <stdlib.h>
10 #endif
11
12 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
13 #include "UTFConvert.h"
14 #endif
15
16 #ifdef ENV_HAVE_LOCALE
17 #include <locale.h>
18 #endif
19
// Replacement character passed as defaultChar by the conversion wrappers in
// this file that do not take an explicit defaultChar argument.
static const char k_DefultChar = '_';
21
22 #ifdef _WIN32
23
24 /*
25 MultiByteToWideChar(CodePage, DWORD dwFlags,
26 LPCSTR lpMultiByteStr, int cbMultiByte,
27 LPWSTR lpWideCharStr, int cchWideChar)
28
29 if (cbMultiByte == 0)
30 return: 0. ERR: ERROR_INVALID_PARAMETER
31
32 if (cchWideChar == 0)
33 return: the required buffer size in characters.
34
35 if (supplied buffer size was not large enough)
36 return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
37 The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
38
39 If there are illegal characters:
40 if MB_ERR_INVALID_CHARS is set in dwFlags:
41 - the function stops conversion on illegal character.
42 - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
43
44 if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
45 before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
46 in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal
47 character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
48 */
49
50
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)51 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
52 {
53 dest.Empty();
54 if (src.IsEmpty())
55 return;
56 {
57 /*
58 wchar_t *d = dest.GetBuf(src.Len());
59 const char *s = (const char *)src;
60 unsigned i;
61
62 for (i = 0;;)
63 {
64 Byte c = (Byte)s[i];
65 if (c >= 0x80 || c == 0)
66 break;
67 d[i++] = (wchar_t)c;
68 }
69
70 if (i != src.Len())
71 {
72 unsigned len = MultiByteToWideChar(codePage, 0, s + i,
73 src.Len() - i, d + i,
74 src.Len() + 1 - i);
75 if (len == 0)
76 throw 282228;
77 i += len;
78 }
79
80 d[i] = 0;
81 dest.ReleaseBuf_SetLen(i);
82 */
83 unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
84 if (len == 0)
85 {
86 if (GetLastError() != 0)
87 throw 282228;
88 }
89 else
90 {
91 len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
92 if (len == 0)
93 throw 282228;
94 dest.ReleaseBuf_SetEnd(len);
95 }
96 }
97 }
98
99 /*
100 int WideCharToMultiByte(
101 UINT CodePage, DWORD dwFlags,
102 LPCWSTR lpWideCharStr, int cchWideChar,
103 LPSTR lpMultiByteStr, int cbMultiByte,
104 LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
105
106 if (lpDefaultChar == NULL),
107 - it uses system default value.
108
109 if (CodePage == CP_UTF7 || CodePage == CP_UTF8)
110 if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
111 return: 0. ERR: ERROR_INVALID_PARAMETER.
112
113 The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
114
115 */
116
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)117 static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
118 {
119 dest.Empty();
120 defaultCharWasUsed = false;
121 if (src.IsEmpty())
122 return;
123 {
124 /*
125 unsigned numRequiredBytes = src.Len() * 2;
126 char *d = dest.GetBuf(numRequiredBytes);
127 const wchar_t *s = (const wchar_t *)src;
128 unsigned i;
129
130 for (i = 0;;)
131 {
132 wchar_t c = s[i];
133 if (c >= 0x80 || c == 0)
134 break;
135 d[i++] = (char)c;
136 }
137
138 if (i != src.Len())
139 {
140 BOOL defUsed = FALSE;
141 defaultChar = defaultChar;
142
143 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
144 unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
145 d + i, numRequiredBytes + 1 - i,
146 (isUtf ? NULL : &defaultChar),
147 (isUtf ? NULL : &defUsed));
148 defaultCharWasUsed = (defUsed != FALSE);
149 if (len == 0)
150 throw 282229;
151 i += len;
152 }
153
154 d[i] = 0;
155 dest.ReleaseBuf_SetLen(i);
156 */
157
158 /*
159 if (codePage != CP_UTF7)
160 {
161 const wchar_t *s = (const wchar_t *)src;
162 unsigned i;
163 for (i = 0;; i++)
164 {
165 wchar_t c = s[i];
166 if (c >= 0x80 || c == 0)
167 break;
168 }
169
170 if (s[i] == 0)
171 {
172 char *d = dest.GetBuf(src.Len());
173 for (i = 0;;)
174 {
175 wchar_t c = s[i];
176 if (c == 0)
177 break;
178 d[i++] = (char)c;
179 }
180 d[i] = 0;
181 dest.ReleaseBuf_SetLen(i);
182 return;
183 }
184 }
185 */
186
187 unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
188 if (len == 0)
189 {
190 if (GetLastError() != 0)
191 throw 282228;
192 }
193 else
194 {
195 BOOL defUsed = FALSE;
196 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
197 // defaultChar = defaultChar;
198 len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
199 dest.GetBuf(len), (int)len,
200 (isUtf ? NULL : &defaultChar),
201 (isUtf ? NULL : &defUsed)
202 );
203 if (!isUtf)
204 defaultCharWasUsed = (defUsed != FALSE);
205 if (len == 0)
206 throw 282228;
207 dest.ReleaseBuf_SetEnd(len);
208 }
209 }
210 }
211
212 /*
213 #ifndef UNDER_CE
214 AString SystemStringToOemString(const CSysString &src)
215 {
216 AString dest;
217 const unsigned len = src.Len() * 2;
218 CharToOem(src, dest.GetBuf(len));
219 dest.ReleaseBuf_CalcLen(len);
220 return dest;
221 }
222 #endif
223 */
224
225 #else // _WIN32
226
227 // #include <stdio.h>
/*
  If wchar_t is 32-bit (#if WCHAR_MAX > 0xffff)
  and the utf-8 string contains a big unicode character (> 0xffff),
  then we still use a 16-bit surrogate pair in UString.
  It simplifies other code where utf-16 encoding is used.
  So we use the surrogate-conversion code only in this file.
*/
235
236 /*
237 mbstowcs() returns error if there is error in utf-8 stream,
238 mbstowcs() returns error if there is single surrogates point (d800-dfff) in utf-8 stream
239 */
240
241 /*
242 static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
243 {
244 dest.Empty();
245 if (src.IsEmpty())
246 return;
247
248 const size_t limit = ((size_t)src.Len() + 1) * 2;
249 wchar_t *d = dest.GetBuf((unsigned)limit);
250 const size_t len = mbstowcs(d, src, limit);
251 if (len != (size_t)-1)
252 {
253 dest.ReleaseBuf_SetEnd((unsigned)len);
254 return;
255 }
256 dest.ReleaseBuf_SetEnd(0);
257 }
258 */
259
// If true, the non-Windows MultiByteToUnicodeString2() / UnicodeStringToMultiByte2()
// in this file use the UTF-8 converters regardless of the requested codePage.
// The value is changed via Set_ForceToUTF8().
bool g_ForceToUTF8 = true; // false;
261
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)262 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
263 {
264 dest.Empty();
265 if (src.IsEmpty())
266 return;
267
268 if (codePage == CP_UTF8 || g_ForceToUTF8)
269 {
270 #if 1
271 ConvertUTF8ToUnicode(src, dest);
272 return;
273 #endif
274 }
275
276 const size_t limit = ((size_t)src.Len() + 1) * 2;
277 wchar_t *d = dest.GetBuf((unsigned)limit);
278 const size_t len = mbstowcs(d, src, limit);
279 if (len != (size_t)-1)
280 {
281 dest.ReleaseBuf_SetEnd((unsigned)len);
282
283 #if WCHAR_MAX > 0xffff
284 d = dest.GetBuf();
285 for (size_t i = 0;; i++)
286 {
287 wchar_t c = d[i];
288 // printf("\ni=%2d c = %4x\n", (unsigned)i, (unsigned)c);
289 if (c == 0)
290 break;
291 if (c >= 0x10000 && c < 0x110000)
292 {
293 UString tempString = d + i;
294 const wchar_t *t = tempString.Ptr();
295
296 for (;;)
297 {
298 wchar_t w = *t++;
299 // printf("\nchar=%x\n", w);
300 if (w == 0)
301 break;
302 if (i == limit)
303 break; // unexpected error
304 if (w >= 0x10000 && w < 0x110000)
305 {
306 #if 1
307 if (i + 1 == limit)
308 break; // unexpected error
309 w -= 0x10000;
310 d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff);
311 w = 0xdc00 + (w & 0x3ff);
312 #else
313 // w = '_'; // for debug
314 #endif
315 }
316 d[i++] = w;
317 }
318 dest.ReleaseBuf_SetEnd((unsigned)i);
319 break;
320 }
321 }
322
323 #endif
324
325 /*
326 printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr());
327 printf("char: ");
328 for (unsigned i = 0; i < src.Len(); i++)
329 printf (" %02x", (int)(Byte)src[i]);
330 printf("\n");
331 printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
332 printf("wchar_t: ");
333 for (unsigned i = 0; i < dest.Len(); i++)
334 {
335 printf (" %02x", (int)dest[i]);
336 }
337 printf("\n");
338 */
339
340 return;
341 }
342
343 /* if there is mbstowcs() error, we have two ways:
344
345 1) change 0x80+ characters to some character: '_'
346 in that case we lose data, but we have correct UString()
347 and that scheme can show errors to user in early stages,
348 when file converted back to mbs() cannot be found
349
350 2) transfer bad characters in some UTF-16 range.
351 it can be non-original Unicode character.
352 but later we still can restore original character.
353 */
354
355
356 // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr());
357 {
358 unsigned i;
359 const char *s = (const char *)src;
360 for (i = 0;;)
361 {
362 Byte c = (Byte)s[i];
363 if (c == 0)
364 break;
365 // we can use ascii compatibilty character '_'
366 // if (c > 0x7F) c = '_'; // we replace "bad: character
367 d[i++] = (wchar_t)c;
368 }
369 d[i] = 0;
370 dest.ReleaseBuf_SetLen(i);
371 }
372 }
373
UnicodeStringToMultiByte2_Native(AString & dest,const UString & src)374 static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
375 {
376 dest.Empty();
377 if (src.IsEmpty())
378 return;
379
380 const size_t limit = ((size_t)src.Len() + 1) * 6;
381 char *d = dest.GetBuf((unsigned)limit);
382
383 const size_t len = wcstombs(d, src, limit);
384
385 if (len != (size_t)-1)
386 {
387 dest.ReleaseBuf_SetEnd((unsigned)len);
388 return;
389 }
390 dest.ReleaseBuf_SetEnd(0);
391 }
392
393
UnicodeStringToMultiByte2(AString & dest,const UString & src2,UINT codePage,char defaultChar,bool & defaultCharWasUsed)394 static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
395 {
396 // if (codePage == 1234567) // for debug purposes
397 if (codePage == CP_UTF8 || g_ForceToUTF8)
398 {
399 #if 1
400 defaultCharWasUsed = false;
401 ConvertUnicodeToUTF8(src2, dest);
402 return;
403 #endif
404 }
405
406 UString src = src2;
407 #if WCHAR_MAX > 0xffff
408 {
409 src.Empty();
410 for (unsigned i = 0; i < src2.Len();)
411 {
412 wchar_t c = src2[i++];
413 if (c >= 0xd800 && c < 0xdc00 && i != src2.Len())
414 {
415 const wchar_t c2 = src2[i];
416 if (c2 >= 0xdc00 && c2 < 0xe000)
417 {
418 #if 1
419 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
420 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
421 // printf("%4x\n", (int)c);
422 i++;
423 #else
424 // c = '_'; // for debug
425 #endif
426 }
427 }
428 src += c;
429 }
430 }
431 #endif
432
433 dest.Empty();
434 defaultCharWasUsed = false;
435 if (src.IsEmpty())
436 return;
437
438 const size_t len = wcstombs(NULL, src, 0);
439
440 if (len != (size_t)-1)
441 {
442 const unsigned limit = ((unsigned)len);
443 if (limit == len)
444 {
445 char *d = dest.GetBuf(limit);
446
447 /*
448 {
449 printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
450 for (unsigned i = 0; i < src.Len(); i++)
451 printf (" %02x", (int)src[i]);
452 printf("\n");
453 printf("\ndest Limit = %d \n", limit);
454 }
455 */
456
457 const size_t len2 = wcstombs(d, src, len + 1);
458
459 if (len2 != (size_t)-1 && len2 <= limit)
460 {
461 /*
462 printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
463 for (unsigned i = 0; i < len2; i++)
464 printf(" %02x", (int)(Byte)dest[i]);
465 printf("\n");
466 */
467 dest.ReleaseBuf_SetEnd((unsigned)len2);
468 return;
469 }
470 }
471 }
472
473 {
474 const wchar_t *s = (const wchar_t *)src;
475 char *d = dest.GetBuf(src.Len());
476
477 unsigned i;
478 for (i = 0;;)
479 {
480 wchar_t c = s[i];
481 if (c == 0)
482 break;
483 if (c >=
484 0x100
485 // 0x80
486 )
487 {
488 c = defaultChar;
489 defaultCharWasUsed = true;
490 }
491
492 d[i++] = (char)c;
493 }
494 d[i] = 0;
495 dest.ReleaseBuf_SetLen(i);
496 /*
497 printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
498 printf("ERROR: %s\n", dest.Ptr());
499 */
500 }
501 }
502
503 #endif // _WIN32
504
505
MultiByteToUnicodeString(const AString & src,UINT codePage)506 UString MultiByteToUnicodeString(const AString &src, UINT codePage)
507 {
508 UString dest;
509 MultiByteToUnicodeString2(dest, src, codePage);
510 return dest;
511 }
512
MultiByteToUnicodeString(const char * src,UINT codePage)513 UString MultiByteToUnicodeString(const char *src, UINT codePage)
514 {
515 return MultiByteToUnicodeString(AString(src), codePage);
516 }
517
518
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage)519 void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
520 {
521 bool defaultCharWasUsed;
522 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
523 }
524
UnicodeStringToMultiByte(const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)525 AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
526 {
527 AString dest;
528 UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
529 return dest;
530 }
531
UnicodeStringToMultiByte(const UString & src,UINT codePage)532 AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
533 {
534 AString dest;
535 bool defaultCharWasUsed;
536 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
537 return dest;
538 }
539
540
541
542
543 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
544
545 #ifdef _WIN32
546 #define U_to_A(a, b, c) UnicodeStringToMultiByte2
547 // #define A_to_U(a, b, c) MultiByteToUnicodeString2
548 #else
549 // void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
550 #define U_to_A(a, b, c) UnicodeStringToMultiByte2_Native(a, b)
551 // #define A_to_U(a, b, c) MultiByteToUnicodeString2_Native(a, b)
552 #endif
553
IsNativeUTF8()554 bool IsNativeUTF8()
555 {
556 UString u;
557 AString a, a2;
558 // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
559 for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
560 {
561 u.Empty();
562 u += (wchar_t)c;
563 /*
564 if (Unicode_Is_There_Utf16SurrogateError(u))
565 continue;
566 #ifndef _WIN32
567 if (Unicode_Is_There_BmpEscape(u))
568 continue;
569 #endif
570 */
571 ConvertUnicodeToUTF8(u, a);
572 U_to_A(a2, u, CP_OEMCP);
573 if (a != a2)
574 return false;
575 }
576 return true;
577 }
578
579 #endif
580
581
582 #ifdef ENV_HAVE_LOCALE
583
/*
  Returns the name of the current LC_CTYPE locale of the program.

  ENV_HAVE_LOCALE : the result of setlocale(LC_CTYPE, NULL), or "C" if that call returns NULL.
  LOCALE_IS_UTF8  : "utf8"
  otherwise       : "C"
*/
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
  // setlocale() with NULL locale only requests the current locale.
  // ubuntu returns "C" after program start
  const char * const current = setlocale(LC_CTYPE, NULL);
  return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
  return "utf8";
  #else
  return "C";
  #endif
}
606
607 #ifdef _WIN32
Set_ForceToUTF8(bool)608 static void Set_ForceToUTF8(bool) {}
609 #else
Set_ForceToUTF8(bool val)610 static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
611 #endif
612
Is_Default_Basic_Locale(const char * locale)613 static bool Is_Default_Basic_Locale(const char *locale)
614 {
615 const AString a (locale);
616 if (a.IsEqualTo_Ascii_NoCase("")
617 || a.IsEqualTo_Ascii_NoCase("C")
618 || a.IsEqualTo_Ascii_NoCase("POSIX"))
619 return true;
620 return false;
621 }
622
Is_Default_Basic_Locale()623 static bool Is_Default_Basic_Locale()
624 {
625 return Is_Default_Basic_Locale(GetLocale());
626 }
627
628
/*
  Sets the locale of the program and selects the conversion mode
  via Set_ForceToUTF8().

  Up to 3 setlocale(LC_ALL) attempts are made:
    attempt 0 : ""  (locale from the environment)
    attempt 1 : a fixed UTF-8 locale ("en_US.UTF-8" for __APPLE__, "C.UTF-8" otherwise)
    attempt 2 : ""  (locale from the environment again)

  After each attempt:
    - if IsNativeUTF8() is true, UTF-8 mode is forced and the function returns,
    - if the current locale is not a basic one ("", "C", "POSIX"), no more attempts are made.

  At the end UTF-8 mode is forced, if IsNativeUTF8() is true or
  the current locale is a basic one. Otherwise UTF-8 mode is switched off.

  Without ENV_HAVE_LOCALE the function does nothing.
*/
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  /* look also CFLocale:
     there is no C.UTF-8 in macos.
     macos has UTF-8 locale only with some language like en_US.UTF-8.
     "C.UTF-8" is the main UTF-8 locale in ubuntu.
     Other candidates that are not used here:
       ".utf8"      : supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
       "en_US.utf8" : setlocale() in ubuntu allows locales with minor character changes in strings */
  #ifdef __APPLE__
  const char * const utf8Locale = "en_US.UTF-8";
  #else
  const char * const utf8Locale = "C.UTF-8";
  #endif

  const unsigned kNumAttempts = 3;

  for (unsigned attempt = 0; attempt < kNumAttempts; attempt++)
  {
    /*
      man7: "If locale is an empty string, "", each part of the locale that
      should be modified is set according to the environment variables."
      for glibc, from the user's environment variables:
        1) the environment variable LC_ALL,
        2) the environment variable with the same name as the category,
        3) the environment variable LANG
      The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.

      for WIN32 : MSDN :
        Sets the locale to the default, which is the user-default
        ANSI code page obtained from the operating system.
    */
    const char * const newLocale = (attempt == 1) ? utf8Locale : "";

    // the returned value is not checked: the result is requested with GetLocale()
    setlocale(LC_ALL, newLocale);

    // request the current locale of the program
    const char * const locale = GetLocale();
    if (!locale)
      continue;

    // the conversion itself is tested, instead of a search for "utf" in the locale name
    if (IsNativeUTF8())
    {
      Set_ForceToUTF8(true);
      return;
    }

    // if there is some non-default and non-utf locale, we want to use it
    if (!Is_Default_Basic_Locale(locale))
      break; // comment it for debug
  }

  Set_ForceToUTF8(IsNativeUTF8() || Is_Default_Basic_Locale());

  #elif defined(LOCALE_IS_UTF8)
  // assume LC_CTYPE="utf8"
  #else
  // assume LC_CTYPE="C"
  #endif
}
762 #endif
763