1 #include <stdio.h>
2
3 /*
4 * UTF-8 to UTF-16
5 * Table from https://woboq.com/blog/utf-8-processing-using-simd.html
6 *
7 * +-------------------------------------+-------------------+
8 * | UTF-8 | UTF-16LE (HI LO) |
9 * +-------------------------------------+-------------------+
10 * | 0aaaaaaa | 00000000 0aaaaaaa |
11 * +-------------------------------------+-------------------+
12 * | 110bbbbb 10aaaaaa | 00000bbb bbaaaaaa |
13 * +-------------------------------------+-------------------+
14 * | 1110cccc 10bbbbbb 10aaaaaa | ccccbbbb bbaaaaaa |
15 * +-------------------------------------+-------------------+
16 * | 11110ddd 10ddcccc 10bbbbbb 10aaaaaa | 110110uu uuccccbb |
17 * + uuuu = ddddd - 1 | 110111bb bbaaaaaa |
18 * +-------------------------------------+-------------------+
19 */
20
/*
 * Decode a UTF-8 byte string into UTF-16 code units, one character at a time.
 *
 * Parameters:
 * - buf8, len8: input utf-8 string and its length in bytes
 * - buf16:      buffer to store decoded utf-16 string
 * - *len16:     on entry - utf-16 buffer length in bytes
 *               on exit  - length in bytes of valid decoded utf-16 string
 *                          (on failure: what was decoded before the error)
 * Returns:
 * - 0:  success
 * - >0: error position of input utf-8 string (1-based offset of the first
 *       byte of the offending sequence)
 * - -1: utf-16 buffer overflow
 * LE/BE depends on host
 *
 * Rejected as errors: stray continuation bytes (10xxxxxx) in lead position,
 * lead bytes 0xF8..0xFF, truncated sequences, trailing bytes outside
 * 0x80..0xBF, overlong encodings, surrogates U+D800..U+DFFF and code points
 * above U+10FFFF.
 *
 * The error position is counted in an int, so it cannot represent offsets
 * beyond INT_MAX.
 */
int utf8_to16_naive(const unsigned char *buf8, size_t len8,
                    unsigned short *buf16, size_t *len16)
{
    int err_pos = 1;
    size_t len16_left = *len16;

    *len16 = 0;

    while (len8) {
        unsigned char b0, b1, b2, b3;
        unsigned int u;
        size_t clen;

        /* Output buffer full */
        if (len16_left < 2) {
            return -1;
        }

        /* 1st byte */
        b0 = buf8[0];

        if ((b0 & 0x80) == 0) {
            /* 0aaaaaaa -> 00000000 0aaaaaaa */
            *buf16++ = b0;
            ++buf8;
            --len8;
            ++err_pos;
            *len16 += 2;
            len16_left -= 2;
            continue;
        }

        /*
         * 11111xxx is never a valid lead byte. It must be rejected here:
         * the length computation below only looks at the top nibble and
         * the 4-byte payload mask (0x07) would silently drop the extra
         * high bit, letting e.g. F9 80 80 80 decode as U+40000.
         */
        if (b0 >= 0xF8) {
            return err_pos;
        }

        /*
         * Number of trailing bytes, derived from the top nibble.
         * The arithmetic is unsigned on purpose: a 10xx nibble wraps to a
         * huge value and is caught by the length check that follows.
         */
        clen = (size_t)(b0 >> 4);   /* 10xx, 110x, 1110, 1111 */
        clen -= 12;                 /* -4~-1, 0/1, 2, 3 */
        clen += !clen;              /* -4~-1, 1, 2, 3 */

        /* String too short or invalid 1st byte (10xxxxxx) */
        if (len8 <= clen) {
            return err_pos;
        }

        /* Trailing bytes must be within 0x80 ~ 0xBF */
        b1 = buf8[1];
        if ((b1 & 0xC0) != 0x80) {
            return err_pos;
        }
        b1 &= 0x3F;

        /* From here on clen is the full sequence length (lead included) */
        ++clen;
        if (clen == 2) {
            /* 110bbbbb 10aaaaaa -> 00000bbb bbaaaaaa */
            u = b0 & 0x1F;
            u <<= 6;
            u |= b1;
            /* Overlong encoding of an ASCII character */
            if (u <= 0x7F) {
                return err_pos;
            }
            *buf16++ = (unsigned short)u;
        } else {
            b2 = buf8[2];
            if ((b2 & 0xC0) != 0x80) {
                return err_pos;
            }
            b2 &= 0x3F;
            if (clen == 3) {
                /* 1110cccc 10bbbbbb 10aaaaaa -> ccccbbbb bbaaaaaa */
                u = b0 & 0x0F;
                u <<= 6;
                u |= b1;
                u <<= 6;
                u |= b2;
                /* Overlong encoding, or a UTF-16 surrogate code point */
                if (u <= 0x7FF || (u >= 0xD800 && u <= 0xDFFF)) {
                    return err_pos;
                }
                *buf16++ = (unsigned short)u;
            } else {
                /* clen == 4: needs room for a surrogate pair */
                if (len16_left < 4) {
                    return -1; /* Output buffer full */
                }
                b3 = buf8[3];
                if ((b3 & 0xC0) != 0x80) {
                    return err_pos;
                }
                u = b0 & 0x07;
                u <<= 6;
                u |= b1;
                u <<= 6;
                u |= b2;
                u <<= 6;
                u |= (b3 & 0x3F);
                /* Overlong encoding, or beyond the Unicode range */
                if (u <= 0xFFFF || u > 0x10FFFF) {
                    return err_pos;
                }
                u -= 0x10000;
                *buf16++ = (unsigned short)(((u >> 10) & 0x3FF) | 0xD800);
                *buf16++ = (unsigned short)((u & 0x3FF) | 0xDC00);
                /* Account for the extra (second) code unit here */
                *len16 += 2;
                len16_left -= 2;
            }
        }

        buf8 += clen;
        len8 -= clen;
        err_pos += (int)clen;
        *len16 += 2;
        len16_left -= 2;
    }

    return 0;
}
134