1 #include <stdio.h>
2 
3 /*
4  * UTF-8 to UTF-16
5  * Table from https://woboq.com/blog/utf-8-processing-using-simd.html
6  *
7  * +-------------------------------------+-------------------+
8  * | UTF-8                               | UTF-16LE (HI LO)  |
9  * +-------------------------------------+-------------------+
10  * | 0aaaaaaa                            | 00000000 0aaaaaaa |
11  * +-------------------------------------+-------------------+
12  * | 110bbbbb 10aaaaaa                   | 00000bbb bbaaaaaa |
13  * +-------------------------------------+-------------------+
14  * | 1110cccc 10bbbbbb 10aaaaaa          | ccccbbbb bbaaaaaa |
15  * +-------------------------------------+-------------------+
16  * | 11110ddd 10ddcccc 10bbbbbb 10aaaaaa | 110110uu uuccccbb |
17  * + uuuu = ddddd - 1                    | 110111bb bbaaaaaa |
18  * +-------------------------------------+-------------------+
19  */
20 
/*
 * Decode a UTF-8 byte string into UTF-16 code units.
 *
 * Parameters:
 * - buf8, len8: input utf-8 string and its length in bytes
 * - buf16: buffer to store decoded utf-16 string
 * - *len16: on entry - utf-16 buffer length in bytes
 *           on exit  - length in bytes of valid decoded utf-16 string
 *                      (also updated when an error is returned, so the
 *                      caller can tell how much output is usable)
 * Returns:
 *  -  0: success
 *  - >0: error position of input utf-8 string: 1-based index of the first
 *        byte of the offending sequence (invalid lead byte, bad trailing
 *        byte, truncated sequence, overlong form, surrogate code point or
 *        value above U+10FFFF)
 *  - -1: utf-16 buffer overflow
 *
 * Code units are stored as native unsigned short values, so LE/BE depends
 * on host.
 * The error position is returned as an int, so it cannot represent
 * positions larger than INT_MAX.
 */
int utf8_to16_naive(const unsigned char *buf8, size_t len8,
        unsigned short *buf16, size_t *len16)
{
    size_t pos = 0;                 /* offset of the sequence being decoded */
    size_t len16_left = *len16;     /* free space in buf16, in bytes */

    *len16 = 0;

    while (pos < len8) {
        const unsigned char *p = buf8 + pos;
        unsigned char b0 = p[0];
        unsigned int u;
        size_t clen;                /* total bytes in this utf-8 sequence */

        /* Output buffer full: every character needs at least one unit */
        if (len16_left < 2)
            return -1;

        if (b0 < 0x80) {
            /* 0aaaaaaa -> 00000000 0aaaaaaa */
            *buf16++ = b0;
            clen = 1;
        } else {
            /*
             * 0x80~0xBF is a stray trailing byte. 0xF8~0xFF can never start
             * a sequence; it must be rejected here because only the low
             * three bits of a 4-byte lead are kept below, which would
             * otherwise let e.g. F8 90 80 80 decode as U+10000.
             */
            if (b0 < 0xC0 || b0 > 0xF7)
                goto invalid;

            /* 110xxxxx: 2 bytes, 1110xxxx: 3 bytes, 11110xxx: 4 bytes */
            if (b0 < 0xE0)
                clen = 2;
            else if (b0 < 0xF0)
                clen = 3;
            else
                clen = 4;

            /* String too short for the announced sequence length */
            if (len8 - pos < clen)
                goto invalid;

            /* Trailing bytes must be within 0x80 ~ 0xBF */
            if ((p[1] & 0xC0) != 0x80)
                goto invalid;

            if (clen == 2) {
                u = ((b0 & 0x1Fu) << 6) | (p[1] & 0x3Fu);
                /* Overlong encoding of an ASCII character */
                if (u <= 0x7F)
                    goto invalid;
                *buf16++ = (unsigned short)u;
            } else {
                if ((p[2] & 0xC0) != 0x80)
                    goto invalid;

                if (clen == 3) {
                    u = ((b0 & 0x0Fu) << 12) | ((p[1] & 0x3Fu) << 6) |
                        (p[2] & 0x3Fu);
                    /* Overlong encoding, or a UTF-16 surrogate code point */
                    if (u <= 0x7FF || (u >= 0xD800 && u <= 0xDFFF))
                        goto invalid;
                    *buf16++ = (unsigned short)u;
                } else {
                    /* clen == 4: needs a surrogate pair (two units) */
                    if (len16_left < 4)
                        return -1;  /* Output buffer full */
                    if ((p[3] & 0xC0) != 0x80)
                        goto invalid;
                    u = ((b0 & 0x07u) << 18) | ((p[1] & 0x3Fu) << 12) |
                        ((p[2] & 0x3Fu) << 6) | (p[3] & 0x3Fu);
                    /* Overlong encoding, or beyond the Unicode range */
                    if (u <= 0xFFFF || u > 0x10FFFF)
                        goto invalid;
                    u -= 0x10000;
                    *buf16++ = (unsigned short)(((u >> 10) & 0x3FF) | 0xD800);
                    *buf16++ = (unsigned short)((u & 0x3FF) | 0xDC00);
                    /* Account for the extra (high surrogate) unit */
                    *len16 += 2;
                    len16_left -= 2;
                }
            }
        }

        pos += clen;
        *len16 += 2;
        len16_left -= 2;
    }

    return 0;

invalid:
    /* Report the 1-based position of the first byte of the bad sequence */
    return (int)(pos + 1);
}
134