1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <inttypes.h>
5 #include <sys/types.h>
6 #include <sys/stat.h>
7 #include <sys/time.h>
8 #include <fcntl.h>
9 #include <unistd.h>
10 
11 int utf8_to16_iconv(const unsigned char *buf8, size_t len8,
12         unsigned short *buf16, size_t *len16);
13 int utf8_to16_naive(const unsigned char *buf8, size_t len8,
14         unsigned short *buf16, size_t *len16);
15 
16 static struct ftab {
17     const char *name;
18     int (*func)(const unsigned char *buf8, size_t len8,
19             unsigned short *buf16, size_t *len16);
20 } ftab[] = {
21     {
22         .name = "iconv",
23         .func = utf8_to16_iconv,
24     }, {
25         .name = "naive",
26         .func = utf8_to16_naive,
27     },
28 };
29 
/*
 * Build a synthetic UTF-8 test buffer of `len` bytes.
 *
 * The buffer holds as many whole copies of the 4-byte sequence
 * F0 90 BF 80 as fit; the remaining 0..3 bytes are padded with 0x7F.
 * A zero or negative `len` yields an empty (but still valid, freeable)
 * buffer.
 *
 * Returns a malloc()ed buffer owned by the caller. Prints a message and
 * exits with status 1 if the allocation fails.
 */
static unsigned char *load_test_buf(int len)
{
    const char utf8[] = "\xF0\x90\xBF\x80";
    const size_t utf8_len = sizeof(utf8)/sizeof(utf8[0]) - 1;
    size_t remaining = len > 0 ? (size_t)len : 0;

    /* Never request 0 bytes: malloc(0) is allowed to return NULL */
    unsigned char *data = malloc(remaining ? remaining : 1);
    if (data == NULL) {
        printf("Failed to allocate test buffer!\n");
        exit(1);
    }

    unsigned char *p = data;

    /* Whole 4-byte sequences first */
    while (remaining >= utf8_len) {
        memcpy(p, utf8, utf8_len);
        p += utf8_len;
        remaining -= utf8_len;
    }

    /* Pad the tail with single-byte characters */
    while (remaining--)
        *p++ = 0x7F;

    return data;
}
49 
/*
 * Load the whole of ../UTF-8-demo.txt into memory.
 *
 * On return *len holds the file size in bytes and the returned pointer is
 * a malloc()ed buffer, owned by the caller, containing the file contents.
 *
 * Any failure (open, fstat, oversized file, allocation, read) prints a
 * message and exits with status 1.
 */
static unsigned char *load_test_file(int *len)
{
    unsigned char *data;
    int fd;
    struct stat st;

    fd = open("../UTF-8-demo.txt", O_RDONLY);
    if (fd == -1) {
        printf("Failed to open ../UTF-8-demo.txt!\n");
        exit(1);
    }
    if (fstat(fd, &st) == -1) {
        printf("Failed to get file size!\n");
        exit(1);
    }

    /* The size is reported through an int; reject files that do not fit */
    const int size = (int)st.st_size;
    if (size < 0 || (off_t)size != st.st_size) {
        printf("Failed to get file size!\n");
        exit(1);
    }

    /* Never request 0 bytes: malloc(0) is allowed to return NULL */
    data = malloc(size ? (size_t)size : 1);
    if (data == NULL) {
        printf("Failed to read file!\n");
        exit(1);
    }

    /* read() may legally return fewer bytes than requested; keep going */
    size_t got = 0;
    while (got < (size_t)size) {
        ssize_t n = read(fd, data + got, (size_t)size - got);
        if (n <= 0) {
            printf("Failed to read file!\n");
            exit(1);
        }
        got += (size_t)n;
    }

    close(fd);

    *len = size;
    return data;
}
77 
/*
 * Dump a test vector to stdout as ` [len=N] "\xHH\xHH..."` followed by a
 * newline, one \xHH escape (upper-case hex) per input byte.
 */
static void print_test(const unsigned char *data, int len)
{
    const unsigned char *cur = data;
    int remaining = len;

    printf(" [len=%d] \"", len);

    for (; remaining != 0; --remaining, ++cur)
        printf("\\x%02X", *cur);

    printf("\"\n");
}
86 
/* One test vector: a byte string and its length in bytes */
struct test {
    const unsigned char *data;
    int len;
};

/*
 * Fill the first 1024 bytes of `buf` by concatenating tokens from `pos`,
 * starting at index `pos_idx` and cycling back to index 0 after the last
 * of the `pos_len` entries. Tokens are never split: as soon as the next
 * token does not fit in the space left, the remainder is zero-filled.
 */
static void prepare_test_buf(unsigned char *buf, const struct test *pos,
                             int pos_len, int pos_idx)
{
    const int total = 1024;
    int filled = 0;

    for (; filled < total;
            pos_idx = (pos_idx + 1 == pos_len) ? 0 : pos_idx + 1) {
        const struct test *tok = &pos[pos_idx];
        const int room = total - filled;

        if (tok->len > room) {
            /* Next token would overrun: pad with NUL bytes and finish */
            memset(buf + filled, 0, room);
            filled = total;
            continue;
        }

        memcpy(buf + filled, tok->data, tok->len);
        filled += tok->len;
    }
}
112 
113 /* Return 0 on success, -1 on error */
test_manual(const struct ftab * ftab,unsigned short * buf16,unsigned short * _buf16)114 static int test_manual(const struct ftab *ftab, unsigned short *buf16,
115         unsigned short *_buf16)
116 {
117 #define LEN16   4096
118 
119 #pragma GCC diagnostic push
120 #pragma GCC diagnostic ignored "-Wpointer-sign"
121     /* positive tests */
122     static const struct test pos[] = {
123         {"", 0},
124         {"\x00", 1},
125         {"\x66", 1},
126         {"\x7F", 1},
127         {"\x00\x7F", 2},
128         {"\x7F\x00", 2},
129         {"\xC2\x80", 2},
130         {"\xDF\xBF", 2},
131         {"\xE0\xA0\x80", 3},
132         {"\xE0\xA0\xBF", 3},
133         {"\xED\x9F\x80", 3},
134         {"\xEF\x80\xBF", 3},
135         {"\xF0\x90\xBF\x80", 4},
136         {"\xF2\x81\xBE\x99", 4},
137         {"\xF4\x8F\x88\xAA", 4},
138     };
139 
140     /* negative tests */
141     static const struct test neg[] = {
142         {"\x80", 1},
143         {"\xBF", 1},
144         {"\xC0\x80", 2},
145         {"\xC1\x00", 2},
146         {"\xC2\x7F", 2},
147         {"\xDF\xC0", 2},
148         {"\xE0\x9F\x80", 3},
149         {"\xE0\xC2\x80", 3},
150         {"\xED\xA0\x80", 3},
151         {"\xED\x7F\x80", 3},
152         {"\xEF\x80\x00", 3},
153         {"\xF0\x8F\x80\x80", 4},
154         {"\xF0\xEE\x80\x80", 4},
155         {"\xF2\x90\x91\x7F", 4},
156         {"\xF4\x90\x88\xAA", 4},
157         {"\xF4\x00\xBF\xBF", 4},
158         {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \
159          "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
160          32},
161         {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00",
162          16},
163         {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
164          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80",
165          32},
166         {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
167          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1",
168          32},
169         {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
170          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
171          "\x80", 33},
172         {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
173          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
174          "\xC2\x80", 34},
175         {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
176          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \
177          "\x80\x80\x80", 35},
178     };
179 #pragma GCC diagnostic push
180 
181     size_t len16 = LEN16, _len16 = LEN16;
182     int ret, _ret;
183 
184     /* Test single token */
185     for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
186         ret = ftab->func(pos[i].data, pos[i].len, buf16, &len16);
187         _ret = utf8_to16_iconv(pos[i].data, pos[i].len, _buf16, &_len16);
188         if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
189             printf("FAILED positive test(%d:%d, %lu:%lu): ",
190                     ret, _ret, len16, _len16);
191             print_test(pos[i].data, pos[i].len);
192             return -1;
193         }
194         len16 = _len16 = LEN16;
195     }
196     for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
197         ret = ftab->func(neg[i].data, neg[i].len, buf16, &len16);
198         _ret = utf8_to16_iconv(neg[i].data, neg[i].len, _buf16, &_len16);
199         if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
200             printf("FAILED negitive test(%d:%d, %lu:%lu): ",
201                     ret, _ret, len16, _len16);
202             print_test(neg[i].data, neg[i].len);
203             return -1;
204         }
205         len16 = _len16 = LEN16;
206     }
207 
208     /* Test shifted buffer to cover 1k length */
209     /* buffer size must be greater than 1024 + 16 + max(test string length) */
210     const int max_size = 1024*2;
211     uint64_t buf64[max_size/8 + 2];
212     /* Offset 8 bytes by 1 byte */
213     unsigned char *buf = ((unsigned char *)buf64) + 1;
214     int buf_len;
215 
216     for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
217         /* Positive test: shift 16 bytes, validate each shift */
218         prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), i);
219         buf_len = 1024;
220         for (int j = 0; j < 16; ++j) {
221             ret = ftab->func(buf, buf_len, buf16, &len16);
222             _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
223             if (ret != _ret || len16 != _len16 || \
224                     memcmp(buf16, _buf16, len16)) {
225                 printf("FAILED positive test(%d:%d, %lu:%lu): ",
226                         ret, _ret, len16, _len16);
227                 print_test(buf, buf_len);
228                 return -1;
229             }
230             len16 = _len16 = LEN16;
231             for (int k = buf_len; k >= 1; --k)
232                 buf[k] = buf[k-1];
233             buf[0] = '\x55';
234             ++buf_len;
235         }
236 
237         /* Negative test: trunk last non ascii */
238         while (buf_len >= 1 && buf[buf_len-1] <= 0x7F)
239             --buf_len;
240         if (buf_len) {
241             ret = ftab->func(buf, buf_len-1, buf16, &len16);
242             _ret = utf8_to16_iconv(buf, buf_len-1, _buf16, &_len16);
243             if (ret != _ret || len16 != _len16 || \
244                     memcmp(buf16, _buf16, len16)) {
245                 printf("FAILED negative test(%d:%d, %lu:%lu): ",
246                         ret, _ret, len16, _len16);
247                 print_test(buf, buf_len-1);
248                 return -1;
249             }
250             len16 = _len16 = LEN16;
251         }
252     }
253 
254     /* Negative test */
255     for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
256         /* Append one error token, shift 16 bytes, validate each shift */
257         int pos_idx = i % (sizeof(pos)/sizeof(pos[0]));
258         prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), pos_idx);
259         memcpy(buf+1024, neg[i].data, neg[i].len);
260         buf_len = 1024 + neg[i].len;
261         for (int j = 0; j < 16; ++j) {
262             ret = ftab->func(buf, buf_len, buf16, &len16);
263             _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
264             if (ret != _ret || len16 != _len16 || \
265                     memcmp(buf16, _buf16, len16)) {
266                 printf("FAILED negative test(%d:%d, %lu:%lu): ",
267                         ret, _ret, len16, _len16);
268                 print_test(buf, buf_len);
269                 return -1;
270             }
271             len16 = _len16 = LEN16;
272             for (int k = buf_len; k >= 1; --k)
273                 buf[k] = buf[k-1];
274             buf[0] = '\x66';
275             ++buf_len;
276         }
277     }
278 
279     return 0;
280 }
281 
test(const unsigned char * buf8,size_t len8,unsigned short * buf16,size_t len16,const struct ftab * ftab)282 static void test(const unsigned char *buf8, size_t len8,
283         unsigned short *buf16, size_t len16, const struct ftab *ftab)
284 {
285     /* Use iconv as the reference answer */
286     if (strcmp(ftab->name, "iconv") == 0)
287         return;
288 
289     printf("%s\n", ftab->name);
290 
291     /* Test file or buffer */
292     size_t _len16 = len16;
293     unsigned short *_buf16 = (unsigned short *)malloc(_len16);
294     if (utf8_to16_iconv(buf8, len8, _buf16, &_len16)) {
295         printf("Invalid test file or buffer!\n");
296         exit(1);
297     }
298     printf("standard test: ");
299     if (ftab->func(buf8, len8, buf16, &len16) || len16 != _len16 || \
300             memcmp(buf16, _buf16, len16) != 0)
301         printf("FAIL\n");
302     else
303         printf("pass\n");
304     free(_buf16);
305 
306     /* Manual cases */
307     unsigned short *mbuf8 = (unsigned short *)malloc(LEN16);
308     unsigned short *mbuf16 = (unsigned short *)malloc(LEN16);
309     printf("manual test: %s\n",
310             test_manual(ftab, mbuf8, mbuf16) ? "FAIL" : "pass");
311     free(mbuf8);
312     free(mbuf16);
313     printf("\n");
314 }
315 
bench(const unsigned char * buf8,size_t len8,unsigned short * buf16,size_t len16,const struct ftab * ftab)316 static void bench(const unsigned char *buf8, size_t len8,
317         unsigned short *buf16, size_t len16, const struct ftab *ftab)
318 {
319     const int loops = 1024*1024*1024/len8;
320     int ret = 0;
321     double time, size;
322     struct timeval tv1, tv2;
323 
324     fprintf(stderr, "bench %s... ", ftab->name);
325     gettimeofday(&tv1, 0);
326     for (int i = 0; i < loops; ++i)
327         ret |= ftab->func(buf8, len8, buf16, &len16);
328     gettimeofday(&tv2, 0);
329     printf("%s\n", ret?"FAIL":"pass");
330 
331     time = tv2.tv_usec - tv1.tv_usec;
332     time = time / 1000000 + tv2.tv_sec - tv1.tv_sec;
333     size = ((double)len8 * loops) / (1024*1024);
334     printf("time: %.4f s\n", time);
335     printf("data: %.0f MB\n", size);
336     printf("BW: %.2f MB/s\n", size / time);
337     printf("\n");
338 }
339 
usage(const char * bin)340 static void usage(const char *bin)
341 {
342     printf("Usage:\n");
343     printf("%s test  [alg]     ==> test all or one algorithm\n", bin);
344     printf("%s bench [alg]     ==> benchmark all or one algorithm\n", bin);
345     printf("%s bench size NUM  ==> benchmark with specific buffer size\n", bin);
346     printf("alg = ");
347     for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i)
348         printf("%s ", ftab[i].name);
349     printf("\nNUM = buffer size in bytes, 1 ~ 67108864(64M)\n");
350 }
351 
main(int argc,char * argv[])352 int main(int argc, char *argv[])
353 {
354     int len8 = 0, len16;
355     unsigned char *buf8;
356     unsigned short *buf16;
357     const char *alg = NULL;
358     void (*tb)(const unsigned char *buf8, size_t len8,
359            unsigned short *buf16, size_t len16, const struct ftab *ftab);
360 
361     tb = NULL;
362     if (argc >= 2) {
363         if (strcmp(argv[1], "test") == 0)
364             tb = test;
365         else if (strcmp(argv[1], "bench") == 0)
366             tb = bench;
367         if (argc >= 3) {
368             alg = argv[2];
369             if (strcmp(alg, "size") == 0) {
370                 if (argc < 4) {
371                     tb = NULL;
372                 } else {
373                     alg = NULL;
374                     len8 = atoi(argv[3]);
375                     if (len8 <= 0 || len8 > 67108864) {
376                         printf("Buffer size error!\n\n");
377                         tb = NULL;
378                     }
379                 }
380             }
381         }
382     }
383 
384     if (tb == NULL) {
385         usage(argv[0]);
386         return 1;
387     }
388 
389     /* Load UTF8 test buffer */
390     if (len8)
391         buf8 = load_test_buf(len8);
392     else
393         buf8 = load_test_file(&len8);
394 
395     /* Prepare UTF16 buffer large enough */
396     len16 = len8 * 2;
397     buf16 = (unsigned short *)malloc(len16);
398 
399     if (tb == bench)
400         printf("============== Bench UTF8 (%d bytes) ==============\n", len8);
401     for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
402         if (alg && strcmp(alg, ftab[i].name) != 0)
403             continue;
404         tb((const unsigned char *)buf8, len8, buf16, len16, &ftab[i]);
405     }
406 
407 #if 0
408     if (tb == bench) {
409         printf("==================== Bench ASCII ====================\n");
410         /* Change test buffer to ascii */
411         for (int i = 0; i < len; i++)
412             data[i] &= 0x7F;
413 
414         for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
415             if (alg && strcmp(alg, ftab[i].name) != 0)
416                 continue;
417             tb((const unsigned char *)data, len, &ftab[i]);
418             printf("\n");
419         }
420     }
421 #endif
422 
423     return 0;
424 }
425