1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <inttypes.h>
5 #include <sys/types.h>
6 #include <sys/stat.h>
7 #include <sys/time.h>
8 #include <fcntl.h>
9 #include <unistd.h>
10
/*
 * UTF-8 -> UTF-16 converters exercised by this driver.  They are only
 * declared here; their definitions live outside this section.
 *
 * Callers in this file treat a non-zero return value as failure.
 * NOTE(review): *len16 looks like an in/out value (capacity in, produced
 * length out) -- confirm against the implementations.
 */
int utf8_to16_iconv(const unsigned char *buf8, size_t len8,
                    unsigned short *buf16, size_t *len16);
int utf8_to16_naive(const unsigned char *buf8, size_t len8,
                    unsigned short *buf16, size_t *len16);

/* Registry of converters; entries are matched by name against argv. */
static struct ftab {
    const char *name;   /* algorithm name as given on the command line */
    int (*func)(const unsigned char *buf8, size_t len8,
                unsigned short *buf16, size_t *len16);
} ftab[] = {
    { "iconv", utf8_to16_iconv },
    { "naive", utf8_to16_naive },
};
29
/*
 * Build a synthetic UTF-8 buffer of exactly `len` bytes.
 *
 * The buffer is filled with repetitions of the 4-byte sequence
 * F0 90 BF 80; a tail shorter than 4 bytes is padded with 0x7F.
 *
 * len: requested size in bytes; values <= 0 yield an empty buffer.
 *
 * Returns a malloc()ed buffer (never NULL) that the caller must free().
 * Prints a message and exits with status 1 if the allocation fails.
 */
static unsigned char *load_test_buf(int len)
{
    static const char utf8[] = "\xF0\x90\xBF\x80";
    const int utf8_len = (int)(sizeof(utf8) / sizeof(utf8[0])) - 1;

    /* A negative count would otherwise drive the fill loops out of bounds */
    if (len < 0)
        len = 0;

    /* malloc(0) may legitimately return NULL, so always ask for >= 1 byte */
    unsigned char *data = malloc(len > 0 ? (size_t)len : 1);
    if (data == NULL) {
        printf("Failed to allocate %d bytes!\n", len);
        exit(1);
    }

    unsigned char *p = data;
    for (; len >= utf8_len; len -= utf8_len, p += utf8_len)
        memcpy(p, utf8, (size_t)utf8_len);

    /* Fewer than utf8_len bytes remain: pad them with ASCII 0x7F */
    memset(p, 0x7F, (size_t)len);

    return data;
}
49
/*
 * Read the whole of ../UTF-8-demo.txt into a freshly malloc()ed buffer.
 *
 * len: out parameter, receives the file size in bytes.
 *
 * Returns the buffer (never NULL); the caller must free() it.  On any
 * failure a message is printed and the process exits with status 1.
 */
static unsigned char *load_test_file(int *len)
{
    static const char path[] = "../UTF-8-demo.txt";
    struct stat st;

    int fd = open(path, O_RDONLY);
    if (fd == -1) {
        printf("Failed to open %s!\n", path);
        exit(1);
    }
    if (fstat(fd, &st) == -1) {
        printf("Failed to get file size!\n");
        close(fd);
        exit(1);
    }

    /* The size is reported through an int; refuse files that do not fit */
    int size = (int)st.st_size;
    if (size < 0 || (off_t)size != st.st_size) {
        printf("File too large!\n");
        close(fd);
        exit(1);
    }

    /* malloc(0) may return NULL, so always request at least one byte */
    unsigned char *data = malloc(size > 0 ? (size_t)size : 1);
    if (data == NULL) {
        printf("Failed to allocate %d bytes!\n", size);
        close(fd);
        exit(1);
    }

    /* read() may return fewer bytes than requested; loop until complete */
    int done = 0;
    while (done < size) {
        ssize_t n = read(fd, data + done, (size_t)(size - done));
        if (n <= 0) {
            printf("Failed to read file!\n");
            close(fd);
            exit(1);
        }
        done += (int)n;
    }

    close(fd);

    *len = size;
    return data;
}
77
/*
 * Dump a test input to stdout: its length followed by every byte written
 * as a \xHH escape inside double quotes, terminated by a newline.
 */
static void print_test(const unsigned char *data, int len)
{
    printf(" [len=%d] \"", len);

    for (int left = len; left != 0; --left) {
        printf("\\x%02X", *data);
        ++data;
    }

    fputs("\"\n", stdout);
}
86
/* One conversion test vector: a run of raw input bytes and its length. */
struct test {
    const unsigned char *data;  /* input bytes; length is given by len */
    int len;                    /* number of bytes at data */
};
91
prepare_test_buf(unsigned char * buf,const struct test * pos,int pos_len,int pos_idx)92 static void prepare_test_buf(unsigned char *buf, const struct test *pos,
93 int pos_len, int pos_idx)
94 {
95 /* Round concatenate correct tokens to 1024 bytes */
96 int buf_idx = 0;
97 while (buf_idx < 1024) {
98 int buf_len = 1024 - buf_idx;
99
100 if (buf_len >= pos[pos_idx].len) {
101 memcpy(buf+buf_idx, pos[pos_idx].data, pos[pos_idx].len);
102 buf_idx += pos[pos_idx].len;
103 } else {
104 memset(buf+buf_idx, 0, buf_len);
105 buf_idx += buf_len;
106 }
107
108 if (++pos_idx == pos_len)
109 pos_idx = 0;
110 }
111 }
112
113 /* Return 0 on success, -1 on error */
test_manual(const struct ftab * ftab,unsigned short * buf16,unsigned short * _buf16)114 static int test_manual(const struct ftab *ftab, unsigned short *buf16,
115 unsigned short *_buf16)
116 {
117 #define LEN16 4096
118
119 #pragma GCC diagnostic push
120 #pragma GCC diagnostic ignored "-Wpointer-sign"
121 /* positive tests */
122 static const struct test pos[] = {
123 {"", 0},
124 {"\x00", 1},
125 {"\x66", 1},
126 {"\x7F", 1},
127 {"\x00\x7F", 2},
128 {"\x7F\x00", 2},
129 {"\xC2\x80", 2},
130 {"\xDF\xBF", 2},
131 {"\xE0\xA0\x80", 3},
132 {"\xE0\xA0\xBF", 3},
133 {"\xED\x9F\x80", 3},
134 {"\xEF\x80\xBF", 3},
135 {"\xF0\x90\xBF\x80", 4},
136 {"\xF2\x81\xBE\x99", 4},
137 {"\xF4\x8F\x88\xAA", 4},
138 };
139
140 /* negative tests */
141 static const struct test neg[] = {
142 {"\x80", 1},
143 {"\xBF", 1},
144 {"\xC0\x80", 2},
145 {"\xC1\x00", 2},
146 {"\xC2\x7F", 2},
147 {"\xDF\xC0", 2},
148 {"\xE0\x9F\x80", 3},
149 {"\xE0\xC2\x80", 3},
150 {"\xED\xA0\x80", 3},
151 {"\xED\x7F\x80", 3},
152 {"\xEF\x80\x00", 3},
153 {"\xF0\x8F\x80\x80", 4},
154 {"\xF0\xEE\x80\x80", 4},
155 {"\xF2\x90\x91\x7F", 4},
156 {"\xF4\x90\x88\xAA", 4},
157 {"\xF4\x00\xBF\xBF", 4},
158 {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \
159 "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
160 32},
161 {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00",
162 16},
163 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
164 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80",
165 32},
166 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
167 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1",
168 32},
169 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
170 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
171 "\x80", 33},
172 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
173 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
174 "\xC2\x80", 34},
175 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
176 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \
177 "\x80\x80\x80", 35},
178 };
179 #pragma GCC diagnostic push
180
181 size_t len16 = LEN16, _len16 = LEN16;
182 int ret, _ret;
183
184 /* Test single token */
185 for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
186 ret = ftab->func(pos[i].data, pos[i].len, buf16, &len16);
187 _ret = utf8_to16_iconv(pos[i].data, pos[i].len, _buf16, &_len16);
188 if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
189 printf("FAILED positive test(%d:%d, %lu:%lu): ",
190 ret, _ret, len16, _len16);
191 print_test(pos[i].data, pos[i].len);
192 return -1;
193 }
194 len16 = _len16 = LEN16;
195 }
196 for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
197 ret = ftab->func(neg[i].data, neg[i].len, buf16, &len16);
198 _ret = utf8_to16_iconv(neg[i].data, neg[i].len, _buf16, &_len16);
199 if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
200 printf("FAILED negitive test(%d:%d, %lu:%lu): ",
201 ret, _ret, len16, _len16);
202 print_test(neg[i].data, neg[i].len);
203 return -1;
204 }
205 len16 = _len16 = LEN16;
206 }
207
208 /* Test shifted buffer to cover 1k length */
209 /* buffer size must be greater than 1024 + 16 + max(test string length) */
210 const int max_size = 1024*2;
211 uint64_t buf64[max_size/8 + 2];
212 /* Offset 8 bytes by 1 byte */
213 unsigned char *buf = ((unsigned char *)buf64) + 1;
214 int buf_len;
215
216 for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
217 /* Positive test: shift 16 bytes, validate each shift */
218 prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), i);
219 buf_len = 1024;
220 for (int j = 0; j < 16; ++j) {
221 ret = ftab->func(buf, buf_len, buf16, &len16);
222 _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
223 if (ret != _ret || len16 != _len16 || \
224 memcmp(buf16, _buf16, len16)) {
225 printf("FAILED positive test(%d:%d, %lu:%lu): ",
226 ret, _ret, len16, _len16);
227 print_test(buf, buf_len);
228 return -1;
229 }
230 len16 = _len16 = LEN16;
231 for (int k = buf_len; k >= 1; --k)
232 buf[k] = buf[k-1];
233 buf[0] = '\x55';
234 ++buf_len;
235 }
236
237 /* Negative test: trunk last non ascii */
238 while (buf_len >= 1 && buf[buf_len-1] <= 0x7F)
239 --buf_len;
240 if (buf_len) {
241 ret = ftab->func(buf, buf_len-1, buf16, &len16);
242 _ret = utf8_to16_iconv(buf, buf_len-1, _buf16, &_len16);
243 if (ret != _ret || len16 != _len16 || \
244 memcmp(buf16, _buf16, len16)) {
245 printf("FAILED negative test(%d:%d, %lu:%lu): ",
246 ret, _ret, len16, _len16);
247 print_test(buf, buf_len-1);
248 return -1;
249 }
250 len16 = _len16 = LEN16;
251 }
252 }
253
254 /* Negative test */
255 for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
256 /* Append one error token, shift 16 bytes, validate each shift */
257 int pos_idx = i % (sizeof(pos)/sizeof(pos[0]));
258 prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), pos_idx);
259 memcpy(buf+1024, neg[i].data, neg[i].len);
260 buf_len = 1024 + neg[i].len;
261 for (int j = 0; j < 16; ++j) {
262 ret = ftab->func(buf, buf_len, buf16, &len16);
263 _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
264 if (ret != _ret || len16 != _len16 || \
265 memcmp(buf16, _buf16, len16)) {
266 printf("FAILED negative test(%d:%d, %lu:%lu): ",
267 ret, _ret, len16, _len16);
268 print_test(buf, buf_len);
269 return -1;
270 }
271 len16 = _len16 = LEN16;
272 for (int k = buf_len; k >= 1; --k)
273 buf[k] = buf[k-1];
274 buf[0] = '\x66';
275 ++buf_len;
276 }
277 }
278
279 return 0;
280 }
281
test(const unsigned char * buf8,size_t len8,unsigned short * buf16,size_t len16,const struct ftab * ftab)282 static void test(const unsigned char *buf8, size_t len8,
283 unsigned short *buf16, size_t len16, const struct ftab *ftab)
284 {
285 /* Use iconv as the reference answer */
286 if (strcmp(ftab->name, "iconv") == 0)
287 return;
288
289 printf("%s\n", ftab->name);
290
291 /* Test file or buffer */
292 size_t _len16 = len16;
293 unsigned short *_buf16 = (unsigned short *)malloc(_len16);
294 if (utf8_to16_iconv(buf8, len8, _buf16, &_len16)) {
295 printf("Invalid test file or buffer!\n");
296 exit(1);
297 }
298 printf("standard test: ");
299 if (ftab->func(buf8, len8, buf16, &len16) || len16 != _len16 || \
300 memcmp(buf16, _buf16, len16) != 0)
301 printf("FAIL\n");
302 else
303 printf("pass\n");
304 free(_buf16);
305
306 /* Manual cases */
307 unsigned short *mbuf8 = (unsigned short *)malloc(LEN16);
308 unsigned short *mbuf16 = (unsigned short *)malloc(LEN16);
309 printf("manual test: %s\n",
310 test_manual(ftab, mbuf8, mbuf16) ? "FAIL" : "pass");
311 free(mbuf8);
312 free(mbuf16);
313 printf("\n");
314 }
315
bench(const unsigned char * buf8,size_t len8,unsigned short * buf16,size_t len16,const struct ftab * ftab)316 static void bench(const unsigned char *buf8, size_t len8,
317 unsigned short *buf16, size_t len16, const struct ftab *ftab)
318 {
319 const int loops = 1024*1024*1024/len8;
320 int ret = 0;
321 double time, size;
322 struct timeval tv1, tv2;
323
324 fprintf(stderr, "bench %s... ", ftab->name);
325 gettimeofday(&tv1, 0);
326 for (int i = 0; i < loops; ++i)
327 ret |= ftab->func(buf8, len8, buf16, &len16);
328 gettimeofday(&tv2, 0);
329 printf("%s\n", ret?"FAIL":"pass");
330
331 time = tv2.tv_usec - tv1.tv_usec;
332 time = time / 1000000 + tv2.tv_sec - tv1.tv_sec;
333 size = ((double)len8 * loops) / (1024*1024);
334 printf("time: %.4f s\n", time);
335 printf("data: %.0f MB\n", size);
336 printf("BW: %.2f MB/s\n", size / time);
337 printf("\n");
338 }
339
usage(const char * bin)340 static void usage(const char *bin)
341 {
342 printf("Usage:\n");
343 printf("%s test [alg] ==> test all or one algorithm\n", bin);
344 printf("%s bench [alg] ==> benchmark all or one algorithm\n", bin);
345 printf("%s bench size NUM ==> benchmark with specific buffer size\n", bin);
346 printf("alg = ");
347 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i)
348 printf("%s ", ftab[i].name);
349 printf("\nNUM = buffer size in bytes, 1 ~ 67108864(64M)\n");
350 }
351
main(int argc,char * argv[])352 int main(int argc, char *argv[])
353 {
354 int len8 = 0, len16;
355 unsigned char *buf8;
356 unsigned short *buf16;
357 const char *alg = NULL;
358 void (*tb)(const unsigned char *buf8, size_t len8,
359 unsigned short *buf16, size_t len16, const struct ftab *ftab);
360
361 tb = NULL;
362 if (argc >= 2) {
363 if (strcmp(argv[1], "test") == 0)
364 tb = test;
365 else if (strcmp(argv[1], "bench") == 0)
366 tb = bench;
367 if (argc >= 3) {
368 alg = argv[2];
369 if (strcmp(alg, "size") == 0) {
370 if (argc < 4) {
371 tb = NULL;
372 } else {
373 alg = NULL;
374 len8 = atoi(argv[3]);
375 if (len8 <= 0 || len8 > 67108864) {
376 printf("Buffer size error!\n\n");
377 tb = NULL;
378 }
379 }
380 }
381 }
382 }
383
384 if (tb == NULL) {
385 usage(argv[0]);
386 return 1;
387 }
388
389 /* Load UTF8 test buffer */
390 if (len8)
391 buf8 = load_test_buf(len8);
392 else
393 buf8 = load_test_file(&len8);
394
395 /* Prepare UTF16 buffer large enough */
396 len16 = len8 * 2;
397 buf16 = (unsigned short *)malloc(len16);
398
399 if (tb == bench)
400 printf("============== Bench UTF8 (%d bytes) ==============\n", len8);
401 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
402 if (alg && strcmp(alg, ftab[i].name) != 0)
403 continue;
404 tb((const unsigned char *)buf8, len8, buf16, len16, &ftab[i]);
405 }
406
407 #if 0
408 if (tb == bench) {
409 printf("==================== Bench ASCII ====================\n");
410 /* Change test buffer to ascii */
411 for (int i = 0; i < len; i++)
412 data[i] &= 0x7F;
413
414 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
415 if (alg && strcmp(alg, ftab[i].name) != 0)
416 continue;
417 tb((const unsigned char *)data, len, &ftab[i]);
418 printf("\n");
419 }
420 }
421 #endif
422
423 return 0;
424 }
425