1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008-2023 */
6
7 /* Compile thus:
8
9 gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 \
10 -fvisibility=hidden -o ucptest ucptest.c \
11 ../src/pcre2_ord2utf.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
12
13 Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
14 support in pcre2test.
15 */
16
17 /* This is a hacked-up program for testing the Unicode properties tables of
18 PCRE2. It can also be used for finding characters with certain properties. I
19 wrote it to help with debugging, and have added things that I found useful, in
20 a rather haphazard way. The code has never been seriously tidied or checked for
21 robustness, but it shouldn't now give compiler warnings.
22
23 There is only one option: "-s". If given, it applies only to the "findprop"
24 command. It causes the UTF-8 sequence of bytes that encode the character to be
25 output between angle brackets at the end of the line. On a UTF-8 terminal, this
26 will show the appropriate graphic for the code point.
27
28 If the command has arguments, they are concatenated into a buffer, separated by
29 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
30 digits, "findprop" is inserted at the start. The buffer is then processed as a
31 single line file, after which the program exits. If there are no arguments, the
32 program reads commands line by line on stdin and writes output to stdout. The
33 return code is always zero.
34
35 There are three commands:
36
37 The command "findprop" must be followed by a space-separated list of Unicode
38 code points as hex numbers, either without any prefix or starting with "U+", or
39 as individual UTF-8 characters preceded by '+'. For example:
40
41 findprop U+1234 5Abc +?
42
43 The output is one long line per character, listing Unicode properties that have
44 values, followed by its other case or cases if one or more exist, followed by
45 its Script Extension list if there is one. This list is in square brackets. A
46 second list in square brackets gives all the Boolean properties of the
47 character. The properties that come first are:
48
49 Bidi class e.g. NSM (most common is L)
50 General type e.g. Letter
51 Specific type e.g. Upper case letter
52 Script e.g. Medefaidrin
53 Grapheme break type e.g. Extend (most common is Other)
54
55 Script names and Boolean property names are all in lower case, with underscores
56 and hyphens removed, because that's how they are stored for "loose" matching.
57
58 The command "find" must be followed by a list of property types and their
59 values. The values are case-sensitive, except for bidi class. This finds
60 characters that have those properties. If multiple properties are listed, they
61 must all be matched. Currently supported:
62
63 script <name> The character must have this script property. Only one
64 such script may be given.
65 scriptx <name> This script must be in the character's Script Extension
66 property list. If this is used many times, all the given
67 scripts must be present.
68 type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
69 gbreak <name> The grapheme break property must match.
70 bidi <class> The character's bidi class must match.
71 bool <name> The character's Boolean property list must contain this
72 property.
73
74 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
75 Script Extensions and Boolean properties, there may be a mixture of positive
76 and negative requirements. All must be satisfied.
77
78 Sequences of two or more characters are shown as ranges, for example
79 U+0041..U+004A. No more than 100 lines are output. If there are more
80 characters, the list ends with ...
81
The command "list" must be followed by one or more of the property names
script, bool, type, gbreak or bidi. The defined values for each named property
are listed. */
84
85
86 #ifdef HAVE_CONFIG_H
87 #include "../src/config.h"
88 #endif
89
90 #ifndef SUPPORT_UNICODE
91 #error "Unicode support not enabled"
92 #endif
93
94 #include <ctype.h>
95 #include <stdio.h>
96 #include <stdlib.h>
97 #include <string.h>
98 #include "../src/pcre2_internal.h"
99 #include "../src/pcre2_ucp.h"
100
101 #ifdef HAVE_UNISTD_H
102 #include <unistd.h>
103 #endif
104
105 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
106 #if defined(SUPPORT_LIBREADLINE)
107 #include <readline/readline.h>
108 #include <readline/history.h>
109 #else
110 #if defined(HAVE_EDITLINE_READLINE_H)
111 #include <editline/readline.h>
112 #else
113 #include <readline/readline.h>
114 #ifdef RL_VERSION_MAJOR
115 #include <readline/history.h>
116 #endif
117 #endif
118 #endif
119 #endif
120
121
122 /* -------------------------------------------------------------------*/
123
/* Shorthand casts between the unsigned character pointers used for buffers in
this program and the plain char pointers expected by library functions. Each
one is written as a prefix to the expression it converts, for example
"CS name". */

#define CS (char *)
#define CCS (const char *)
#define CSS (char **)
#define US (unsigned char *)
#define CUS (const unsigned char *)
129
130 /* -------------------------------------------------------------------*/
131
/* Set TRUE by the "-s" option in main(). When set, print_prop() appends the
UTF-8 encoding of an individually requested character to its output line. */

static BOOL show_character = FALSE;
133
/* Character type names, held as pairs: abbreviation, then description. The
"find" command matches a value against the abbreviations and uses the number of
the matching pair as the type to compare with UCD_CHARTYPE(), so the order of
the pairs is significant. The "list" command prints both strings of each pair. */

static const unsigned char *type_names[] = {
  US"Cc", US"Control",
  US"Cf", US"Format",
  US"Cn", US"Unassigned",
  US"Co", US"Private use",
  US"Cs", US"Surrogate",
  US"Ll", US"Lower case letter",
  US"Lm", US"Modifier letter",
  US"Lo", US"Other letter",
  US"Lt", US"Title case letter",
  US"Lu", US"Upper case letter",
  US"Mc", US"Spacing mark",
  US"Me", US"Enclosing mark",
  US"Mn", US"Non-spacing mark",
  US"Nd", US"Decimal number",
  US"Nl", US"Letter number",
  US"No", US"Other number",
  US"Pc", US"Connector punctuation",
  US"Pd", US"Dash punctuation",
  US"Pe", US"Close punctuation",
  US"Pf", US"Final punctuation",
  US"Pi", US"Initial punctuation",
  US"Po", US"Other punctuation",
  US"Ps", US"Open punctuation",
  US"Sc", US"Currency symbol",
  US"Sk", US"Modifier symbol",
  US"Sm", US"Mathematical symbol",
  US"So", US"Other symbol",
  US"Zl", US"Line separator",
  US"Zp", US"Paragraph separator",
  US"Zs", US"Space separator"
};
166
/* Grapheme break property names, held as pairs: name, then an explanatory
description, which is an empty string when there is nothing to add. The "find"
command matches a value against the names and uses the number of the matching
pair as the value to compare with UCD_GRAPHBREAK(), so the order of the pairs
is significant. The "list" command shows a non-empty description in
parentheses after the name. */

static const unsigned char *gb_names[] = {
  US"CR", US"carriage return",
  US"LF", US"linefeed",
  US"Control", US"",
  US"Extend", US"",
  US"Prepend", US"",
  US"SpacingMark", US"",
  US"L", US"Hangul syllable type L",
  US"V", US"Hangul syllable type V",
  US"T", US"Hangul syllable type T",
  US"LV", US"Hangul syllable type LV",
  US"LVT", US"Hangul syllable type LVT",
  US"Regional_Indicator", US"",
  US"Other", US"",
  US"ZWJ", US"zero width joiner",
  US"Extended_Pictographic", US""
};
184
/* Bidi class names, held as pairs: abbreviation, then long name. The "find"
command compares a value caselessly against every entry, abbreviations and long
names alike, and uses the number of the pair containing the match as the class
to compare with UCD_BIDICLASS(), so the order of the pairs is significant. The
"list" command prints both strings of each pair. */

static const unsigned char *bd_names[] = {
  US"AL", US"ArabicLetter",
  US"AN", US"ArabicNumber",
  US"B", US"ParagraphSeparator",
  US"BN", US"BoundaryNeutral",
  US"CS", US"CommonSeparator",
  US"EN", US"EuropeanNumber",
  US"ES", US"EuropeanSeparator",
  US"ET", US"EuropeanTerminator",
  US"FSI", US"FirstStrongIsolate",
  US"L", US"LeftToRight",
  US"LRE", US"LeftToRightEmbedding",
  US"LRI", US"LeftToRightIsolate",
  US"LRO", US"LeftToRightOverride",
  US"NSM", US"NonspacingMark",
  US"ON", US"OtherNeutral",
  US"PDF", US"PopDirectionalFormat",
  US"PDI", US"PopDirectionalIsolate",
  US"R", US"RightToLeft",
  US"RLE", US"RightToLeftEmbedding",
  US"RLI", US"RightToLeftIsolate",
  US"RLO", US"RightToLeftOverride",
  US"S", US"SegmentSeparator",
  US"WS", US"WhiteSpace"
};
210
211
212 /*************************************************
213 * Test for interaction *
214 *************************************************/
215
216 static BOOL
is_stdin_tty(void)217 is_stdin_tty(void)
218 {
219 #if defined WIN32
220 return _isatty(_fileno(stdin));
221 #else
222 return isatty(fileno(stdin));
223 #endif
224 }
225
226
227 /*************************************************
228 * Get name from ucp ident *
229 *************************************************/
230
231 /* The utt table contains both full names and abbreviations. So search for both
232 and use the longer if two are found, unless the first one is only 3 characters
233 and we are looking for a script (some scripts have 3-character names). If this
234 were not just a test program it might be worth making some kind of reverse
235 index. */
236
237 static const char *
get_propname(int prop,int type)238 get_propname(int prop, int type)
239 {
240 size_t i, j, len;
241 size_t foundlist[2];
242 const char *yield;
243 int typex = (type == PT_SC)? PT_SCX : type;
244
245 j = 0;
246 for (i = 0; i < PRIV(utt_size); i++)
247 {
248 const ucp_type_table *u = PRIV(utt) + i;
249 if ((u->type == type || u->type == typex) && u->value == prop)
250 {
251 foundlist[j++] = i;
252 if (j >= 2) break;
253 }
254 }
255
256 if (j == 0) return "??";
257
258 yield = NULL;
259 len = 0;
260
261 for (i = 0; i < j; i++)
262 {
263 const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
264 size_t sl = strlen(s);
265
266 if (sl > len)
267 {
268 yield = s;
269 if (sl == 3 && type == PT_SC) break;
270 len = sl;
271 }
272 }
273
274 return yield;
275 }
276
277
278 /*************************************************
279 * Print Unicode property info for a char *
280 *************************************************/
281
282 static void
print_prop(unsigned int c,BOOL is_just_one)283 print_prop(unsigned int c, BOOL is_just_one)
284 {
285 unsigned int type = UCD_CATEGORY(c);
286 int fulltype = UCD_CHARTYPE(c);
287 int script = UCD_SCRIPT(c);
288 int scriptx = UCD_SCRIPTX(c);
289 int gbprop = UCD_GRAPHBREAK(c);
290 int bidi = UCD_BIDICLASS(c);
291 unsigned int othercase = UCD_OTHERCASE(c);
292 int caseset = UCD_CASESET(c);
293 int bprops = UCD_BPROPS(c);
294
295 const unsigned char *fulltypename = US"??";
296 const unsigned char *typename = US"??";
297 const unsigned char *graphbreak = US"??";
298 const unsigned char *bidiclass = US"??";
299 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
300
301 switch (type)
302 {
303 case ucp_C: typename = US"Control"; break;
304 case ucp_L: typename = US"Letter"; break;
305 case ucp_M: typename = US"Mark"; break;
306 case ucp_N: typename = US"Number"; break;
307 case ucp_P: typename = US"Punctuation"; break;
308 case ucp_S: typename = US"Symbol"; break;
309 case ucp_Z: typename = US"Separator"; break;
310 }
311
312 switch (fulltype)
313 {
314 case ucp_Cc: fulltypename = US"Control"; break;
315 case ucp_Cf: fulltypename = US"Format"; break;
316 case ucp_Cn: fulltypename = US"Unassigned"; break;
317 case ucp_Co: fulltypename = US"Private use"; break;
318 case ucp_Cs: fulltypename = US"Surrogate"; break;
319 case ucp_Ll: fulltypename = US"Lower case letter"; break;
320 case ucp_Lm: fulltypename = US"Modifier letter"; break;
321 case ucp_Lo: fulltypename = US"Other letter"; break;
322 case ucp_Lt: fulltypename = US"Title case letter"; break;
323 case ucp_Lu: fulltypename = US"Upper case letter"; break;
324 case ucp_Mc: fulltypename = US"Spacing mark"; break;
325 case ucp_Me: fulltypename = US"Enclosing mark"; break;
326 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
327 case ucp_Nd: fulltypename = US"Decimal number"; break;
328 case ucp_Nl: fulltypename = US"Letter number"; break;
329 case ucp_No: fulltypename = US"Other number"; break;
330 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
331 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
332 case ucp_Pe: fulltypename = US"Close punctuation"; break;
333 case ucp_Pf: fulltypename = US"Final punctuation"; break;
334 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
335 case ucp_Po: fulltypename = US"Other punctuation"; break;
336 case ucp_Ps: fulltypename = US"Open punctuation"; break;
337 case ucp_Sc: fulltypename = US"Currency symbol"; break;
338 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
339 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
340 case ucp_So: fulltypename = US"Other symbol"; break;
341 case ucp_Zl: fulltypename = US"Line separator"; break;
342 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
343 case ucp_Zs: fulltypename = US"Space separator"; break;
344 }
345
346 switch(gbprop)
347 {
348 case ucp_gbCR: graphbreak = US"CR"; break;
349 case ucp_gbLF: graphbreak = US"LF"; break;
350 case ucp_gbControl: graphbreak = US"Control"; break;
351 case ucp_gbExtend: graphbreak = US"Extend"; break;
352 case ucp_gbPrepend: graphbreak = US"Prepend"; break;
353 case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
354 case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
355 case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
356 case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
357 case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
358 case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
359 case ucp_gbRegional_Indicator:
360 graphbreak = US"Regional Indicator"; break;
361 case ucp_gbOther: graphbreak = US"Other"; break;
362 case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
363 case ucp_gbExtended_Pictographic:
364 graphbreak = US"Extended Pictographic"; break;
365 default: graphbreak = US"Unknown"; break;
366 }
367
368 switch(bidi)
369 {
370 case ucp_bidiAL: bidiclass = US"AL "; break;
371 case ucp_bidiFSI: bidiclass = US"FSI"; break;
372 case ucp_bidiL: bidiclass = US"L "; break;
373 case ucp_bidiLRE: bidiclass = US"LRE"; break;
374 case ucp_bidiLRI: bidiclass = US"LRI"; break;
375 case ucp_bidiLRO: bidiclass = US"LRO"; break;
376 case ucp_bidiPDF: bidiclass = US"PDF"; break;
377 case ucp_bidiPDI: bidiclass = US"PDI"; break;
378 case ucp_bidiR: bidiclass = US"R "; break;
379 case ucp_bidiRLE: bidiclass = US"RLE"; break;
380 case ucp_bidiRLI: bidiclass = US"RLI"; break;
381 case ucp_bidiRLO: bidiclass = US"RLO"; break;
382 case ucp_bidiAN: bidiclass = US"AN "; break;
383 case ucp_bidiB: bidiclass = US"B "; break;
384 case ucp_bidiBN: bidiclass = US"BN "; break;
385 case ucp_bidiCS: bidiclass = US"CS "; break;
386 case ucp_bidiEN: bidiclass = US"EN "; break;
387 case ucp_bidiES: bidiclass = US"ES "; break;
388 case ucp_bidiET: bidiclass = US"ET "; break;
389 case ucp_bidiNSM: bidiclass = US"NSM"; break;
390 case ucp_bidiON: bidiclass = US"ON "; break;
391 case ucp_bidiS: bidiclass = US"S "; break;
392 case ucp_bidiWS: bidiclass = US"WS "; break;
393 default: bidiclass = US"???"; break;
394 }
395
396 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
397 scriptname, graphbreak);
398
399 if (is_just_one && (othercase != c || caseset != 0))
400 {
401 if (othercase != c) printf(", U+%04X", othercase);
402 if (caseset != 0)
403 {
404 const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
405 while (*(++p) < NOTACHAR)
406 {
407 unsigned int d = *p;
408 if (d != othercase && d != c) printf(", U+%04X", d);
409 }
410 }
411 }
412
413 if (scriptx != 0)
414 {
415 const char *sep = "";
416 const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
417 printf(", [");
418 for (int i = 0; i < ucp_Unknown; i++)
419 if (MAPBIT(p, i) != 0)
420 {
421 printf("%s%s", sep, get_propname(i, PT_SC));
422 sep = ", ";
423 }
424 printf("]");
425 }
426
427 if (bprops != 0)
428 {
429 const char *sep = "";
430 const uint32_t *p = PRIV(ucd_boolprop_sets) + bprops;
431 printf(", [");
432 for (int i = 0; i < ucp_Bprop_Count; i++)
433 if (MAPBIT(p, i) != 0)
434 {
435 printf("%s%s", sep, get_propname(i, PT_BOOL));
436 sep = ", ";
437 }
438 printf("]");
439 }
440
441 if (show_character && is_just_one)
442 {
443 unsigned char buffer[8];
444 int len = (int)PRIV(ord2utf_8)(c, buffer);
445 printf(", >%.*s<", len, buffer);
446 }
447
448 printf("\n");
449 }
450
451
452
453 /*************************************************
454 * Find character(s) with given property/ies *
455 *************************************************/
456
457 static void
find_chars(unsigned char * s)458 find_chars(unsigned char *s)
459 {
460 unsigned char name[128];
461 unsigned char value[128];
462 unsigned char *t;
463 unsigned int count= 0;
464 int scriptx_list[128];
465 unsigned int scriptx_count = 0;
466 int bprop_list[128];
467 unsigned int bprop_count = 0;
468 uint32_t i, c;
469 int script = -1;
470 int type = -1;
471 int gbreak = -1;
472 int bidiclass = -1;
473 BOOL script_not = FALSE;
474 BOOL type_not = FALSE;
475 BOOL gbreak_not = FALSE;
476 BOOL bidiclass_not = FALSE;
477 BOOL hadrange = FALSE;
478 const ucd_record *ucd, *next_ucd;
479 const char *pad = " ";
480
481 while (*s != 0)
482 {
483 unsigned int offset = 0;
484
485 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
486 *t = 0;
487 while (isspace(*s)) s++;
488
489 for (t = value; *s != 0 && !isspace(*s); s++)
490 {
491 if (*s != '_' && *s != '-') *t++ = *s;
492 }
493 *t = 0;
494 while (isspace(*s)) s++;
495
496 if (strcmp(CS name, "script") == 0 ||
497 strcmp(CS name, "scriptx") == 0)
498 {
499 BOOL x = (name[6] == 'x');
500 BOOL scriptx_not = FALSE;
501 for (t = value; *t != 0; t++) *t = tolower(*t);
502
503 if (value[0] == '!')
504 {
505 if (x) scriptx_not = TRUE; else script_not = TRUE;
506 offset = 1;
507 }
508
509 for (i = 0; i < PRIV(utt_size); i++)
510 {
511 const ucp_type_table *u = PRIV(utt) + i;
512 if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
513 PRIV(utt_names) + u->name_offset) == 0)
514 {
515 c = u->value;
516 if (x && !scriptx_not && u->type == PT_SC)
517 {
518 if (script < 0)
519 {
520 x = FALSE;
521 script = -1;
522 script_not = scriptx_not;
523 }
524 else if (!script_not)
525 {
526 printf("No characters found\n");
527 return;
528 }
529 }
530 if (x)
531 {
532 scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
533 }
534 else
535 {
536 if (script < 0) script = c; else
537 {
538 printf("** Only 1 script value allowed\n");
539 return;
540 }
541 }
542 break;
543 }
544 }
545
546 if (i >= PRIV(utt_size))
547 {
548 printf("** Unrecognized script name \"%s\"\n", value);
549 return;
550 }
551 }
552
553 else if (strcmp(CS name, "bool") == 0)
554 {
555 int not = 1;
556 if (value[0] == '!')
557 {
558 not = -1;
559 offset = 1;
560 }
561
562 for (i = 0; i < PRIV(utt_size); i++)
563 {
564 const ucp_type_table *u = PRIV(utt) + i;
565 if (u->type == PT_BOOL && strcmp(CS(value + offset),
566 PRIV(utt_names) + u->name_offset) == 0)
567 {
568 bprop_list[bprop_count++] = u->value * not;
569 break;
570 }
571 }
572
573 if (i >= PRIV(utt_size))
574 {
575 printf("** Unrecognized property name \"%s\"\n", value);
576 return;
577 }
578 }
579
580 else if (strcmp(CS name, "type") == 0)
581 {
582 if (type >= 0)
583 {
584 printf("** Only 1 type value allowed\n");
585 return;
586 }
587 else
588 {
589 if (value[0] == '!')
590 {
591 type_not = TRUE;
592 offset = 1;
593 }
594
595 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
596 {
597 if (strcmp(CS (value + offset), CCS type_names[i]) == 0)
598 {
599 type = i/2;
600 break;
601 }
602 }
603 if (i >= sizeof(type_names)/sizeof(char *))
604 {
605 printf("** Unrecognized type name \"%s\"\n", value);
606 return;
607 }
608 }
609 }
610
611 else if (strcmp(CS name, "gbreak") == 0)
612 {
613 if (gbreak >= 0)
614 {
615 printf("** Only 1 grapheme break value allowed\n");
616 return;
617 }
618 else
619 {
620 if (value[0] == '!')
621 {
622 gbreak_not = TRUE;
623 offset = 1;
624 }
625
626 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
627 {
628 if (strcmp(CS (value + offset), CCS gb_names[i]) == 0)
629 {
630 gbreak = i/2;
631 break;
632 }
633 }
634 if (i >= sizeof(gb_names)/sizeof(char *))
635 {
636 printf("** Unrecognized gbreak name \"%s\"\n", value);
637 return;
638 }
639 }
640 }
641
642 else if (strcmp(CS name, "bidi") == 0 ||
643 strcmp(CS name, "bidiclass") == 0 ||
644 strcmp(CS name, "bidi_class") == 0 )
645 {
646 if (bidiclass >= 0)
647 {
648 printf("** Only 1 bidi class value allowed\n");
649 return;
650 }
651 else
652 {
653 if (value[0] == '!')
654 {
655 bidiclass_not = TRUE;
656 offset = 1;
657 }
658 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i++)
659 {
660 if (strcasecmp(CS (value + offset), CCS bd_names[i]) == 0)
661 {
662 bidiclass = i/2;
663 break;
664 }
665 }
666 if (i >= sizeof(bd_names)/sizeof(char *))
667 {
668 printf("** Unrecognized bidi class name \"%s\"\n", value);
669 return;
670 }
671 }
672 }
673
674 else
675 {
676 printf("** Unrecognized property name \"%s\"\n", name);
677 return;
678 }
679 }
680
681 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
682 gbreak < 0 && bidiclass < 0)
683 {
684 printf("** No properties specified\n");
685 return;
686 }
687
688 for (c = 0; c <= 0x10ffff; c++)
689 {
690 if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
691
692 if (scriptx_count > 0)
693 {
694 const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
695 unsigned int found = 0;
696
697 for (i = 0; i < scriptx_count; i++)
698 {
699 int x = scriptx_list[i]/32;
700 int y = scriptx_list[i]%32;
701
702 /* Positive requirment */
703 if (scriptx_list[i] >= 0)
704 {
705 if (scriptx_list[i] == UCD_SCRIPT(c) ||
706 ((scriptx_list[i] < ucp_Unknown) &&
707 (bits_scriptx[x] & (1u<<y)) != 0)) found++;
708 }
709 /* Negative requirement */
710 else
711 {
712 if ((-(scriptx_list[i]) < ucp_Unknown) &&
713 (bits_scriptx[x] & (1u<<y)) == 0) found++;
714 }
715 }
716
717 if (found != scriptx_count) continue;
718 }
719
720 if (bprop_count > 0)
721 {
722 const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) + UCD_BPROPS(c);
723 unsigned int found = 0;
724
725 for (i = 0; i < bprop_count; i++)
726 {
727 int x = bprop_list[i]/32;
728 int y = bprop_list[i]%32;
729
730 /* Positive requirement */
731 if (bprop_list[i] >= 0)
732 {
733 if ((bits_bprop[x] & (1u<<y)) != 0) found++;
734 }
735 /* Negative requirement */
736 else
737 {
738 if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
739 }
740 }
741
742 if (found != bprop_count) continue;
743 }
744
745 if (type >= 0)
746 {
747 if (type_not)
748 {
749 if (type == UCD_CHARTYPE(c)) continue;
750 }
751 else
752 {
753 if (type != UCD_CHARTYPE(c)) continue;
754 }
755 }
756
757 if (gbreak >= 0)
758 {
759 if (gbreak_not)
760 {
761 if (gbreak == UCD_GRAPHBREAK(c)) continue;
762 }
763 else
764 {
765 if (gbreak != UCD_GRAPHBREAK(c)) continue;
766 }
767 }
768
769 if (bidiclass >= 0)
770 {
771 if (bidiclass_not)
772 {
773 if (bidiclass == UCD_BIDICLASS(c)) continue;
774 }
775 else
776 {
777 if (bidiclass != UCD_BIDICLASS(c)) continue;
778 }
779 }
780
781 /* All conditions are met. Look for runs. */
782
783 ucd = GET_UCD(c);
784
785 for (i = c + 1; i < 0x10ffff; i++)
786 {
787 next_ucd = GET_UCD(i);
788 if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
789 }
790
791 if (--i > c)
792 {
793 printf("U+%04X..", c);
794 c = i;
795 hadrange = TRUE;
796 }
797 else if (hadrange) printf("%s", pad);
798
799 print_prop(c, FALSE);
800 if (c >= 0x100000) pad = " ";
801 else if (c >= 0x10000) pad = " ";
802 count++;
803 if (count >= 100)
804 {
805 printf("...\n");
806 break;
807 }
808 }
809
810 if (count == 0) printf("No characters found\n");
811 }
812
813
814 /*************************************************
815 * Process command line *
816 *************************************************/
817
818 static void
process_command_line(unsigned char * buffer)819 process_command_line(unsigned char *buffer)
820 {
821 unsigned char *s, *t;
822 unsigned char name[24];
823
824 s = buffer;
825 while (isspace(*s)) s++;
826 if (*s == 0) return;
827
828 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
829 *t = 0;
830 while (isspace(*s)) s++;
831
832 if (strcmp(CS name, "findprop") == 0)
833 {
834 while (*s != 0)
835 {
836 unsigned int c;
837 unsigned char *endptr;
838 t = s;
839
840 if (*t == '+')
841 {
842 c = *(++t);
843 if (c > 0x7fu)
844 {
845 GETCHARINC(c, t);
846 endptr = t;
847 }
848 else endptr = t+1;
849 }
850 else
851 {
852 if (memcmp(t, "U+", 2) == 0) t += 2;
853 c = (uint32_t)strtoul(CS t, CSS(&endptr), 16);
854 }
855
856 if (*endptr != 0 && !isspace(*endptr))
857 {
858 while (*endptr != 0 && !isspace(*endptr)) endptr++;
859 printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
860 }
861 else
862 {
863 if (c > 0x10ffff)
864 printf("** U+%x is too big for a Unicode code point\n", c);
865 else
866 print_prop(c, TRUE);
867 }
868 s = endptr;
869 while (isspace(*s)) s++;
870 }
871 }
872
873 else if (strcmp(CS name, "find") == 0)
874 {
875 find_chars(s);
876 }
877
878 else if (strcmp(CS name, "list") == 0)
879 {
880 while (*s != 0)
881 {
882 size_t i;
883 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
884 *t = 0;
885 while (isspace(*s)) s++;
886
887 if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
888 {
889 for (i = 0; i < PRIV(utt_size); i++)
890 if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
891 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
892 }
893
894 else if (strcmp(CS name, "bool") == 0)
895 {
896 for (i = 0; i < PRIV(utt_size); i++)
897 if (PRIV(utt)[i].type == PT_BOOL)
898 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
899 }
900
901 else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
902 {
903 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
904 printf("%s %s\n", type_names[i], type_names[i+1]);
905 }
906
907 else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
908 {
909 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
910 {
911 if (gb_names[i+1][0] != 0)
912 printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
913 else
914 printf("%s\n", gb_names[i]);
915 }
916 }
917
918 else if (strcmp(CS name, "bidi") == 0 ||
919 strcmp(CS name, "bidiclasses") == 0)
920 {
921 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
922 printf("%3s %s\n", bd_names[i], bd_names[i+1]);
923 }
924
925 else
926 {
927 printf("** Unknown property \"%s\"\n", name);
928 break;
929 }
930 }
931 }
932
933 else printf("** Unknown test command \"%s\"\n", name);
934 }
935
936
937
938 /*************************************************
939 * Main program *
940 *************************************************/
941
942 int
main(int argc,char ** argv)943 main(int argc, char **argv)
944 {
945 BOOL interactive;
946 int first_arg = 1;
947 unsigned char buffer[1024];
948
949 if (argc > 1 && strcmp(argv[1], "-s") == 0)
950 {
951 show_character = TRUE;
952 first_arg++;
953 }
954
955 if (argc > first_arg)
956 {
957 int i;
958 BOOL datafirst = TRUE;
959 char *arg = argv[first_arg];
960 unsigned char *s = buffer;
961
962 if (*arg != '+' && memcmp(arg, "U+", 2) != 0 && !isdigit(*arg))
963 {
964 while (*arg != 0)
965 {
966 if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
967 }
968 }
969
970 if (datafirst)
971 {
972 strcpy(CS s, "findprop ");
973 s += 9;
974 }
975
976 for (i = first_arg; i < argc; i++)
977 {
978 s += sprintf(CS s, "%s ", argv[i]);
979 }
980
981 process_command_line(buffer);
982 return 0;
983 }
984
985 interactive = is_stdin_tty();
986
987 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
988 if (interactive) using_history();
989 #endif
990
991 for(;;)
992 {
993 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
994 if (interactive)
995 {
996 size_t len;
997 unsigned char *s = US readline("> ");
998 if (s == NULL) break;
999 len = strlen(CS s);
1000 if (len > 0) add_history(CS s);
1001 memcpy(buffer, s, len);
1002 buffer[len] = '\n';
1003 buffer[len+1] = 0;
1004 free(s);
1005 }
1006 else
1007 #endif
1008
1009 {
1010 if (interactive) printf("> ");
1011 if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1012 if (!interactive) printf("%s", buffer);
1013 }
1014
1015 process_command_line(buffer);
1016 }
1017
1018 if (interactive) printf("\n");
1019
1020 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1021 if (interactive) clear_history();
1022 #endif
1023
1024 return 0;
1025 }
1026
1027 /* End */
1028