xref: /aosp_15_r20/external/pcre/maint/ucptest.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4 
5 /* Copyright (c) University of Cambridge 2008-2023 */
6 
7 /* Compile thus:
8 
9    gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 \
10      -fvisibility=hidden -o ucptest ucptest.c \
11      ../src/pcre2_ord2utf.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
12 
13    Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
14    support in pcre2test.
15 */
16 
17 /* This is a hacked-up program for testing the Unicode properties tables of
18 PCRE2. It can also be used for finding characters with certain properties. I
19 wrote it to help with debugging, and have added things that I found useful, in
20 a rather haphazard way. The code has never been seriously tidied or checked for
21 robustness, but it shouldn't now give compiler warnings.
22 
23 There is only one option: "-s". If given, it applies only to the "findprop"
24 command. It causes the UTF-8 sequence of bytes that encode the character to be
25 output between angle brackets at the end of the line. On a UTF-8 terminal, this
26 will show the appropriate graphic for the code point.
27 
28 If the command has arguments, they are concatenated into a buffer, separated by
29 spaces. If the first argument starts with "U+", "+" or a decimal digit, or
30 consists entirely of hexadecimal digits, "findprop" is inserted at the start.
31 The buffer is then processed as a single line file, after which the program
32 exits. If there are no arguments, the program reads commands line by line on
33 stdin and writes output to stdout. The return code is always zero.
34 
35 There are three commands:
36 
37 The command "findprop" must be followed by a space-separated list of Unicode
38 code points as hex numbers, either without any prefix or starting with "U+", or
39 as individual UTF-8 characters preceded by '+'. For example:
40 
41   findprop U+1234 5Abc +?
42 
43 The output is one long line per character, listing Unicode properties that have
44 values, followed by its other case or cases if one or more exist, followed by
45 its Script Extension list if there is one. This list is in square brackets. A
46 second list in square brackets gives all the Boolean properties of the
47 character. The properties that come first are:
48 
49   Bidi class          e.g. NSM (most common is L)
50   General type        e.g. Letter
51   Specific type       e.g. Upper case letter
52   Script              e.g. Medefaidrin
53   Grapheme break type e.g. Extend (most common is Other)
54 
55 Script names and Boolean property names are all in lower case, with underscores
56 and hyphens removed, because that's how they are stored for "loose" matching.
57 
58 The command "find" must be followed by a list of property types and their
59 values. Values are case-sensitive, except for script names and bidi classes.
60 This finds characters that have those properties. If multiple properties are
61 listed, they must all be matched. Currently supported:
62 
63   script <name>    The character must have this script property. Only one
64                      such script may be given.
65   scriptx <name>   This script must be in the character's Script Extension
66                      property list. If this is used many times, all the given
67                      scripts must be present.
68   type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
69   gbreak <name>    The grapheme break property must match.
70   bidi <class>     The character's bidi class must match.
71   bool <name>      The character's Boolean property list must contain this
72                      property.
73 
74 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
75 Script Extensions and Boolean properties, there may be a mixture of positive
76 and negative requirements. All must be satisfied.
77 
78 Sequences of two or more characters are shown as ranges, for example
79 U+0041..U+004A. No more than 100 lines are output. If there are more
80 characters, the list ends with ...
81 
82 The command "list" must be followed by one or more of the property names
83 script, bool, type, gbreak or bidi. The defined values for each are listed. */
84 
85 
86 #ifdef HAVE_CONFIG_H
87 #include "../src/config.h"
88 #endif
89 
90 #ifndef SUPPORT_UNICODE
91 #error "Unicode support not enabled"
92 #endif
93 
94 #include <ctype.h>
95 #include <stdio.h>
96 #include <stdlib.h>
97 #include <string.h>
98 #include "../src/pcre2_internal.h"
99 #include "../src/pcre2_ucp.h"
100 
101 #ifdef HAVE_UNISTD_H
102 #include <unistd.h>
103 #endif
104 
105 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
106 #if defined(SUPPORT_LIBREADLINE)
107 #include <readline/readline.h>
108 #include <readline/history.h>
109 #else
110 #if defined(HAVE_EDITLINE_READLINE_H)
111 #include <editline/readline.h>
112 #else
113 #include <readline/readline.h>
114 #ifdef RL_VERSION_MAJOR
115 #include <readline/history.h>
116 #endif
117 #endif
118 #endif
119 #endif
120 
121 
122 /* -------------------------------------------------------------------*/
123 
124 #define CS   (char *)
125 #define CCS  (const char *)
126 #define CSS  (char **)
127 #define US   (unsigned char *)
128 #define CUS  (const unsigned char *)
129 
130 /* -------------------------------------------------------------------*/
131 
132 static BOOL show_character = FALSE;
133 
134 static const unsigned char *type_names[] = {
135   US"Cc", US"Control",
136   US"Cf", US"Format",
137   US"Cn", US"Unassigned",
138   US"Co", US"Private use",
139   US"Cs", US"Surrogate",
140   US"Ll", US"Lower case letter",
141   US"Lm", US"Modifier letter",
142   US"Lo", US"Other letter",
143   US"Lt", US"Title case letter",
144   US"Lu", US"Upper case letter",
145   US"Mc", US"Spacing mark",
146   US"Me", US"Enclosing mark",
147   US"Mn", US"Non-spacing mark",
148   US"Nd", US"Decimal number",
149   US"Nl", US"Letter number",
150   US"No", US"Other number",
151   US"Pc", US"Connector punctuation",
152   US"Pd", US"Dash punctuation",
153   US"Pe", US"Close punctuation",
154   US"Pf", US"Final punctuation",
155   US"Pi", US"Initial punctuation",
156   US"Po", US"Other punctuation",
157   US"Ps", US"Open punctuation",
158   US"Sc", US"Currency symbol",
159   US"Sk", US"Modifier symbol",
160   US"Sm", US"Mathematical symbol",
161   US"So", US"Other symbol",
162   US"Zl", US"Line separator",
163   US"Zp", US"Paragraph separator",
164   US"Zs", US"Space separator"
165 };
166 
167 static const unsigned char *gb_names[] = {
168   US"CR",                    US"carriage return",
169   US"LF",                    US"linefeed",
170   US"Control",               US"",
171   US"Extend",                US"",
172   US"Prepend",               US"",
173   US"SpacingMark",           US"",
174   US"L",                     US"Hangul syllable type L",
175   US"V",                     US"Hangul syllable type V",
176   US"T",                     US"Hangul syllable type T",
177   US"LV",                    US"Hangul syllable type LV",
178   US"LVT",                   US"Hangul syllable type LVT",
179   US"Regional_Indicator",    US"",
180   US"Other",                 US"",
181   US"ZWJ",                   US"zero width joiner",
182   US"Extended_Pictographic", US""
183 };
184 
185 static const unsigned char *bd_names[] = {
186   US"AL",   US"ArabicLetter",
187   US"AN",   US"ArabicNumber",
188   US"B",    US"ParagraphSeparator",
189   US"BN",   US"BoundaryNeutral",
190   US"CS",   US"CommonSeparator",
191   US"EN",   US"EuropeanNumber",
192   US"ES",   US"EuropeanSeparator",
193   US"ET",   US"EuropeanTerminator",
194   US"FSI",  US"FirstStrongIsolate",
195   US"L",    US"LeftToRight",
196   US"LRE",  US"LeftToRightEmbedding",
197   US"LRI",  US"LeftToRightIsolate",
198   US"LRO",  US"LeftToRightOverride",
199   US"NSM",  US"NonspacingMark",
200   US"ON",   US"OtherNeutral",
201   US"PDF",  US"PopDirectionalFormat",
202   US"PDI",  US"PopDirectionalIsolate",
203   US"R",    US"RightToLeft",
204   US"RLE",  US"RightToLeftEmbedding",
205   US"RLI",  US"RightToLeftIsolate",
206   US"RLO",  US"RightToLeftOverride",
207   US"S",    US"SegmentSeparator",
208   US"WS",   US"WhiteSpace"
209 };
210 
211 
212 /*************************************************
213 *             Test for interaction               *
214 *************************************************/
215 
216 static BOOL
is_stdin_tty(void)217 is_stdin_tty(void)
218 {
219 #if defined WIN32
220 return _isatty(_fileno(stdin));
221 #else
222 return isatty(fileno(stdin));
223 #endif
224 }
225 
226 
227 /*************************************************
228 *            Get  name from ucp ident            *
229 *************************************************/
230 
231 /* The utt table contains both full names and abbreviations. So search for both
232 and use the longer if two are found, unless the first one is only 3 characters
233 and we are looking for a script (some scripts have 3-character names). If this
234 were not just a test program it might be worth making some kind of reverse
235 index. */
236 
237 static const char *
get_propname(int prop,int type)238 get_propname(int prop, int type)
239 {
240 size_t i, j, len;
241 size_t foundlist[2];
242 const char *yield;
243 int typex = (type == PT_SC)? PT_SCX : type;
244 
245 j = 0;
246 for (i = 0; i < PRIV(utt_size); i++)
247   {
248   const ucp_type_table *u = PRIV(utt) + i;
249   if ((u->type == type || u->type == typex) && u->value == prop)
250     {
251     foundlist[j++] = i;
252     if (j >= 2) break;
253     }
254   }
255 
256 if (j == 0) return "??";
257 
258 yield = NULL;
259 len = 0;
260 
261 for (i = 0; i < j; i++)
262   {
263   const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
264   size_t sl = strlen(s);
265 
266   if (sl > len)
267     {
268     yield = s;
269     if (sl == 3 && type == PT_SC) break;
270     len = sl;
271     }
272   }
273 
274 return yield;
275 }
276 
277 
278 /*************************************************
279 *      Print Unicode property info for a char    *
280 *************************************************/
281 
282 static void
print_prop(unsigned int c,BOOL is_just_one)283 print_prop(unsigned int c, BOOL is_just_one)
284 {
285 unsigned int type = UCD_CATEGORY(c);
286 int fulltype = UCD_CHARTYPE(c);
287 int script = UCD_SCRIPT(c);
288 int scriptx = UCD_SCRIPTX(c);
289 int gbprop = UCD_GRAPHBREAK(c);
290 int bidi = UCD_BIDICLASS(c);
291 unsigned int othercase = UCD_OTHERCASE(c);
292 int caseset = UCD_CASESET(c);
293 int bprops = UCD_BPROPS(c);
294 
295 const unsigned char *fulltypename = US"??";
296 const unsigned char *typename = US"??";
297 const unsigned char *graphbreak = US"??";
298 const unsigned char *bidiclass = US"??";
299 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
300 
301 switch (type)
302   {
303   case ucp_C: typename = US"Control"; break;
304   case ucp_L: typename = US"Letter"; break;
305   case ucp_M: typename = US"Mark"; break;
306   case ucp_N: typename = US"Number"; break;
307   case ucp_P: typename = US"Punctuation"; break;
308   case ucp_S: typename = US"Symbol"; break;
309   case ucp_Z: typename = US"Separator"; break;
310   }
311 
312 switch (fulltype)
313   {
314   case ucp_Cc: fulltypename = US"Control"; break;
315   case ucp_Cf: fulltypename = US"Format"; break;
316   case ucp_Cn: fulltypename = US"Unassigned"; break;
317   case ucp_Co: fulltypename = US"Private use"; break;
318   case ucp_Cs: fulltypename = US"Surrogate"; break;
319   case ucp_Ll: fulltypename = US"Lower case letter"; break;
320   case ucp_Lm: fulltypename = US"Modifier letter"; break;
321   case ucp_Lo: fulltypename = US"Other letter"; break;
322   case ucp_Lt: fulltypename = US"Title case letter"; break;
323   case ucp_Lu: fulltypename = US"Upper case letter"; break;
324   case ucp_Mc: fulltypename = US"Spacing mark"; break;
325   case ucp_Me: fulltypename = US"Enclosing mark"; break;
326   case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
327   case ucp_Nd: fulltypename = US"Decimal number"; break;
328   case ucp_Nl: fulltypename = US"Letter number"; break;
329   case ucp_No: fulltypename = US"Other number"; break;
330   case ucp_Pc: fulltypename = US"Connector punctuation"; break;
331   case ucp_Pd: fulltypename = US"Dash punctuation"; break;
332   case ucp_Pe: fulltypename = US"Close punctuation"; break;
333   case ucp_Pf: fulltypename = US"Final punctuation"; break;
334   case ucp_Pi: fulltypename = US"Initial punctuation"; break;
335   case ucp_Po: fulltypename = US"Other punctuation"; break;
336   case ucp_Ps: fulltypename = US"Open punctuation"; break;
337   case ucp_Sc: fulltypename = US"Currency symbol"; break;
338   case ucp_Sk: fulltypename = US"Modifier symbol"; break;
339   case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
340   case ucp_So: fulltypename = US"Other symbol"; break;
341   case ucp_Zl: fulltypename = US"Line separator"; break;
342   case ucp_Zp: fulltypename = US"Paragraph separator"; break;
343   case ucp_Zs: fulltypename = US"Space separator"; break;
344   }
345 
346 switch(gbprop)
347   {
348   case ucp_gbCR:           graphbreak = US"CR"; break;
349   case ucp_gbLF:           graphbreak = US"LF"; break;
350   case ucp_gbControl:      graphbreak = US"Control"; break;
351   case ucp_gbExtend:       graphbreak = US"Extend"; break;
352   case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
353   case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
354   case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
355   case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
356   case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
357   case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
358   case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
359   case ucp_gbRegional_Indicator:
360                            graphbreak = US"Regional Indicator"; break;
361   case ucp_gbOther:        graphbreak = US"Other"; break;
362   case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
363   case ucp_gbExtended_Pictographic:
364                            graphbreak = US"Extended Pictographic"; break;
365   default:                 graphbreak = US"Unknown"; break;
366   }
367 
368 switch(bidi)
369   {
370   case ucp_bidiAL:   bidiclass = US"AL "; break;
371   case ucp_bidiFSI:  bidiclass = US"FSI"; break;
372   case ucp_bidiL:    bidiclass = US"L  "; break;
373   case ucp_bidiLRE:  bidiclass = US"LRE"; break;
374   case ucp_bidiLRI:  bidiclass = US"LRI"; break;
375   case ucp_bidiLRO:  bidiclass = US"LRO"; break;
376   case ucp_bidiPDF:  bidiclass = US"PDF"; break;
377   case ucp_bidiPDI:  bidiclass = US"PDI"; break;
378   case ucp_bidiR:    bidiclass = US"R  "; break;
379   case ucp_bidiRLE:  bidiclass = US"RLE"; break;
380   case ucp_bidiRLI:  bidiclass = US"RLI"; break;
381   case ucp_bidiRLO:  bidiclass = US"RLO"; break;
382   case ucp_bidiAN:   bidiclass = US"AN "; break;
383   case ucp_bidiB:    bidiclass = US"B  "; break;
384   case ucp_bidiBN:   bidiclass = US"BN "; break;
385   case ucp_bidiCS:   bidiclass = US"CS "; break;
386   case ucp_bidiEN:   bidiclass = US"EN "; break;
387   case ucp_bidiES:   bidiclass = US"ES "; break;
388   case ucp_bidiET:   bidiclass = US"ET "; break;
389   case ucp_bidiNSM:  bidiclass = US"NSM"; break;
390   case ucp_bidiON:   bidiclass = US"ON "; break;
391   case ucp_bidiS:    bidiclass = US"S  "; break;
392   case ucp_bidiWS:   bidiclass = US"WS "; break;
393   default:           bidiclass = US"???"; break;
394   }
395 
396 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
397   scriptname, graphbreak);
398 
399 if (is_just_one && (othercase != c || caseset != 0))
400   {
401   if (othercase != c) printf(", U+%04X", othercase);
402   if (caseset != 0)
403     {
404     const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
405     while (*(++p) < NOTACHAR)
406       {
407       unsigned int d = *p;
408       if (d != othercase && d != c) printf(", U+%04X", d);
409       }
410     }
411   }
412 
413 if (scriptx != 0)
414   {
415   const char *sep = "";
416   const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
417   printf(", [");
418   for (int i = 0; i < ucp_Unknown; i++)
419   if (MAPBIT(p, i) != 0)
420     {
421     printf("%s%s", sep, get_propname(i, PT_SC));
422     sep = ", ";
423     }
424   printf("]");
425   }
426 
427 if (bprops != 0)
428   {
429   const char *sep = "";
430   const uint32_t *p = PRIV(ucd_boolprop_sets) + bprops;
431   printf(", [");
432   for (int i = 0; i < ucp_Bprop_Count; i++)
433   if (MAPBIT(p, i) != 0)
434     {
435     printf("%s%s", sep, get_propname(i, PT_BOOL));
436     sep = ", ";
437     }
438   printf("]");
439   }
440 
441 if (show_character && is_just_one)
442   {
443   unsigned char buffer[8];
444   int len = (int)PRIV(ord2utf_8)(c, buffer);
445   printf(", >%.*s<", len, buffer);
446   }
447 
448 printf("\n");
449 }
450 
451 
452 
453 /*************************************************
454 *   Find character(s) with given property/ies    *
455 *************************************************/
456 
457 static void
find_chars(unsigned char * s)458 find_chars(unsigned char *s)
459 {
460 unsigned char name[128];
461 unsigned char value[128];
462 unsigned char *t;
463 unsigned int count= 0;
464 int scriptx_list[128];
465 unsigned int scriptx_count = 0;
466 int bprop_list[128];
467 unsigned int bprop_count = 0;
468 uint32_t i, c;
469 int script = -1;
470 int type = -1;
471 int gbreak = -1;
472 int bidiclass = -1;
473 BOOL script_not = FALSE;
474 BOOL type_not = FALSE;
475 BOOL gbreak_not = FALSE;
476 BOOL bidiclass_not = FALSE;
477 BOOL hadrange = FALSE;
478 const ucd_record *ucd, *next_ucd;
479 const char *pad = "        ";
480 
481 while (*s != 0)
482   {
483   unsigned int offset = 0;
484 
485   for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
486   *t = 0;
487   while (isspace(*s)) s++;
488 
489   for (t = value; *s != 0 && !isspace(*s); s++)
490     {
491     if (*s != '_' && *s != '-') *t++ = *s;
492     }
493   *t = 0;
494   while (isspace(*s)) s++;
495 
496   if (strcmp(CS name, "script") == 0 ||
497       strcmp(CS name, "scriptx") == 0)
498     {
499     BOOL x = (name[6] == 'x');
500     BOOL scriptx_not = FALSE;
501     for (t = value; *t != 0; t++) *t = tolower(*t);
502 
503     if (value[0] == '!')
504       {
505       if (x) scriptx_not = TRUE; else script_not = TRUE;
506       offset = 1;
507       }
508 
509     for (i = 0; i < PRIV(utt_size); i++)
510       {
511       const ucp_type_table *u = PRIV(utt) + i;
512       if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
513             PRIV(utt_names) + u->name_offset) == 0)
514         {
515         c = u->value;
516         if (x && !scriptx_not && u->type == PT_SC)
517           {
518           if (script < 0)
519             {
520             x = FALSE;
521             script = -1;
522             script_not = scriptx_not;
523             }
524           else if (!script_not)
525             {
526             printf("No characters found\n");
527             return;
528             }
529           }
530         if (x)
531           {
532           scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
533           }
534         else
535           {
536           if (script < 0) script = c; else
537             {
538             printf("** Only 1 script value allowed\n");
539             return;
540             }
541           }
542         break;
543         }
544       }
545 
546     if (i >= PRIV(utt_size))
547       {
548       printf("** Unrecognized script name \"%s\"\n", value);
549       return;
550       }
551     }
552 
553   else if (strcmp(CS name, "bool") == 0)
554     {
555     int not = 1;
556     if (value[0] == '!')
557       {
558       not = -1;
559       offset = 1;
560       }
561 
562     for (i = 0; i < PRIV(utt_size); i++)
563       {
564       const ucp_type_table *u = PRIV(utt) + i;
565       if (u->type == PT_BOOL && strcmp(CS(value + offset),
566             PRIV(utt_names) + u->name_offset) == 0)
567         {
568         bprop_list[bprop_count++] = u->value * not;
569         break;
570         }
571       }
572 
573     if (i >= PRIV(utt_size))
574       {
575       printf("** Unrecognized property name \"%s\"\n", value);
576       return;
577       }
578     }
579 
580   else if (strcmp(CS name, "type") == 0)
581     {
582     if (type >= 0)
583       {
584       printf("** Only 1 type value allowed\n");
585       return;
586       }
587     else
588       {
589       if (value[0] == '!')
590         {
591         type_not = TRUE;
592         offset = 1;
593         }
594 
595       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
596         {
597         if (strcmp(CS (value + offset), CCS type_names[i]) == 0)
598           {
599           type = i/2;
600           break;
601           }
602         }
603       if (i >= sizeof(type_names)/sizeof(char *))
604         {
605         printf("** Unrecognized type name \"%s\"\n", value);
606         return;
607         }
608       }
609     }
610 
611   else if (strcmp(CS name, "gbreak") == 0)
612     {
613     if (gbreak >= 0)
614       {
615       printf("** Only 1 grapheme break value allowed\n");
616       return;
617       }
618     else
619       {
620       if (value[0] == '!')
621         {
622         gbreak_not = TRUE;
623         offset = 1;
624         }
625 
626       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
627         {
628         if (strcmp(CS (value + offset), CCS gb_names[i]) == 0)
629           {
630           gbreak = i/2;
631           break;
632           }
633         }
634       if (i >= sizeof(gb_names)/sizeof(char *))
635         {
636         printf("** Unrecognized gbreak name \"%s\"\n", value);
637         return;
638         }
639       }
640     }
641 
642   else if (strcmp(CS name, "bidi") == 0 ||
643            strcmp(CS name, "bidiclass") == 0 ||
644            strcmp(CS name, "bidi_class") == 0 )
645     {
646     if (bidiclass >= 0)
647       {
648       printf("** Only 1 bidi class value allowed\n");
649       return;
650       }
651     else
652       {
653       if (value[0] == '!')
654         {
655         bidiclass_not = TRUE;
656         offset = 1;
657         }
658       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i++)
659         {
660         if (strcasecmp(CS (value + offset), CCS bd_names[i]) == 0)
661           {
662           bidiclass = i/2;
663           break;
664           }
665         }
666       if (i >= sizeof(bd_names)/sizeof(char *))
667         {
668         printf("** Unrecognized bidi class name \"%s\"\n", value);
669         return;
670         }
671       }
672     }
673 
674   else
675     {
676     printf("** Unrecognized property name \"%s\"\n", name);
677     return;
678     }
679   }
680 
681 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
682     gbreak < 0 && bidiclass < 0)
683   {
684   printf("** No properties specified\n");
685   return;
686   }
687 
688 for (c = 0; c <= 0x10ffff; c++)
689   {
690   if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
691 
692   if (scriptx_count > 0)
693     {
694     const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
695     unsigned int found = 0;
696 
697     for (i = 0; i < scriptx_count; i++)
698       {
699       int x = scriptx_list[i]/32;
700       int y = scriptx_list[i]%32;
701 
702       /* Positive requirment */
703       if (scriptx_list[i] >= 0)
704         {
705         if (scriptx_list[i] == UCD_SCRIPT(c) ||
706             ((scriptx_list[i] < ucp_Unknown) &&
707              (bits_scriptx[x] & (1u<<y)) != 0)) found++;
708         }
709       /* Negative requirement */
710       else
711         {
712         if ((-(scriptx_list[i]) < ucp_Unknown) &&
713             (bits_scriptx[x] & (1u<<y)) == 0) found++;
714         }
715       }
716 
717     if (found != scriptx_count) continue;
718     }
719 
720   if (bprop_count > 0)
721     {
722     const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) + UCD_BPROPS(c);
723     unsigned int found = 0;
724 
725     for (i = 0; i < bprop_count; i++)
726       {
727       int x = bprop_list[i]/32;
728       int y = bprop_list[i]%32;
729 
730       /* Positive requirement */
731       if (bprop_list[i] >= 0)
732         {
733         if ((bits_bprop[x] & (1u<<y)) != 0) found++;
734         }
735       /* Negative requirement */
736       else
737         {
738         if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
739         }
740       }
741 
742     if (found != bprop_count) continue;
743     }
744 
745   if (type >= 0)
746     {
747     if (type_not)
748       {
749       if (type == UCD_CHARTYPE(c)) continue;
750       }
751     else
752       {
753       if (type != UCD_CHARTYPE(c)) continue;
754       }
755     }
756 
757   if (gbreak >= 0)
758     {
759     if (gbreak_not)
760       {
761       if (gbreak == UCD_GRAPHBREAK(c)) continue;
762       }
763     else
764       {
765       if (gbreak != UCD_GRAPHBREAK(c)) continue;
766       }
767     }
768 
769   if (bidiclass >= 0)
770     {
771     if (bidiclass_not)
772       {
773       if (bidiclass == UCD_BIDICLASS(c)) continue;
774       }
775     else
776       {
777       if (bidiclass != UCD_BIDICLASS(c)) continue;
778       }
779     }
780 
781   /* All conditions are met. Look for runs. */
782 
783   ucd = GET_UCD(c);
784 
785   for (i = c + 1; i < 0x10ffff; i++)
786     {
787     next_ucd = GET_UCD(i);
788     if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
789     }
790 
791   if (--i > c)
792     {
793     printf("U+%04X..", c);
794     c = i;
795     hadrange = TRUE;
796     }
797   else if (hadrange) printf("%s", pad);
798 
799   print_prop(c, FALSE);
800   if (c >= 0x100000) pad = "        ";
801     else if (c >= 0x10000) pad = "       ";
802   count++;
803   if (count >= 100)
804     {
805     printf("...\n");
806     break;
807     }
808   }
809 
810 if (count == 0) printf("No characters found\n");
811 }
812 
813 
814 /*************************************************
815 *        Process command line                    *
816 *************************************************/
817 
818 static void
process_command_line(unsigned char * buffer)819 process_command_line(unsigned char *buffer)
820 {
821 unsigned char *s, *t;
822 unsigned char name[24];
823 
824 s = buffer;
825 while (isspace(*s)) s++;
826 if (*s == 0) return;
827 
828 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
829 *t = 0;
830 while (isspace(*s)) s++;
831 
832 if (strcmp(CS name, "findprop") == 0)
833   {
834   while (*s != 0)
835     {
836     unsigned int c;
837     unsigned char *endptr;
838     t = s;
839 
840     if (*t == '+')
841       {
842       c = *(++t);
843       if (c > 0x7fu)
844         {
845         GETCHARINC(c, t);
846         endptr = t;
847         }
848       else endptr = t+1;
849       }
850     else
851       {
852       if (memcmp(t, "U+", 2) == 0) t += 2;
853       c = (uint32_t)strtoul(CS t, CSS(&endptr), 16);
854       }
855 
856     if (*endptr != 0 && !isspace(*endptr))
857       {
858       while (*endptr != 0 && !isspace(*endptr)) endptr++;
859       printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
860       }
861     else
862       {
863       if (c > 0x10ffff)
864         printf("** U+%x is too big for a Unicode code point\n", c);
865       else
866         print_prop(c, TRUE);
867       }
868     s = endptr;
869     while (isspace(*s)) s++;
870     }
871   }
872 
873 else if (strcmp(CS name, "find") == 0)
874   {
875   find_chars(s);
876   }
877 
878 else if (strcmp(CS name, "list") == 0)
879   {
880   while (*s != 0)
881     {
882     size_t i;
883     for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
884     *t = 0;
885     while (isspace(*s)) s++;
886 
887     if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
888       {
889       for (i = 0; i < PRIV(utt_size); i++)
890         if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
891           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
892       }
893 
894     else if (strcmp(CS name, "bool") == 0)
895       {
896       for (i = 0; i < PRIV(utt_size); i++)
897         if (PRIV(utt)[i].type == PT_BOOL)
898           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
899       }
900 
901     else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
902       {
903       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
904         printf("%s %s\n", type_names[i], type_names[i+1]);
905       }
906 
907     else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
908       {
909       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
910         {
911         if (gb_names[i+1][0] != 0)
912           printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
913         else
914           printf("%s\n", gb_names[i]);
915         }
916       }
917 
918     else if (strcmp(CS name, "bidi") == 0 ||
919              strcmp(CS name, "bidiclasses") == 0)
920       {
921       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
922         printf("%3s %s\n", bd_names[i], bd_names[i+1]);
923       }
924 
925     else
926       {
927       printf("** Unknown property \"%s\"\n", name);
928       break;
929       }
930     }
931   }
932 
933 else printf("** Unknown test command \"%s\"\n", name);
934 }
935 
936 
937 
938 /*************************************************
939 *               Main program                     *
940 *************************************************/
941 
942 int
main(int argc,char ** argv)943 main(int argc, char **argv)
944 {
945 BOOL interactive;
946 int first_arg = 1;
947 unsigned char buffer[1024];
948 
949 if (argc > 1 && strcmp(argv[1], "-s") == 0)
950   {
951   show_character = TRUE;
952   first_arg++;
953   }
954 
955 if (argc > first_arg)
956   {
957   int i;
958   BOOL datafirst = TRUE;
959   char *arg = argv[first_arg];
960   unsigned char *s = buffer;
961 
962   if (*arg != '+' && memcmp(arg, "U+", 2) != 0 && !isdigit(*arg))
963     {
964     while (*arg != 0)
965       {
966       if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
967       }
968     }
969 
970   if (datafirst)
971     {
972     strcpy(CS s, "findprop ");
973     s += 9;
974     }
975 
976   for (i = first_arg; i < argc; i++)
977     {
978     s += sprintf(CS s, "%s ", argv[i]);
979     }
980 
981   process_command_line(buffer);
982   return 0;
983   }
984 
985 interactive = is_stdin_tty();
986 
987 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
988 if (interactive) using_history();
989 #endif
990 
991 for(;;)
992   {
993 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
994   if (interactive)
995     {
996     size_t len;
997     unsigned char *s = US readline("> ");
998     if (s == NULL) break;
999     len = strlen(CS s);
1000     if (len > 0) add_history(CS s);
1001     memcpy(buffer, s, len);
1002     buffer[len] = '\n';
1003     buffer[len+1] = 0;
1004     free(s);
1005     }
1006   else
1007 #endif
1008 
1009     {
1010     if (interactive) printf("> ");
1011     if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1012     if (!interactive) printf("%s", buffer);
1013     }
1014 
1015   process_command_line(buffer);
1016   }
1017 
1018 if (interactive) printf("\n");
1019 
1020 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1021 if (interactive) clear_history();
1022 #endif
1023 
1024 return 0;
1025 }
1026 
1027 /* End */
1028