xref: /aosp_15_r20/external/regex-re2/re2/unicode_groups.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_UNICODE_GROUPS_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_UNICODE_GROUPS_H_
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // Unicode character groups.
9*ccdc9c3eSSadaf Ebrahimi 
10*ccdc9c3eSSadaf Ebrahimi // The codes get split into ranges of 16-bit codes
11*ccdc9c3eSSadaf Ebrahimi // and ranges of 32-bit codes.  It would be simpler
12*ccdc9c3eSSadaf Ebrahimi // to use only 32-bit ranges, but these tables are large
13*ccdc9c3eSSadaf Ebrahimi // enough to warrant extra care.
14*ccdc9c3eSSadaf Ebrahimi //
15*ccdc9c3eSSadaf Ebrahimi // Using just 32-bit ranges gives 27 kB of data.
16*ccdc9c3eSSadaf Ebrahimi // Adding 16-bit ranges gives 18 kB of data.
17*ccdc9c3eSSadaf Ebrahimi // Adding an extra table of 16-bit singletons would reduce
18*ccdc9c3eSSadaf Ebrahimi // to 16.5 kB of data but make the data harder to use;
19*ccdc9c3eSSadaf Ebrahimi // we don't bother.
20*ccdc9c3eSSadaf Ebrahimi 
21*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
22*ccdc9c3eSSadaf Ebrahimi 
23*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
24*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
25*ccdc9c3eSSadaf Ebrahimi 
26*ccdc9c3eSSadaf Ebrahimi namespace re2 {
27*ccdc9c3eSSadaf Ebrahimi 
// One range of 16-bit codes, stored as its two endpoints.
// NOTE(review): lo/hi are presumably inclusive bounds with lo <= hi;
// confirm against the code that defines and consumes the tables.
28*ccdc9c3eSSadaf Ebrahimi struct URange16
29*ccdc9c3eSSadaf Ebrahimi {
30*ccdc9c3eSSadaf Ebrahimi   uint16_t lo;  // low endpoint of the range
31*ccdc9c3eSSadaf Ebrahimi   uint16_t hi;  // high endpoint of the range
32*ccdc9c3eSSadaf Ebrahimi };
33*ccdc9c3eSSadaf Ebrahimi 
// One range of 32-bit codes, stored as its two endpoints.
// Rune is not declared in this header; presumably it comes from
// "util/utf.h" -- TODO confirm.
// NOTE(review): lo/hi are presumably inclusive bounds with lo <= hi;
// confirm against the code that defines and consumes the tables.
34*ccdc9c3eSSadaf Ebrahimi struct URange32
35*ccdc9c3eSSadaf Ebrahimi {
36*ccdc9c3eSSadaf Ebrahimi   Rune lo;  // low endpoint of the range
37*ccdc9c3eSSadaf Ebrahimi   Rune hi;  // high endpoint of the range
38*ccdc9c3eSSadaf Ebrahimi };
39*ccdc9c3eSSadaf Ebrahimi 
// A named character group: a name, a sign, and two range tables,
// one holding URange16 entries and one holding URange32 entries.
// The struct only stores pointers; it does not show who owns the
// pointed-to name or tables.
40*ccdc9c3eSSadaf Ebrahimi struct UGroup
41*ccdc9c3eSSadaf Ebrahimi {
42*ccdc9c3eSSadaf Ebrahimi   const char *name;  // name identifying the group
43*ccdc9c3eSSadaf Ebrahimi   int sign;  // +1 for [abc], -1 for [^abc]
44*ccdc9c3eSSadaf Ebrahimi   const URange16 *r16;  // table of 16-bit ranges
45*ccdc9c3eSSadaf Ebrahimi   int nr16;  // presumably the number of entries in r16 -- confirm
46*ccdc9c3eSSadaf Ebrahimi   const URange32 *r32;  // table of 32-bit ranges
47*ccdc9c3eSSadaf Ebrahimi   int nr32;  // presumably the number of entries in r32 -- confirm
48*ccdc9c3eSSadaf Ebrahimi };
49*ccdc9c3eSSadaf Ebrahimi 
// Declarations only: the table and its count are defined elsewhere.
// NOTE(review): num_unicode_groups is presumably the number of
// entries in unicode_groups[] -- confirm at the definition.
50*ccdc9c3eSSadaf Ebrahimi // Named by property or script name (e.g., "Nd", "N", "Han").
51*ccdc9c3eSSadaf Ebrahimi // Negated groups are not included.
52*ccdc9c3eSSadaf Ebrahimi extern const UGroup unicode_groups[];
53*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_groups;
54*ccdc9c3eSSadaf Ebrahimi 
// Declarations only: the table and its count are defined elsewhere.
// NOTE(review): num_posix_groups is presumably the number of
// entries in posix_groups[] -- confirm at the definition.
55*ccdc9c3eSSadaf Ebrahimi // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
56*ccdc9c3eSSadaf Ebrahimi // Negated groups are included.
57*ccdc9c3eSSadaf Ebrahimi extern const UGroup posix_groups[];
58*ccdc9c3eSSadaf Ebrahimi extern const int num_posix_groups;
59*ccdc9c3eSSadaf Ebrahimi 
// Declarations only: the table and its count are defined elsewhere.
// NOTE(review): num_perl_groups is presumably the number of
// entries in perl_groups[] -- confirm at the definition.
60*ccdc9c3eSSadaf Ebrahimi // Named by Perl name (e.g., "\\d", "\\D").
61*ccdc9c3eSSadaf Ebrahimi // Negated groups are included.
62*ccdc9c3eSSadaf Ebrahimi extern const UGroup perl_groups[];
63*ccdc9c3eSSadaf Ebrahimi extern const int num_perl_groups;
64*ccdc9c3eSSadaf Ebrahimi 
65*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
66*ccdc9c3eSSadaf Ebrahimi 
67*ccdc9c3eSSadaf Ebrahimi #endif  // RE2_UNICODE_GROUPS_H_
68