1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors. All Rights Reserved. 2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style 3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file. 4*ccdc9c3eSSadaf Ebrahimi 5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_UNICODE_GROUPS_H_ 6*ccdc9c3eSSadaf Ebrahimi #define RE2_UNICODE_GROUPS_H_ 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimi // Unicode character groups. 9*ccdc9c3eSSadaf Ebrahimi 10*ccdc9c3eSSadaf Ebrahimi // The codes get split into ranges of 16-bit codes 11*ccdc9c3eSSadaf Ebrahimi // and ranges of 32-bit codes. It would be simpler 12*ccdc9c3eSSadaf Ebrahimi // to use only 32-bit ranges, but these tables are large 13*ccdc9c3eSSadaf Ebrahimi // enough to warrant extra care. 14*ccdc9c3eSSadaf Ebrahimi // 15*ccdc9c3eSSadaf Ebrahimi // Using just 32-bit ranges gives 27 kB of data. 16*ccdc9c3eSSadaf Ebrahimi // Adding 16-bit ranges gives 18 kB of data. 17*ccdc9c3eSSadaf Ebrahimi // Adding an extra table of 16-bit singletons would reduce 18*ccdc9c3eSSadaf Ebrahimi // to 16.5 kB of data but make the data harder to use; 19*ccdc9c3eSSadaf Ebrahimi // we don't bother. 20*ccdc9c3eSSadaf Ebrahimi 21*ccdc9c3eSSadaf Ebrahimi #include <stdint.h> 22*ccdc9c3eSSadaf Ebrahimi 23*ccdc9c3eSSadaf Ebrahimi #include "util/util.h" 24*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h" 25*ccdc9c3eSSadaf Ebrahimi 26*ccdc9c3eSSadaf Ebrahimi namespace re2 { 27*ccdc9c3eSSadaf Ebrahimi 28*ccdc9c3eSSadaf Ebrahimi struct URange16 29*ccdc9c3eSSadaf Ebrahimi { 30*ccdc9c3eSSadaf Ebrahimi uint16_t lo; 31*ccdc9c3eSSadaf Ebrahimi uint16_t hi; 32*ccdc9c3eSSadaf Ebrahimi }; 33*ccdc9c3eSSadaf Ebrahimi 34*ccdc9c3eSSadaf Ebrahimi struct URange32 35*ccdc9c3eSSadaf Ebrahimi { 36*ccdc9c3eSSadaf Ebrahimi Rune lo; 37*ccdc9c3eSSadaf Ebrahimi Rune hi; 38*ccdc9c3eSSadaf Ebrahimi }; 39*ccdc9c3eSSadaf Ebrahimi 40*ccdc9c3eSSadaf Ebrahimi struct UGroup 41*ccdc9c3eSSadaf Ebrahimi { 42*ccdc9c3eSSadaf Ebrahimi const char *name; 43*ccdc9c3eSSadaf Ebrahimi int sign; // +1 for [abc], -1 for [^abc] 44*ccdc9c3eSSadaf Ebrahimi const URange16 *r16; 45*ccdc9c3eSSadaf Ebrahimi int nr16; 46*ccdc9c3eSSadaf Ebrahimi const URange32 *r32; 47*ccdc9c3eSSadaf Ebrahimi int nr32; 48*ccdc9c3eSSadaf Ebrahimi }; 49*ccdc9c3eSSadaf Ebrahimi 50*ccdc9c3eSSadaf Ebrahimi // Named by property or script name (e.g., "Nd", "N", "Han"). 51*ccdc9c3eSSadaf Ebrahimi // Negated groups are not included. 52*ccdc9c3eSSadaf Ebrahimi extern const UGroup unicode_groups[]; 53*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_groups; 54*ccdc9c3eSSadaf Ebrahimi 55*ccdc9c3eSSadaf Ebrahimi // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). 56*ccdc9c3eSSadaf Ebrahimi // Negated groups are included. 57*ccdc9c3eSSadaf Ebrahimi extern const UGroup posix_groups[]; 58*ccdc9c3eSSadaf Ebrahimi extern const int num_posix_groups; 59*ccdc9c3eSSadaf Ebrahimi 60*ccdc9c3eSSadaf Ebrahimi // Named by Perl name (e.g., "\\d", "\\D"). 61*ccdc9c3eSSadaf Ebrahimi // Negated groups are included. 62*ccdc9c3eSSadaf Ebrahimi extern const UGroup perl_groups[]; 63*ccdc9c3eSSadaf Ebrahimi extern const int num_perl_groups; 64*ccdc9c3eSSadaf Ebrahimi 65*ccdc9c3eSSadaf Ebrahimi } // namespace re2 66*ccdc9c3eSSadaf Ebrahimi 67*ccdc9c3eSSadaf Ebrahimi #endif // RE2_UNICODE_GROUPS_H_ 68