xref: /aosp_15_r20/external/regex-re2/re2/unicode_casefold.h (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style
3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file.
4*ccdc9c3eSSadaf Ebrahimi 
5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_UNICODE_CASEFOLD_H_
6*ccdc9c3eSSadaf Ebrahimi #define RE2_UNICODE_CASEFOLD_H_
7*ccdc9c3eSSadaf Ebrahimi 
8*ccdc9c3eSSadaf Ebrahimi // Unicode case folding tables.
9*ccdc9c3eSSadaf Ebrahimi 
10*ccdc9c3eSSadaf Ebrahimi // The Unicode case folding tables encode the mapping from one Unicode point
11*ccdc9c3eSSadaf Ebrahimi // to the next largest Unicode point with equivalent folding.  The largest
12*ccdc9c3eSSadaf Ebrahimi // point wraps back to the first.  For example, the tables map:
13*ccdc9c3eSSadaf Ebrahimi //
14*ccdc9c3eSSadaf Ebrahimi //     'A' -> 'a'
15*ccdc9c3eSSadaf Ebrahimi //     'a' -> 'A'
16*ccdc9c3eSSadaf Ebrahimi //
17*ccdc9c3eSSadaf Ebrahimi //     'K' -> 'k'
18*ccdc9c3eSSadaf Ebrahimi //     'k' -> 'K'  (Kelvin symbol)
19*ccdc9c3eSSadaf Ebrahimi //     'K' -> 'K'
20*ccdc9c3eSSadaf Ebrahimi //
21*ccdc9c3eSSadaf Ebrahimi // Like everything Unicode, these tables are big.  If we represent the table
22*ccdc9c3eSSadaf Ebrahimi // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
23*ccdc9c3eSSadaf Ebrahimi // Most table entries look like the ones around them:
24*ccdc9c3eSSadaf Ebrahimi // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
25*ccdc9c3eSSadaf Ebrahimi // Instead of listing all the pairs explicitly, we make a list of ranges
26*ccdc9c3eSSadaf Ebrahimi // and deltas, so that the table entries for 'A' through 'Z' can be represented
27*ccdc9c3eSSadaf Ebrahimi // as a single entry { 'A', 'Z', +32 }.
28*ccdc9c3eSSadaf Ebrahimi //
29*ccdc9c3eSSadaf Ebrahimi // In addition to blocks that map to each other (A-Z mapping to a-z)
30*ccdc9c3eSSadaf Ebrahimi // there are blocks of pairs that individually map to each other
31*ccdc9c3eSSadaf Ebrahimi // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
32*ccdc9c3eSSadaf Ebrahimi // For those, the special delta value EvenOdd marks even/odd pairs
33*ccdc9c3eSSadaf Ebrahimi // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
34*ccdc9c3eSSadaf Ebrahimi //
35*ccdc9c3eSSadaf Ebrahimi // In this form, the table has 274 entries, about 3kB.  If we were to split
36*ccdc9c3eSSadaf Ebrahimi // the table into one for 16-bit codes and an overflow table for larger ones,
37*ccdc9c3eSSadaf Ebrahimi // we could get it down to about 1.5kB, but that's not worth the complexity.
38*ccdc9c3eSSadaf Ebrahimi //
39*ccdc9c3eSSadaf Ebrahimi // The grouped form also allows for efficient fold range calculations
40*ccdc9c3eSSadaf Ebrahimi // rather than looping one character at a time.
41*ccdc9c3eSSadaf Ebrahimi 
42*ccdc9c3eSSadaf Ebrahimi #include <stdint.h>
43*ccdc9c3eSSadaf Ebrahimi 
44*ccdc9c3eSSadaf Ebrahimi #include "util/util.h"
45*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h"
46*ccdc9c3eSSadaf Ebrahimi 
47*ccdc9c3eSSadaf Ebrahimi namespace re2 {
48*ccdc9c3eSSadaf Ebrahimi 
49*ccdc9c3eSSadaf Ebrahimi enum {
50*ccdc9c3eSSadaf Ebrahimi   EvenOdd = 1,
51*ccdc9c3eSSadaf Ebrahimi   OddEven = -1,
52*ccdc9c3eSSadaf Ebrahimi   EvenOddSkip = 1<<30,
53*ccdc9c3eSSadaf Ebrahimi   OddEvenSkip,
54*ccdc9c3eSSadaf Ebrahimi };
55*ccdc9c3eSSadaf Ebrahimi 
56*ccdc9c3eSSadaf Ebrahimi struct CaseFold {
57*ccdc9c3eSSadaf Ebrahimi   Rune lo;
58*ccdc9c3eSSadaf Ebrahimi   Rune hi;
59*ccdc9c3eSSadaf Ebrahimi   int32_t delta;
60*ccdc9c3eSSadaf Ebrahimi };
61*ccdc9c3eSSadaf Ebrahimi 
62*ccdc9c3eSSadaf Ebrahimi extern const CaseFold unicode_casefold[];
63*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_casefold;
64*ccdc9c3eSSadaf Ebrahimi 
65*ccdc9c3eSSadaf Ebrahimi extern const CaseFold unicode_tolower[];
66*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_tolower;
67*ccdc9c3eSSadaf Ebrahimi 
68*ccdc9c3eSSadaf Ebrahimi // Returns the CaseFold* in the tables that contains rune.
69*ccdc9c3eSSadaf Ebrahimi // If rune is not in the tables, returns the first CaseFold* after rune.
70*ccdc9c3eSSadaf Ebrahimi // If rune is larger than any value in the tables, returns NULL.
71*ccdc9c3eSSadaf Ebrahimi extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
72*ccdc9c3eSSadaf Ebrahimi 
73*ccdc9c3eSSadaf Ebrahimi // Returns the result of applying the fold f to the rune r.
74*ccdc9c3eSSadaf Ebrahimi extern Rune ApplyFold(const CaseFold *f, Rune r);
75*ccdc9c3eSSadaf Ebrahimi 
76*ccdc9c3eSSadaf Ebrahimi }  // namespace re2
77*ccdc9c3eSSadaf Ebrahimi 
78*ccdc9c3eSSadaf Ebrahimi #endif  // RE2_UNICODE_CASEFOLD_H_
79