1*ccdc9c3eSSadaf Ebrahimi // Copyright 2008 The RE2 Authors. All Rights Reserved. 2*ccdc9c3eSSadaf Ebrahimi // Use of this source code is governed by a BSD-style 3*ccdc9c3eSSadaf Ebrahimi // license that can be found in the LICENSE file. 4*ccdc9c3eSSadaf Ebrahimi 5*ccdc9c3eSSadaf Ebrahimi #ifndef RE2_UNICODE_CASEFOLD_H_ 6*ccdc9c3eSSadaf Ebrahimi #define RE2_UNICODE_CASEFOLD_H_ 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimi // Unicode case folding tables. 9*ccdc9c3eSSadaf Ebrahimi 10*ccdc9c3eSSadaf Ebrahimi // The Unicode case folding tables encode the mapping from one Unicode point 11*ccdc9c3eSSadaf Ebrahimi // to the next largest Unicode point with equivalent folding. The largest 12*ccdc9c3eSSadaf Ebrahimi // point wraps back to the first. For example, the tables map: 13*ccdc9c3eSSadaf Ebrahimi // 14*ccdc9c3eSSadaf Ebrahimi // 'A' -> 'a' 15*ccdc9c3eSSadaf Ebrahimi // 'a' -> 'A' 16*ccdc9c3eSSadaf Ebrahimi // 17*ccdc9c3eSSadaf Ebrahimi // 'K' -> 'k' 18*ccdc9c3eSSadaf Ebrahimi // 'k' -> 'K' (Kelvin symbol) 19*ccdc9c3eSSadaf Ebrahimi // 'K' -> 'K' 20*ccdc9c3eSSadaf Ebrahimi // 21*ccdc9c3eSSadaf Ebrahimi // Like everything Unicode, these tables are big. If we represent the table 22*ccdc9c3eSSadaf Ebrahimi // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. 23*ccdc9c3eSSadaf Ebrahimi // Most table entries look like the ones around them: 24*ccdc9c3eSSadaf Ebrahimi // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 25*ccdc9c3eSSadaf Ebrahimi // Instead of listing all the pairs explicitly, we make a list of ranges 26*ccdc9c3eSSadaf Ebrahimi // and deltas, so that the table entries for 'A' through 'Z' can be represented 27*ccdc9c3eSSadaf Ebrahimi // as a single entry { 'A', 'Z', +32 }. 28*ccdc9c3eSSadaf Ebrahimi // 29*ccdc9c3eSSadaf Ebrahimi // In addition to blocks that map to each other (A-Z mapping to a-z) 30*ccdc9c3eSSadaf Ebrahimi // there are blocks of pairs that individually map to each other 31*ccdc9c3eSSadaf Ebrahimi // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 32*ccdc9c3eSSadaf Ebrahimi // For those, the special delta value EvenOdd marks even/odd pairs 33*ccdc9c3eSSadaf Ebrahimi // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 34*ccdc9c3eSSadaf Ebrahimi // 35*ccdc9c3eSSadaf Ebrahimi // In this form, the table has 274 entries, about 3kB. If we were to split 36*ccdc9c3eSSadaf Ebrahimi // the table into one for 16-bit codes and an overflow table for larger ones, 37*ccdc9c3eSSadaf Ebrahimi // we could get it down to about 1.5kB, but that's not worth the complexity. 38*ccdc9c3eSSadaf Ebrahimi // 39*ccdc9c3eSSadaf Ebrahimi // The grouped form also allows for efficient fold range calculations 40*ccdc9c3eSSadaf Ebrahimi // rather than looping one character at a time. 41*ccdc9c3eSSadaf Ebrahimi 42*ccdc9c3eSSadaf Ebrahimi #include <stdint.h> 43*ccdc9c3eSSadaf Ebrahimi 44*ccdc9c3eSSadaf Ebrahimi #include "util/util.h" 45*ccdc9c3eSSadaf Ebrahimi #include "util/utf.h" 46*ccdc9c3eSSadaf Ebrahimi 47*ccdc9c3eSSadaf Ebrahimi namespace re2 { 48*ccdc9c3eSSadaf Ebrahimi 49*ccdc9c3eSSadaf Ebrahimi enum { 50*ccdc9c3eSSadaf Ebrahimi EvenOdd = 1, 51*ccdc9c3eSSadaf Ebrahimi OddEven = -1, 52*ccdc9c3eSSadaf Ebrahimi EvenOddSkip = 1<<30, 53*ccdc9c3eSSadaf Ebrahimi OddEvenSkip, 54*ccdc9c3eSSadaf Ebrahimi }; 55*ccdc9c3eSSadaf Ebrahimi 56*ccdc9c3eSSadaf Ebrahimi struct CaseFold { 57*ccdc9c3eSSadaf Ebrahimi Rune lo; 58*ccdc9c3eSSadaf Ebrahimi Rune hi; 59*ccdc9c3eSSadaf Ebrahimi int32_t delta; 60*ccdc9c3eSSadaf Ebrahimi }; 61*ccdc9c3eSSadaf Ebrahimi 62*ccdc9c3eSSadaf Ebrahimi extern const CaseFold unicode_casefold[]; 63*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_casefold; 64*ccdc9c3eSSadaf Ebrahimi 65*ccdc9c3eSSadaf Ebrahimi extern const CaseFold unicode_tolower[]; 66*ccdc9c3eSSadaf Ebrahimi extern const int num_unicode_tolower; 67*ccdc9c3eSSadaf Ebrahimi 68*ccdc9c3eSSadaf Ebrahimi // Returns the CaseFold* in the tables that contains rune. 69*ccdc9c3eSSadaf Ebrahimi // If rune is not in the tables, returns the first CaseFold* after rune. 70*ccdc9c3eSSadaf Ebrahimi // If rune is larger than any value in the tables, returns NULL. 71*ccdc9c3eSSadaf Ebrahimi extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); 72*ccdc9c3eSSadaf Ebrahimi 73*ccdc9c3eSSadaf Ebrahimi // Returns the result of applying the fold f to the rune r. 74*ccdc9c3eSSadaf Ebrahimi extern Rune ApplyFold(const CaseFold *f, Rune r); 75*ccdc9c3eSSadaf Ebrahimi 76*ccdc9c3eSSadaf Ebrahimi } // namespace re2 77*ccdc9c3eSSadaf Ebrahimi 78*ccdc9c3eSSadaf Ebrahimi #endif // RE2_UNICODE_CASEFOLD_H_ 79