1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unilib-common.h"
18
19 #include <algorithm>
20
21 namespace libtextclassifier3 {
22 namespace {
23
24 #define ARRAYSIZE(a) sizeof(a) / sizeof(*a)
25
26 // Derived from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
27 // grep -E "Ps" UnicodeData.txt | \
28 // sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
29 // IMPORTANT: entries with the same offsets in kOpeningBrackets and
30 // kClosingBrackets must be counterparts.
31 constexpr char32 kOpeningBrackets[] = {
32 0x0028, 0x005B, 0x007B, 0x0F3C, 0x2045, 0x207D, 0x208D, 0x2329, 0x2768,
33 0x276A, 0x276C, 0x2770, 0x2772, 0x2774, 0x27E6, 0x27E8, 0x27EA, 0x27EC,
34 0x27EE, 0x2983, 0x2985, 0x2987, 0x2989, 0x298B, 0x298D, 0x298F, 0x2991,
35 0x2993, 0x2995, 0x2997, 0x29FC, 0x2E22, 0x2E24, 0x2E26, 0x2E28, 0x3008,
36 0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x3018, 0x301A, 0xFD3F,
37 0xFE17, 0xFE35, 0xFE37, 0xFE39, 0xFE3B, 0xFE3D, 0xFE3F, 0xFE41, 0xFE43,
38 0xFE47, 0xFE59, 0xFE5B, 0xFE5D, 0xFF08, 0xFF3B, 0xFF5B, 0xFF5F, 0xFF62};
39 constexpr int kNumOpeningBrackets = ARRAYSIZE(kOpeningBrackets);
40
41 // grep -E "Pe" UnicodeData.txt | \
42 // sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
43 constexpr char32 kClosingBrackets[] = {
44 0x0029, 0x005D, 0x007D, 0x0F3D, 0x2046, 0x207E, 0x208E, 0x232A, 0x2769,
45 0x276B, 0x276D, 0x2771, 0x2773, 0x2775, 0x27E7, 0x27E9, 0x27EB, 0x27ED,
46 0x27EF, 0x2984, 0x2986, 0x2988, 0x298A, 0x298C, 0x298E, 0x2990, 0x2992,
47 0x2994, 0x2996, 0x2998, 0x29FD, 0x2E23, 0x2E25, 0x2E27, 0x2E29, 0x3009,
48 0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x3019, 0x301B, 0xFD3E,
49 0xFE18, 0xFE36, 0xFE38, 0xFE3A, 0xFE3C, 0xFE3E, 0xFE40, 0xFE42, 0xFE44,
50 0xFE48, 0xFE5A, 0xFE5C, 0xFE5E, 0xFF09, 0xFF3D, 0xFF5D, 0xFF60, 0xFF63};
51 constexpr int kNumClosingBrackets = ARRAYSIZE(kClosingBrackets);
52
53 // grep -E "WS" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
54 constexpr char32 kWhitespaces[] = {
55 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0,
56 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
57 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F,
58 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21F6, 0x2B31, 0x2B84, 0x2B85,
59 0x2B86, 0x2B87, 0x2B94, 0x3000, 0x4DCC, 0x10344, 0x10347, 0x1DA0A,
60 0x1DA0B, 0x1DA0C, 0x1DA0D, 0x1DA0E, 0x1DA0F, 0x1DA10, 0x1F4F0, 0x1F500,
61 0x1F501, 0x1F502, 0x1F503, 0x1F504, 0x1F5D8, 0x1F5DE};
62 constexpr int kNumWhitespaces = ARRAYSIZE(kWhitespaces);
63
64 // https://en.wikipedia.org/wiki/Bidirectional_text
65 constexpr char32 kBidirectional[] = {0x061C, 0x200E, 0x200F, 0x202A,
66 0x202B, 0x202C, 0x202D, 0x202E,
67 0x2066, 0x2067, 0x2068, 0x2069};
68 constexpr int kNumBidirectional = ARRAYSIZE(kBidirectional);
69
70 // grep -E "Nd" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
71 // As the name suggests, these ranges are always 10 codepoints long, so we just
72 // store the end of the range.
73 constexpr char32 kDecimalDigitRangesEnd[] = {
74 0x0039, 0x0669, 0x06f9, 0x07c9, 0x096f, 0x09ef, 0x0a6f, 0x0aef,
75 0x0b6f, 0x0bef, 0x0c6f, 0x0cef, 0x0d6f, 0x0def, 0x0e59, 0x0ed9,
76 0x0f29, 0x1049, 0x1099, 0x17e9, 0x1819, 0x194f, 0x19d9, 0x1a89,
77 0x1a99, 0x1b59, 0x1bb9, 0x1c49, 0x1c59, 0xa629, 0xa8d9, 0xa909,
78 0xa9d9, 0xa9f9, 0xaa59, 0xabf9, 0xff19, 0x104a9, 0x1106f, 0x110f9,
79 0x1113f, 0x111d9, 0x112f9, 0x11459, 0x114d9, 0x11659, 0x116c9, 0x11739,
80 0x118e9, 0x11c59, 0x11d59, 0x16a69, 0x16b59, 0x1d7ff};
81 constexpr int kNumDecimalDigitRangesEnd = ARRAYSIZE(kDecimalDigitRangesEnd);
82
83 // Visual source: https://en.wikipedia.org/wiki/Latin_script_in_Unicode
84 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
85 // clang-format off
86 // grep "LATIN " latters.txt | grep -v "TAG LATIN" | grep -v "SQUARED LATIN" | grep -v "CIRCLED LATIN" | grep -v "PARENTHESIZED LATIN" | cut -d' ' -f1 | cut -d'+' -f2 | sed -re "s/([0-9A-Z]+).*/0x\1, /" | tr -d "\n" NOLINT
87 // clang-format on
88 constexpr char32 kLatinLettersRangesStart[] = {0x0041, 0x0061, 0x00C0, 0x00D8,
89 0x00F8, 0x1D00, 0x2C60, 0xAB30,
90 0xFF21, 0xFF41};
91 constexpr int kNumLatinLettersRangesStart = ARRAYSIZE(kLatinLettersRangesStart);
92 constexpr char32 kLatinLettersRangesEnd[] = {0x005A, 0x007A, 0x00D6, 0x00F7,
93 0x02A8, 0x1EFF, 0xA7B7, 0xAB64,
94 0xFF3A, 0xFF5A};
95 constexpr int kNumLatinLettersRangesEnd = ARRAYSIZE(kLatinLettersRangesEnd);
96
97 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
98 constexpr char32 kArabicLettersRangesStart[] = {
99 0x0620, 0x0641, 0x066E, 0x06EE, 0x0750, 0x08A0, 0xFB50, 0xFDFA, 0xFE80};
100 constexpr int kNumArabicLettersRangesStart =
101 ARRAYSIZE(kArabicLettersRangesStart);
102 constexpr char32 kArabicLettersRangesEnd[] = {
103 0x063F, 0x064A, 0x06D5, 0x06FF, 0x077F, 0x08BD, 0xFBFF, 0xFDFB, 0xFEF4};
104 constexpr int kNumArabicLettersRangesEnd = ARRAYSIZE(kArabicLettersRangesEnd);
105
106 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
107 constexpr char32 kCyrillicLettersRangesStart[] = {0x0400, 0x1C80, 0x2DE0,
108 0xA640, 0xA674, 0xA680};
109 constexpr int kNumCyrillicLettersRangesStart =
110 ARRAYSIZE(kCyrillicLettersRangesStart);
111 constexpr char32 kCyrillicLettersRangesEnd[] = {0x052F, 0x1C88, 0x2DFF,
112 0xA66E, 0xA67B, 0xA69F};
113 constexpr int kNumCyrillicLettersRangesEnd =
114 ARRAYSIZE(kCyrillicLettersRangesEnd);
115
116 constexpr char32 kChineseLettersRangesStart[] = {
117 0x4E00, 0xF900, 0x2F800, 0xFE30, 0x3400,
118 0x20000, 0x2A700, 0x2B740, 0x2B820, 0x2CEB0};
119 constexpr int kNumChineseLettersRangesStart =
120 ARRAYSIZE(kChineseLettersRangesStart);
121 constexpr char32 kChineseLettersRangesEnd[] = {
122 0x9FFF, 0xFAFF, 0x2FA1F, 0xFE4F, 0x4DBF,
123 0x2A6DF, 0x2B73F, 0x2B81F, 0x2CEAF, 0x2EBEF};
124 constexpr int kNumChineseLettersRangesEnd = ARRAYSIZE(kChineseLettersRangesEnd);
125
126 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
127 // Hiragana and Katakana
128 constexpr char32 kJapaneseLettersRangesStart[] = {0x3041, 0x30A1, 0x31F0,
129 0xFF66};
130 constexpr int kNumJapaneseLettersRangesStart =
131 ARRAYSIZE(kJapaneseLettersRangesStart);
132 constexpr char32 kJapaneseLettersRangesEnd[] = {0x3096, 0x30FA, 0x31FF, 0xFF9D};
133 constexpr int kNumJapaneseLettersRangesEnd =
134 ARRAYSIZE(kJapaneseLettersRangesEnd);
135
136 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
137 // Hangul
138 constexpr char32 kKoreanLettersRangesStart[] = {0x3131, 0xFFA1};
139 constexpr int kNumKoreanLettersRangesStart =
140 ARRAYSIZE(kKoreanLettersRangesStart);
141 constexpr char32 kKoreanLettersRangesEnd[] = {0x318E, 0xFFDC};
142 constexpr int kNumKoreanLettersRangesEnd = ARRAYSIZE(kKoreanLettersRangesEnd);
143
144 // Source https://unicode-search.net/unicode-namesearch.pl?term=letter
145 constexpr char32 kThaiLettersRangesStart[] = {0x0E01};
146 constexpr int kNumThaiLettersRangesStart = ARRAYSIZE(kThaiLettersRangesStart);
147 constexpr char32 kThaiLettersRangesEnd[] = {0x0E2E};
148 constexpr int kNumThaiLettersRangesEnd = ARRAYSIZE(kThaiLettersRangesEnd);
149
150 // grep -E ";P.;" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
151 constexpr char32 kPunctuationRangesStart[] = {
152 0x0021, 0x0025, 0x002c, 0x003a, 0x003f, 0x005b, 0x005f, 0x007b,
153 0x007d, 0x00a1, 0x00a7, 0x00ab, 0x00b6, 0x00bb, 0x00bf, 0x037e,
154 0x0387, 0x055a, 0x0589, 0x05be, 0x05c0, 0x05c3, 0x05c6, 0x05f3,
155 0x0609, 0x060c, 0x061b, 0x061e, 0x066a, 0x06d4, 0x0700, 0x07f7,
156 0x0830, 0x085e, 0x0964, 0x0970, 0x09fd, 0x0a76, 0x0af0, 0x0c77,
157 0x0c84, 0x0df4, 0x0e4f, 0x0e5a, 0x0f04, 0x0f14, 0x0f3a, 0x0f85,
158 0x0fd0, 0x0fd9, 0x104a, 0x10fb, 0x1360, 0x1400, 0x166e, 0x169b,
159 0x16eb, 0x1735, 0x17d4, 0x17d8, 0x1800, 0x1944, 0x1a1e, 0x1aa0,
160 0x1aa8, 0x1b5a, 0x1bfc, 0x1c3b, 0x1c7e, 0x1cc0, 0x1cd3, 0x2010,
161 0x2030, 0x2045, 0x2053, 0x207d, 0x208d, 0x2308, 0x2329, 0x2768,
162 0x27c5, 0x27e6, 0x2983, 0x29d8, 0x29fc, 0x2cf9, 0x2cfe, 0x2d70,
163 0x2e00, 0x2e30, 0x3001, 0x3008, 0x3014, 0x3030, 0x303d, 0x30a0,
164 0x30fb, 0xa4fe, 0xa60d, 0xa673, 0xa67e, 0xa6f2, 0xa874, 0xa8ce,
165 0xa8f8, 0xa8fc, 0xa92e, 0xa95f, 0xa9c1, 0xa9de, 0xaa5c, 0xaade,
166 0xaaf0, 0xabeb, 0xfd3e, 0xfe10, 0xfe30, 0xfe54, 0xfe63, 0xfe68,
167 0xfe6a, 0xff01, 0xff05, 0xff0c, 0xff1a, 0xff1f, 0xff3b, 0xff3f,
168 0xff5b, 0xff5d, 0xff5f, 0x10100, 0x1039f, 0x103d0, 0x1056f, 0x10857,
169 0x1091f, 0x1093f, 0x10a50, 0x10a7f, 0x10af0, 0x10b39, 0x10b99, 0x10f55,
170 0x11047, 0x110bb, 0x110be, 0x11140, 0x11174, 0x111c5, 0x111cd, 0x111db,
171 0x111dd, 0x11238, 0x112a9, 0x1144b, 0x1145b, 0x1145d, 0x114c6, 0x115c1,
172 0x11641, 0x11660, 0x1173c, 0x1183b, 0x119e2, 0x11a3f, 0x11a9a, 0x11a9e,
173 0x11c41, 0x11c70, 0x11ef7, 0x11fff, 0x12470, 0x16a6e, 0x16af5, 0x16b37,
174 0x16b44, 0x16e97, 0x16fe2, 0x1bc9f, 0x1da87, 0x1e95e};
175 constexpr int kNumPunctuationRangesStart = ARRAYSIZE(kPunctuationRangesStart);
176 constexpr char32 kPunctuationRangesEnd[] = {
177 0x0023, 0x002a, 0x002f, 0x003b, 0x0040, 0x005d, 0x005f, 0x007b,
178 0x007d, 0x00a1, 0x00a7, 0x00ab, 0x00b7, 0x00bb, 0x00bf, 0x037e,
179 0x0387, 0x055f, 0x058a, 0x05be, 0x05c0, 0x05c3, 0x05c6, 0x05f4,
180 0x060a, 0x060d, 0x061b, 0x061f, 0x066d, 0x06d4, 0x070d, 0x07f9,
181 0x083e, 0x085e, 0x0965, 0x0970, 0x09fd, 0x0a76, 0x0af0, 0x0c77,
182 0x0c84, 0x0df4, 0x0e4f, 0x0e5b, 0x0f12, 0x0f14, 0x0f3d, 0x0f85,
183 0x0fd4, 0x0fda, 0x104f, 0x10fb, 0x1368, 0x1400, 0x166e, 0x169c,
184 0x16ed, 0x1736, 0x17d6, 0x17da, 0x180a, 0x1945, 0x1a1f, 0x1aa6,
185 0x1aad, 0x1b60, 0x1bff, 0x1c3f, 0x1c7f, 0x1cc7, 0x1cd3, 0x2027,
186 0x2043, 0x2051, 0x205e, 0x207e, 0x208e, 0x230b, 0x232a, 0x2775,
187 0x27c6, 0x27ef, 0x2998, 0x29db, 0x29fd, 0x2cfc, 0x2cff, 0x2d70,
188 0x2e2e, 0x2e4f, 0x3003, 0x3011, 0x301f, 0x3030, 0x303d, 0x30a0,
189 0x30fb, 0xa4ff, 0xa60f, 0xa673, 0xa67e, 0xa6f7, 0xa877, 0xa8cf,
190 0xa8fa, 0xa8fc, 0xa92f, 0xa95f, 0xa9cd, 0xa9df, 0xaa5f, 0xaadf,
191 0xaaf1, 0xabeb, 0xfd3f, 0xfe19, 0xfe52, 0xfe61, 0xfe63, 0xfe68,
192 0xfe6b, 0xff03, 0xff0a, 0xff0f, 0xff1b, 0xff20, 0xff3d, 0xff3f,
193 0xff5b, 0xff5d, 0xff65, 0x10102, 0x1039f, 0x103d0, 0x1056f, 0x10857,
194 0x1091f, 0x1093f, 0x10a58, 0x10a7f, 0x10af6, 0x10b3f, 0x10b9c, 0x10f59,
195 0x1104d, 0x110bc, 0x110c1, 0x11143, 0x11175, 0x111c8, 0x111cd, 0x111db,
196 0x111df, 0x1123d, 0x112a9, 0x1144f, 0x1145b, 0x1145d, 0x114c6, 0x115d7,
197 0x11643, 0x1166c, 0x1173e, 0x1183b, 0x119e2, 0x11a46, 0x11a9c, 0x11aa2,
198 0x11c45, 0x11c71, 0x11ef8, 0x11fff, 0x12474, 0x16a6f, 0x16af5, 0x16b3b,
199 0x16b44, 0x16e9a, 0x16fe2, 0x1bc9f, 0x1da8b, 0x1e95f};
200 constexpr int kNumPunctuationRangesEnd = ARRAYSIZE(kPunctuationRangesEnd);
201
202 // grep -E "Lu" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
203 // There are three common ways in which upper/lower case codepoint ranges
204 // were introduced: one offs, dense ranges, and ranges that alternate between
205 // lower and upper case. For the sake of keeping out binary size down, we
206 // treat each independently.
207 constexpr char32 kUpperSingles[] = {
208 0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01f1, 0x0376, 0x037f,
209 0x03cf, 0x03f4, 0x03fa, 0x10c7, 0x10cd, 0x2102, 0x2107, 0x2115,
210 0x2145, 0x2183, 0x2c72, 0x2c75, 0x2cf2, 0xa7b6};
211 constexpr int kNumUpperSingles = ARRAYSIZE(kUpperSingles);
212 constexpr char32 kUpperRanges1Start[] = {
213 0x0041, 0x00c0, 0x00d8, 0x0181, 0x018a, 0x018e, 0x0193, 0x0196,
214 0x019c, 0x019f, 0x01b2, 0x01f7, 0x023a, 0x023d, 0x0244, 0x0389,
215 0x0392, 0x03a3, 0x03d2, 0x03fd, 0x0531, 0x10a0, 0x13a0, 0x1f08,
216 0x1f18, 0x1f28, 0x1f38, 0x1f48, 0x1f68, 0x1fb8, 0x1fc8, 0x1fd8,
217 0x1fe8, 0x1ff8, 0x210b, 0x2110, 0x2119, 0x212b, 0x2130, 0x213e,
218 0x2c00, 0x2c63, 0x2c6e, 0x2c7e, 0xa7ab, 0xa7b0};
219 constexpr int kNumUpperRanges1Start = ARRAYSIZE(kUpperRanges1Start);
220 constexpr char32 kUpperRanges1End[] = {
221 0x005a, 0x00d6, 0x00de, 0x0182, 0x018b, 0x0191, 0x0194, 0x0198,
222 0x019d, 0x01a0, 0x01b3, 0x01f8, 0x023b, 0x023e, 0x0246, 0x038a,
223 0x03a1, 0x03ab, 0x03d4, 0x042f, 0x0556, 0x10c5, 0x13f5, 0x1f0f,
224 0x1f1d, 0x1f2f, 0x1f3f, 0x1f4d, 0x1f6f, 0x1fbb, 0x1fcb, 0x1fdb,
225 0x1fec, 0x1ffb, 0x210d, 0x2112, 0x211d, 0x212d, 0x2133, 0x213f,
226 0x2c2e, 0x2c64, 0x2c70, 0x2c80, 0xa7ae, 0xa7b4};
227 constexpr int kNumUpperRanges1End = ARRAYSIZE(kUpperRanges1End);
228 constexpr char32 kUpperRanges2Start[] = {
229 0x0100, 0x0139, 0x014a, 0x0179, 0x0184, 0x0187, 0x01a2, 0x01a7, 0x01ac,
230 0x01af, 0x01b5, 0x01cd, 0x01de, 0x01f4, 0x01fa, 0x0241, 0x0248, 0x0370,
231 0x0386, 0x038c, 0x038f, 0x03d8, 0x03f7, 0x0460, 0x048a, 0x04c1, 0x04d0,
232 0x1e00, 0x1e9e, 0x1f59, 0x2124, 0x2c60, 0x2c67, 0x2c82, 0x2ceb, 0xa640,
233 0xa680, 0xa722, 0xa732, 0xa779, 0xa77e, 0xa78b, 0xa790, 0xa796};
234 constexpr int kNumUpperRanges2Start = ARRAYSIZE(kUpperRanges2Start);
235 constexpr char32 kUpperRanges2End[] = {
236 0x0136, 0x0147, 0x0178, 0x017d, 0x0186, 0x0189, 0x01a6, 0x01a9, 0x01ae,
237 0x01b1, 0x01b7, 0x01db, 0x01ee, 0x01f6, 0x0232, 0x0243, 0x024e, 0x0372,
238 0x0388, 0x038e, 0x0391, 0x03ee, 0x03f9, 0x0480, 0x04c0, 0x04cd, 0x052e,
239 0x1e94, 0x1efe, 0x1f5f, 0x212a, 0x2c62, 0x2c6d, 0x2ce2, 0x2ced, 0xa66c,
240 0xa69a, 0xa72e, 0xa76e, 0xa77d, 0xa786, 0xa78d, 0xa792, 0xa7aa};
241 constexpr int kNumUpperRanges2End = ARRAYSIZE(kUpperRanges2End);
242
243 // grep -E "Ll" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
244 constexpr char32 kLowerSingles[] = {
245 0x00b5, 0x0188, 0x0192, 0x0195, 0x019e, 0x01b0, 0x01c6, 0x01c9,
246 0x01f0, 0x023c, 0x0242, 0x0377, 0x0390, 0x03f5, 0x03f8, 0x1fbe,
247 0x210a, 0x2113, 0x212f, 0x2134, 0x2139, 0x214e, 0x2184, 0x2c61,
248 0x2ce4, 0x2cf3, 0x2d27, 0x2d2d, 0xa7af, 0xa7c3, 0xa7fa, 0x1d7cb};
249 constexpr int kNumLowerSingles = ARRAYSIZE(kLowerSingles);
250 constexpr char32 kLowerRanges1Start[] = {
251 0x0061, 0x00df, 0x00f8, 0x017f, 0x018c, 0x0199, 0x01b9, 0x01bd,
252 0x0234, 0x023f, 0x0250, 0x0295, 0x037b, 0x03ac, 0x03d0, 0x03d5,
253 0x03f0, 0x03fb, 0x0430, 0x0560, 0x10d0, 0x10fd, 0x13f8, 0x1c80,
254 0x1d00, 0x1d6b, 0x1d79, 0x1e96, 0x1f00, 0x1f10, 0x1f20, 0x1f30,
255 0x1f40, 0x1f50, 0x1f60, 0x1f70, 0x1f80, 0x1f90, 0x1fa0, 0x1fb0,
256 0x1fb6, 0x1fc2, 0x1fc6, 0x1fd0, 0x1fd6, 0x1fe0, 0x1ff2, 0x1ff6,
257 0x210e, 0x213c, 0x2146, 0x2c30, 0x2c65, 0x2c77, 0x2d00, 0xa730,
258 0xa772, 0xa794, 0xab30, 0xab60, 0xab70, 0xfb00, 0xfb13, 0xff41,
259 0x10428, 0x104d8, 0x10cc0, 0x118c0, 0x16e60, 0x1d41a, 0x1d44e, 0x1d456,
260 0x1d482, 0x1d4b6, 0x1d4be, 0x1d4c5, 0x1d4ea, 0x1d51e, 0x1d552, 0x1d586,
261 0x1d5ba, 0x1d5ee, 0x1d622, 0x1d656, 0x1d68a, 0x1d6c2, 0x1d6dc, 0x1d6fc,
262 0x1d716, 0x1d736, 0x1d750, 0x1d770, 0x1d78a, 0x1d7aa, 0x1d7c4, 0x1e922};
263 constexpr int kNumLowerRanges1Start = ARRAYSIZE(kLowerRanges1Start);
264 constexpr char32 kLowerRanges1End[] = {
265 0x007a, 0x00f6, 0x00ff, 0x0180, 0x018d, 0x019b, 0x01ba, 0x01bf,
266 0x0239, 0x0240, 0x0293, 0x02af, 0x037d, 0x03ce, 0x03d1, 0x03d7,
267 0x03f3, 0x03fc, 0x045f, 0x0588, 0x10fa, 0x10ff, 0x13fd, 0x1c88,
268 0x1d2b, 0x1d77, 0x1d9a, 0x1e9d, 0x1f07, 0x1f15, 0x1f27, 0x1f37,
269 0x1f45, 0x1f57, 0x1f67, 0x1f7d, 0x1f87, 0x1f97, 0x1fa7, 0x1fb4,
270 0x1fb7, 0x1fc4, 0x1fc7, 0x1fd3, 0x1fd7, 0x1fe7, 0x1ff4, 0x1ff7,
271 0x210f, 0x213d, 0x2149, 0x2c5e, 0x2c66, 0x2c7b, 0x2d25, 0xa731,
272 0xa778, 0xa795, 0xab5a, 0xab67, 0xabbf, 0xfb06, 0xfb17, 0xff5a,
273 0x1044f, 0x104fb, 0x10cf2, 0x118df, 0x16e7f, 0x1d433, 0x1d454, 0x1d467,
274 0x1d49b, 0x1d4b9, 0x1d4c3, 0x1d4cf, 0x1d503, 0x1d537, 0x1d56b, 0x1d59f,
275 0x1d5d3, 0x1d607, 0x1d63b, 0x1d66f, 0x1d6a5, 0x1d6da, 0x1d6e1, 0x1d714,
276 0x1d71b, 0x1d74e, 0x1d755, 0x1d788, 0x1d78f, 0x1d7c2, 0x1d7c9, 0x1e943};
277 constexpr int kNumLowerRanges1End = ARRAYSIZE(kLowerRanges1End);
278 constexpr char32 kLowerRanges2Start[] = {
279 0x0101, 0x0138, 0x0149, 0x017a, 0x0183, 0x01a1, 0x01a8, 0x01ab,
280 0x01b4, 0x01cc, 0x01dd, 0x01f3, 0x01f9, 0x0247, 0x0371, 0x03d9,
281 0x0461, 0x048b, 0x04c2, 0x04cf, 0x1e01, 0x1e9f, 0x2c68, 0x2c71,
282 0x2c74, 0x2c81, 0x2cec, 0xa641, 0xa681, 0xa723, 0xa733, 0xa77a,
283 0xa77f, 0xa78c, 0xa791, 0xa797, 0xa7b5, 0x1d4bb};
284 constexpr int kNumLowerRanges2Start = ARRAYSIZE(kLowerRanges2Start);
285 constexpr char32 kLowerRanges2End[] = {
286 0x0137, 0x0148, 0x0177, 0x017e, 0x0185, 0x01a5, 0x01aa, 0x01ad,
287 0x01b6, 0x01dc, 0x01ef, 0x01f5, 0x0233, 0x024f, 0x0373, 0x03ef,
288 0x0481, 0x04bf, 0x04ce, 0x052f, 0x1e95, 0x1eff, 0x2c6c, 0x2c73,
289 0x2c76, 0x2ce3, 0x2cee, 0xa66d, 0xa69b, 0xa72f, 0xa771, 0xa77c,
290 0xa787, 0xa78e, 0xa793, 0xa7a9, 0xa7bf, 0x1d4bd};
291 constexpr int kNumLowerRanges2End = ARRAYSIZE(kLowerRanges2End);
292
293 // grep -E "Lu" UnicodeData.txt | \
294 // sed -rne "s/^([0-9A-Z]+);.*;([0-9A-Z]+);$/(0x\1, 0x\2), /p"
295 // We have two strategies for mapping from upper to lower case. We have single
296 // character lookups that do not follow a pattern, and ranges for which there
297 // is a constant codepoint shift.
298 // Note that these ranges ignore anything that's not an upper case character,
299 // so when applied to a non-uppercase character the result is incorrect.
300 constexpr int kToLowerSingles[] = {
301 0x0130, 0x0178, 0x0181, 0x0186, 0x018b, 0x018e, 0x018f, 0x0190, 0x0191,
302 0x0194, 0x0196, 0x0197, 0x0198, 0x019c, 0x019d, 0x019f, 0x01a6, 0x01a9,
303 0x01ae, 0x01b7, 0x01f6, 0x01f7, 0x0220, 0x023a, 0x023d, 0x023e, 0x0243,
304 0x0244, 0x0245, 0x037f, 0x0386, 0x038c, 0x03cf, 0x03f4, 0x03f9, 0x04c0,
305 0x1e9e, 0x1fec, 0x2126, 0x212a, 0x212b, 0x2132, 0x2183, 0x2c60, 0x2c62,
306 0x2c63, 0x2c64, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, 0xa77d, 0xa78d, 0xa7aa,
307 0xa7ab, 0xa7ac, 0xa7ad, 0xa7ae, 0xa7b0, 0xa7b1, 0xa7b2, 0xa7b3};
308 constexpr int kNumToLowerSingles = ARRAYSIZE(kToLowerSingles);
309 constexpr int kToLowerSinglesOffsets[] = {
310 -199, -121, 210, 206, 1, 79, 202, 203, 1,
311 207, 211, 209, 1, 211, 213, 214, 218, 218,
312 218, 219, -97, -56, -130, 10795, -163, 10792, -195,
313 69, 71, 116, 38, 64, 8, -60, -7, 15,
314 -7615, -7, -7517, -8383, -8262, 28, 1, 1, -10743,
315 -3814, -10727, -10780, -10749, -10783, -10782, -35332, -42280, -42308,
316 -42319, -42315, -42305, -42308, -42258, -42282, -42261, 928};
317 constexpr int kNumToLowerSinglesOffsets = ARRAYSIZE(kToLowerSinglesOffsets);
318 constexpr int kToUpperSingles[] = {
319 0x00b5, 0x00ff, 0x0131, 0x017f, 0x0180, 0x0195, 0x0199, 0x019a, 0x019e,
320 0x01bf, 0x01dd, 0x01f3, 0x0250, 0x0251, 0x0252, 0x0253, 0x0254, 0x0259,
321 0x025b, 0x025c, 0x0260, 0x0261, 0x0263, 0x0265, 0x0266, 0x0268, 0x0269,
322 0x026a, 0x026b, 0x026c, 0x026f, 0x0271, 0x0272, 0x0275, 0x027d, 0x0280,
323 0x0282, 0x0283, 0x0287, 0x0288, 0x0289, 0x028c, 0x0292, 0x029d, 0x029e,
324 0x03ac, 0x03c2, 0x03cc, 0x03d0, 0x03d1, 0x03d5, 0x03d6, 0x03d7, 0x03f0,
325 0x03f1, 0x03f2, 0x03f3, 0x03f5, 0x04cf, 0x1c80, 0x1c81, 0x1c82, 0x1c85,
326 0x1c86, 0x1c87, 0x1c88, 0x1d79, 0x1d7d, 0x1d8e, 0x1e9b, 0x1fb3, 0x1fbe,
327 0x1fc3, 0x1fe5, 0x1ff3, 0x214e, 0x2184, 0x2c61, 0x2c65, 0x2c66, 0xa794,
328 0xab53};
329 constexpr int kNumToUpperSingles = ARRAYSIZE(kToUpperSingles);
330 constexpr int kToUpperSinglesOffsets[] = {
331 743, 121, -232, -300, 195, 97, -1, 163, 130, 56,
332 -79, -2, 10783, 10780, 10782, -210, -206, -202, -203, 42319,
333 -205, 42315, -207, 42280, 42308, -209, -211, 42308, 10743, 42305,
334 -211, 10749, -213, -214, 10727, -218, 42307, -218, 42282, -218,
335 -69, -71, -219, 42261, 42258, -38, -31, -64, -62, -57,
336 -47, -54, -8, -86, -80, 7, -116, -96, -15, -6254,
337 -6253, -6244, -6243, -6236, -6181, 35266, 35332, 3814, 35384, -59,
338 9, -7205, 9, 7, 9, -28, -1, -1, -10795, -10792,
339 48, -928};
340 constexpr int kNumToUpperSinglesOffsets = ARRAYSIZE(kToUpperSinglesOffsets);
341 constexpr int kToLowerRangesStart[] = {
342 0x0041, 0x0100, 0x0189, 0x01a0, 0x01b1, 0x01b3, 0x0388, 0x038e, 0x0391,
343 0x03d8, 0x03fd, 0x0400, 0x0410, 0x0460, 0x0531, 0x10a0, 0x13a0, 0x13f0,
344 0x1e00, 0x1f08, 0x1fba, 0x1fc8, 0x1fd8, 0x1fda, 0x1fe8, 0x1fea, 0x1ff8,
345 0x1ffa, 0x2c00, 0x2c67, 0x2c7e, 0x2c80, 0xff21, 0x10400, 0x10c80, 0x118a0};
346 constexpr int kNumToLowerRangesStart = ARRAYSIZE(kToLowerRangesStart);
347 constexpr int kToLowerRangesEnd[] = {
348 0x00de, 0x0187, 0x019f, 0x01af, 0x01b2, 0x0386, 0x038c, 0x038f, 0x03cf,
349 0x03fa, 0x03ff, 0x040f, 0x042f, 0x052e, 0x0556, 0x10cd, 0x13ef, 0x13f5,
350 0x1efe, 0x1fb9, 0x1fbb, 0x1fcb, 0x1fd9, 0x1fdb, 0x1fe9, 0x1fec, 0x1ff9,
351 0x2183, 0x2c64, 0x2c75, 0x2c7f, 0xa7b6, 0xff3a, 0x104d3, 0x10cb2, 0x118bf};
352 constexpr int kNumToLowerRangesEnd = ARRAYSIZE(kToLowerRangesEnd);
353 constexpr int kToLowerRangesOffsets[] = {
354 32, 1, 205, 1, 217, 1, 37, 63, 32, 1, -130, 80,
355 32, 1, 48, 7264, 38864, 8, 1, -8, -74, -86, -8, -100,
356 -8, -112, -128, -126, 48, 1, -10815, 1, 32, 40, 64, 32};
357 constexpr int kNumToLowerRangesOffsets = ARRAYSIZE(kToLowerRangesOffsets);
358 constexpr int kToUpperRangesStart[] = {
359 0x0061, 0x0101, 0x01c6, 0x01ce, 0x023f, 0x0242, 0x0256, 0x028a,
360 0x0371, 0x037b, 0x03ad, 0x03b1, 0x03cd, 0x03d9, 0x0430, 0x0450,
361 0x0461, 0x0561, 0x10d0, 0x13f8, 0x1c83, 0x1e01, 0x1f00, 0x1f70,
362 0x1f72, 0x1f76, 0x1f78, 0x1f7a, 0x1f7c, 0x1f80, 0x2c30, 0x2c68,
363 0x2d00, 0xa641, 0xab70, 0xff41, 0x10428, 0x10cc0, 0x118c0};
364 constexpr int kNumToUpperRangesStart = ARRAYSIZE(kToUpperRangesStart);
365 constexpr int kToUpperRangesEnd[] = {
366 0x00fe, 0x01bd, 0x01cc, 0x023c, 0x0240, 0x024f, 0x0257, 0x028b,
367 0x0377, 0x037d, 0x03af, 0x03cb, 0x03ce, 0x03fb, 0x044f, 0x045f,
368 0x052f, 0x0586, 0x10ff, 0x13fd, 0x1c84, 0x1eff, 0x1f67, 0x1f71,
369 0x1f75, 0x1f77, 0x1f79, 0x1f7b, 0x1f7d, 0x1fe1, 0x2c5e, 0x2cf3,
370 0x2d2d, 0xa7c3, 0xabbf, 0xff5a, 0x104fb, 0x10cf2, 0x16e7f};
371 constexpr int kNumToUpperRangesEnd = ARRAYSIZE(kToUpperRangesEnd);
372 constexpr int kToUpperRangesOffsets[]{
373 -32, -1, -2, -1, 10815, -1, -205, -217, -1, 130, -37, -32, -63,
374 -1, -32, -80, -1, -48, 3008, -8, -6242, -1, 8, 74, 86, 100,
375 128, 112, 126, 8, -48, -1, -7264, -1, -38864, -32, -40, -64, -32};
376 constexpr int kNumToUpperRangesOffsets = ARRAYSIZE(kToUpperRangesOffsets);
377
378 // Source: https://unicode-search.net/unicode-namesearch.pl?term=PERCENT
379 constexpr char32 kPercentages[] = {0x0025, 0x066A, 0xFE6A, 0xFF05};
380 constexpr int kNumPercentages = ARRAYSIZE(kPercentages);
381
382 // Source from https://unicode-search.net/unicode-namesearch.pl?term=SLASH
383 constexpr char32 kSlashes[] = {0x002f, 0x0337, 0x0338, 0x2044, 0x2215, 0xff0f};
384 constexpr int kNumSlashes = ARRAYSIZE(kSlashes);
385
386 // Source: https://unicode-search.net/unicode-namesearch.pl?term=minus
387 constexpr char32 kMinuses[] = {0x002d, 0x02d7, 0x2212, 0xff0d};
388 constexpr int kNumMinuses = ARRAYSIZE(kMinuses);
389
390 // Source: https://unicode-search.net/unicode-namesearch.pl?term=NUMBER%20SIGN
391 constexpr char32 kNumberSign[] = {0x0023, 0xfe5f, 0xff03};
392 constexpr int kNumNumberSign = ARRAYSIZE(kNumberSign);
393
394 // Source: https://unicode-search.net/unicode-namesearch.pl?term=period
395 constexpr char32 kDots[] = {0x002e, 0xfe52, 0xff0e};
396 constexpr int kNumDots = ARRAYSIZE(kDots);
397
398 // Source: https://unicode-search.net/unicode-namesearch.pl?term=Apostrophe
399 constexpr char32 kApostrophe[] = {0x0027, 0x02BC, 0x02EE, 0x055A,
400 0x07F4, 0x07F5, 0xFF07};
401 constexpr int kNumApostrophe = ARRAYSIZE(kApostrophe);
402
403 // Source: https://unicode-search.net/unicode-namesearch.pl?term=Quotation
404 constexpr char32 kQuotation[] = {
405 0x0022, 0x00AB, 0x00BB, 0x2018, 0x2019, 0x201A, 0x201B, 0x201C,
406 0x201D, 0x201E, 0x201F, 0x2039, 0x203A, 0x275B, 0x275C, 0x275D,
407 0x275E, 0x276E, 0x276F, 0x2E42, 0x301D, 0x301E, 0x301F, 0xFF02};
408 constexpr int kNumQuotation = ARRAYSIZE(kQuotation);
409
410 // Source: https://unicode-search.net/unicode-namesearch.pl?term=ampersand
411 constexpr char32 kAmpersand[] = {0x0026, 0xFE60, 0xFF06, 0x1F674, 0x1F675};
412 constexpr int kNumAmpersand = ARRAYSIZE(kAmpersand);
413
414 #undef ARRAYSIZE
415
416 static_assert(kNumOpeningBrackets == kNumClosingBrackets,
417 "mismatching number of opening and closing brackets");
418 static_assert(kNumLowerRanges1Start == kNumLowerRanges1End,
419 "number of uppercase stride 1 range starts/ends doesn't match");
420 static_assert(kNumLowerRanges2Start == kNumLowerRanges2End,
421 "number of uppercase stride 2 range starts/ends doesn't match");
422 static_assert(kNumUpperRanges1Start == kNumUpperRanges1End,
423 "number of uppercase stride 1 range starts/ends doesn't match");
424 static_assert(kNumUpperRanges2Start == kNumUpperRanges2End,
425 "number of uppercase stride 2 range starts/ends doesn't match");
426 static_assert(kNumToLowerSingles == kNumToLowerSinglesOffsets,
427 "number of to lower singles and offsets doesn't match");
428 static_assert(kNumToLowerRangesStart == kNumToLowerRangesEnd,
429 "mismatching number of range starts/ends for to lower ranges");
430 static_assert(kNumToLowerRangesStart == kNumToLowerRangesOffsets,
431 "number of to lower ranges and offsets doesn't match");
432 static_assert(kNumToUpperSingles == kNumToUpperSinglesOffsets,
433 "number of to upper singles and offsets doesn't match");
434 static_assert(kNumToUpperRangesStart == kNumToUpperRangesEnd,
435 "mismatching number of range starts/ends for to upper ranges");
436 static_assert(kNumToUpperRangesStart == kNumToUpperRangesOffsets,
437 "number of to upper ranges and offsets doesn't match");
438 static_assert(kNumPunctuationRangesStart == kNumPunctuationRangesEnd,
439 "mismatch number of start/ends for punctuation ranges.");
440 static_assert(kNumLatinLettersRangesStart == kNumLatinLettersRangesEnd,
441 "mismatch number of start/ends for letters ranges.");
442 static_assert(kNumArabicLettersRangesStart == kNumArabicLettersRangesEnd,
443 "mismatch number of start/ends for letters ranges.");
444 static_assert(kNumCyrillicLettersRangesStart == kNumCyrillicLettersRangesEnd,
445 "mismatch number of start/ends for letters ranges.");
446 static_assert(kNumChineseLettersRangesStart == kNumChineseLettersRangesEnd,
447 "mismatch number of start/ends for letters ranges.");
448 static_assert(kNumJapaneseLettersRangesStart == kNumJapaneseLettersRangesEnd,
449 "mismatch number of start/ends for letters ranges.");
450 static_assert(kNumKoreanLettersRangesStart == kNumKoreanLettersRangesEnd,
451 "mismatch number of start/ends for letters ranges.");
452 static_assert(kNumThaiLettersRangesStart == kNumThaiLettersRangesEnd,
453 "mismatch number of start/ends for letters ranges.");
454
455 constexpr int kNoMatch = -1;
456
457 // Returns the index of the element in the array that matched the given
458 // codepoint, or kNoMatch if the element didn't exist.
459 // The input array must be in sorted order.
GetMatchIndex(const char32 * array,int array_length,char32 c)460 int GetMatchIndex(const char32* array, int array_length, char32 c) {
461 const char32* end = array + array_length;
462 const auto find_it = std::lower_bound(array, end, c);
463 if (find_it != end && *find_it == c) {
464 return find_it - array;
465 } else {
466 return kNoMatch;
467 }
468 }
469
470 // Returns the index of the range in the array that overlapped the given
471 // codepoint, or kNoMatch if no such range existed.
472 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * arr,int arr_length,int range_length,char32 c)473 int GetOverlappingRangeIndex(const char32* arr, int arr_length,
474 int range_length, char32 c) {
475 const char32* end = arr + arr_length;
476 const auto find_it = std::lower_bound(arr, end, c);
477 if (find_it == end) {
478 return kNoMatch;
479 }
480 // The end is inclusive, we so subtract one less than the range length.
481 const char32 range_end = *find_it;
482 const char32 range_start = range_end - (range_length - 1);
483 if (c < range_start || range_end < c) {
484 return kNoMatch;
485 } else {
486 return find_it - arr;
487 }
488 }
489
490 // As above, but with explicit codepoint start and end indices for the range.
491 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * start_arr,const char32 * end_arr,int arr_length,int stride,char32 c)492 int GetOverlappingRangeIndex(const char32* start_arr, const char32* end_arr,
493 int arr_length, int stride, char32 c) {
494 const char32* end_arr_end = end_arr + arr_length;
495 const auto find_it = std::lower_bound(end_arr, end_arr_end, c);
496 if (find_it == end_arr_end) {
497 return kNoMatch;
498 }
499 // Find the corresponding start.
500 const int range_index = find_it - end_arr;
501 const char32 range_start = start_arr[range_index];
502 const char32 range_end = *find_it;
503 if (c < range_start || range_end < c) {
504 return kNoMatch;
505 }
506 if ((c - range_start) % stride == 0) {
507 return range_index;
508 } else {
509 return kNoMatch;
510 }
511 }
512
513 } // anonymous namespace
514
IsOpeningBracket(char32 codepoint)515 bool IsOpeningBracket(char32 codepoint) {
516 return GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint) >= 0;
517 }
518
IsClosingBracket(char32 codepoint)519 bool IsClosingBracket(char32 codepoint) {
520 return GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint) >= 0;
521 }
522
IsWhitespace(char32 codepoint)523 bool IsWhitespace(char32 codepoint) {
524 return GetMatchIndex(kWhitespaces, kNumWhitespaces, codepoint) >= 0;
525 }
526
IsBidirectional(char32 codepoint)527 bool IsBidirectional(char32 codepoint) {
528 return GetMatchIndex(kBidirectional, kNumBidirectional, codepoint) >= 0;
529 }
530
IsDigit(char32 codepoint)531 bool IsDigit(char32 codepoint) {
532 return GetOverlappingRangeIndex(kDecimalDigitRangesEnd,
533 kNumDecimalDigitRangesEnd,
534 /*range_length=*/10, codepoint) >= 0;
535 }
536
IsLower(char32 codepoint)537 bool IsLower(char32 codepoint) {
538 if (GetMatchIndex(kLowerSingles, kNumLowerSingles, codepoint) >= 0) {
539 return true;
540 } else if (GetOverlappingRangeIndex(kLowerRanges1Start, kLowerRanges1End,
541 kNumLowerRanges1Start, /*stride=*/1,
542 codepoint) >= 0) {
543 return true;
544 } else if (GetOverlappingRangeIndex(kLowerRanges2Start, kLowerRanges2End,
545 kNumLowerRanges2Start, /*stride=*/2,
546 codepoint) >= 0) {
547 return true;
548 } else {
549 return false;
550 }
551 }
552
IsUpper(char32 codepoint)553 bool IsUpper(char32 codepoint) {
554 if (GetMatchIndex(kUpperSingles, kNumUpperSingles, codepoint) >= 0) {
555 return true;
556 } else if (GetOverlappingRangeIndex(kUpperRanges1Start, kUpperRanges1End,
557 kNumUpperRanges1Start, /*stride=*/1,
558 codepoint) >= 0) {
559 return true;
560 } else if (GetOverlappingRangeIndex(kUpperRanges2Start, kUpperRanges2End,
561 kNumUpperRanges2Start, /*stride=*/2,
562 codepoint) >= 0) {
563 return true;
564 } else {
565 return false;
566 }
567 }
568
IsPunctuation(char32 codepoint)569 bool IsPunctuation(char32 codepoint) {
570 return (GetOverlappingRangeIndex(
571 kPunctuationRangesStart, kPunctuationRangesEnd,
572 kNumPunctuationRangesStart, /*stride=*/1, codepoint) >= 0);
573 }
574
IsPercentage(char32 codepoint)575 bool IsPercentage(char32 codepoint) {
576 return GetMatchIndex(kPercentages, kNumPercentages, codepoint) >= 0;
577 }
578
IsSlash(char32 codepoint)579 bool IsSlash(char32 codepoint) {
580 return GetMatchIndex(kSlashes, kNumSlashes, codepoint) >= 0;
581 }
582
IsMinus(char32 codepoint)583 bool IsMinus(char32 codepoint) {
584 return GetMatchIndex(kMinuses, kNumMinuses, codepoint) >= 0;
585 }
586
IsNumberSign(char32 codepoint)587 bool IsNumberSign(char32 codepoint) {
588 return GetMatchIndex(kNumberSign, kNumNumberSign, codepoint) >= 0;
589 }
590
IsDot(char32 codepoint)591 bool IsDot(char32 codepoint) {
592 return GetMatchIndex(kDots, kNumDots, codepoint) >= 0;
593 }
594
IsApostrophe(char32 codepoint)595 bool IsApostrophe(char32 codepoint) {
596 return GetMatchIndex(kApostrophe, kNumApostrophe, codepoint) >= 0;
597 }
598
IsQuotation(char32 codepoint)599 bool IsQuotation(char32 codepoint) {
600 return GetMatchIndex(kQuotation, kNumQuotation, codepoint) >= 0;
601 }
602
IsAmpersand(char32 codepoint)603 bool IsAmpersand(char32 codepoint) {
604 return GetMatchIndex(kAmpersand, kNumAmpersand, codepoint) >= 0;
605 }
606
IsLatinLetter(char32 codepoint)607 bool IsLatinLetter(char32 codepoint) {
608 return (GetOverlappingRangeIndex(
609 kLatinLettersRangesStart, kLatinLettersRangesEnd,
610 kNumLatinLettersRangesStart, /*stride=*/1, codepoint) >= 0);
611 }
612
IsArabicLetter(char32 codepoint)613 bool IsArabicLetter(char32 codepoint) {
614 return (GetOverlappingRangeIndex(
615 kArabicLettersRangesStart, kArabicLettersRangesEnd,
616 kNumArabicLettersRangesStart, /*stride=*/1, codepoint) >= 0);
617 }
618
IsCyrillicLetter(char32 codepoint)619 bool IsCyrillicLetter(char32 codepoint) {
620 return (GetOverlappingRangeIndex(
621 kCyrillicLettersRangesStart, kCyrillicLettersRangesEnd,
622 kNumCyrillicLettersRangesStart, /*stride=*/1, codepoint) >= 0);
623 }
624
IsChineseLetter(char32 codepoint)625 bool IsChineseLetter(char32 codepoint) {
626 return (GetOverlappingRangeIndex(
627 kChineseLettersRangesStart, kChineseLettersRangesEnd,
628 kNumChineseLettersRangesStart, /*stride=*/1, codepoint) >= 0);
629 }
630
IsJapaneseLetter(char32 codepoint)631 bool IsJapaneseLetter(char32 codepoint) {
632 return (GetOverlappingRangeIndex(
633 kJapaneseLettersRangesStart, kJapaneseLettersRangesEnd,
634 kNumJapaneseLettersRangesStart, /*stride=*/1, codepoint) >= 0);
635 }
636
IsKoreanLetter(char32 codepoint)637 bool IsKoreanLetter(char32 codepoint) {
638 return (GetOverlappingRangeIndex(
639 kKoreanLettersRangesStart, kKoreanLettersRangesEnd,
640 kNumKoreanLettersRangesStart, /*stride=*/1, codepoint) >= 0);
641 }
642
IsThaiLetter(char32 codepoint)643 bool IsThaiLetter(char32 codepoint) {
644 return (GetOverlappingRangeIndex(
645 kThaiLettersRangesStart, kThaiLettersRangesEnd,
646 kNumThaiLettersRangesStart, /*stride=*/1, codepoint) >= 0);
647 }
648
IsCJTletter(char32 codepoint)649 bool IsCJTletter(char32 codepoint) {
650 return IsJapaneseLetter(codepoint) || IsChineseLetter(codepoint) ||
651 IsThaiLetter(codepoint);
652 }
653
IsLetter(char32 codepoint)654 bool IsLetter(char32 codepoint) {
655 return IsLatinLetter(codepoint) || IsArabicLetter(codepoint) ||
656 IsCyrillicLetter(codepoint) || IsJapaneseLetter(codepoint) ||
657 IsKoreanLetter(codepoint) || IsThaiLetter(codepoint) ||
658 IsChineseLetter(codepoint);
659 }
660
ToLower(char32 codepoint)661 char32 ToLower(char32 codepoint) {
662 // Make sure we still produce output even if the method is called for a
663 // codepoint that's not an uppercase character.
664 if (!IsUpper(codepoint)) {
665 return codepoint;
666 }
667 const int singles_idx =
668 GetMatchIndex(kToLowerSingles, kNumToLowerSingles, codepoint);
669 if (singles_idx >= 0) {
670 return codepoint + kToLowerSinglesOffsets[singles_idx];
671 }
672 const int ranges_idx =
673 GetOverlappingRangeIndex(kToLowerRangesStart, kToLowerRangesEnd,
674 kNumToLowerRangesStart, /*stride=*/1, codepoint);
675 if (ranges_idx >= 0) {
676 return codepoint + kToLowerRangesOffsets[ranges_idx];
677 }
678 return codepoint;
679 }
680
ToUpper(char32 codepoint)681 char32 ToUpper(char32 codepoint) {
682 // Make sure we still produce output even if the method is called for a
683 // codepoint that's not an uppercase character.
684 if (!IsLower(codepoint)) {
685 return codepoint;
686 }
687 const int singles_idx =
688 GetMatchIndex(kToUpperSingles, kNumToUpperSingles, codepoint);
689 if (singles_idx >= 0) {
690 return codepoint + kToUpperSinglesOffsets[singles_idx];
691 }
692 const int ranges_idx =
693 GetOverlappingRangeIndex(kToUpperRangesStart, kToUpperRangesEnd,
694 kNumToUpperRangesStart, /*stride=*/1, codepoint);
695 if (ranges_idx >= 0) {
696 return codepoint + kToUpperRangesOffsets[ranges_idx];
697 }
698 return codepoint;
699 }
700
GetPairedBracket(char32 codepoint)701 char32 GetPairedBracket(char32 codepoint) {
702 const int open_offset =
703 GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint);
704 if (open_offset >= 0) {
705 return kClosingBrackets[open_offset];
706 }
707 const int close_offset =
708 GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint);
709 if (close_offset >= 0) {
710 return kOpeningBrackets[close_offset];
711 }
712 return codepoint;
713 }
714
715 } // namespace libtextclassifier3
716