1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/i18n/string_search.h"
6
7 #include <stddef.h>
8
9 #include <string>
10 #include <vector>
11
12 #include "base/i18n/rtl.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "testing/gtest/include/gtest/gtest.h"
15 #include "third_party/icu/source/i18n/unicode/usearch.h"
16
17 namespace base {
18 namespace i18n {
19
// Expects |find_this| to be found in |in_this| at offset |ex_start| with match
// length |ex_len|, first via StringSearchIgnoringCaseAndAccents() and then via
// a case-insensitive, forward StringSearch().
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. Every argument
// is expanded more than once, and the expansion declares locals named |index|
// and |length|, so arguments must not have side effects or use those names.
#define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \
                                                   &index, &length)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
    index = 0; \
    length = 0; \
    EXPECT_TRUE(StringSearch(find_this, in_this, &index, &length, \
                             /*case_sensitive=*/false, \
                             /*forward_search=*/true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
35
// Expects a case-sensitive, forward StringSearch() for |find_this| within
// |in_this| to succeed at offset |ex_start| with match length |ex_len|.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearch(find_this, in_this, &index, &length, \
                             /*case_sensitive=*/true, \
                             /*forward_search=*/true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
45
// Expects a case-insensitive, backwards StringSearch() for |find_this| within
// |in_this| to succeed at offset |ex_start| with match length |ex_len|.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start, \
                                           ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearch(find_this, in_this, &index, &length, \
                             /*case_sensitive=*/false, \
                             /*forward_search=*/false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
56
// Expects a case-sensitive, backwards StringSearch() for |find_this| within
// |in_this| to succeed at offset |ex_start| with match length |ex_len|.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearch(find_this, in_this, &index, &length, \
                             /*case_sensitive=*/true, \
                             /*forward_search=*/false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
66
// Expects |find_this| NOT to be found in |in_this|, first via
// StringSearchIgnoringCaseAndAccents() and then via a case-insensitive,
// forward StringSearch().
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. Both arguments
// are expanded twice, and the expansion declares locals named |index| and
// |length|, so arguments must not have side effects or use those names.
#define EXPECT_MISS_IGNORE_CASE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \
                                                    &index, &length)); \
    index = 0; \
    length = 0; \
    EXPECT_FALSE(StringSearch(find_this, in_this, &index, &length, \
                              /*case_sensitive=*/false, \
                              /*forward_search=*/true)); \
  } while (0)
78
// Expects a case-sensitive, forward StringSearch() for |find_this| within
// |in_this| to fail.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MISS_SENSITIVE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearch(find_this, in_this, &index, &length, \
                              /*case_sensitive=*/true, \
                              /*forward_search=*/true)); \
  } while (0)
86
// Expects a case-insensitive, backwards StringSearch() for |find_this| within
// |in_this| to fail.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearch(find_this, in_this, &index, &length, \
                              /*case_sensitive=*/false, \
                              /*forward_search=*/false)); \
  } while (0)
94
// Expects a case-sensitive, backwards StringSearch() for |find_this| within
// |in_this| to fail.
//
// The body is wrapped in do/while(0) so that `MACRO(...);` is exactly one
// statement and stays correct inside unbraced if/else bodies. The expansion
// declares locals named |index| and |length|, so arguments must not use those
// names.
#define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearch(find_this, in_this, &index, &length, \
                              /*case_sensitive=*/true, \
                              /*forward_search=*/false)); \
  } while (0)
102
// Note on setting the default locale for testing: the default locale on the
// Mac trybot is en_US_POSIX. With that locale, string search at primary-level
// collation strength is case-sensitive, although it would normally be
// case-insensitive. In other locales (including en_US, which English speakers
// in the U.S. use), this search is case-insensitive as expected. Tests below
// that depend on this therefore switch an en_US_POSIX default locale to en_US
// for their duration and restore it afterwards.
108
// Exercises forward searches over plain ASCII text, both ignoring case and
// accents and case-sensitively, including empty pattern and empty text.
TEST(StringSearchTest, ASCII) {
  // en_US_POSIX is swapped for en_US while the test runs and restored at the
  // end; any other default locale is left untouched.
  const std::string original_locale(uloc_getDefault());
  const bool must_override_locale = (original_locale == "en_US_POSIX");
  if (must_override_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Ignoring case and accents ---
  EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U);

  // Whitespace is significant: a pattern containing a longer run of spaces
  // than the text must not match. (The two literals have to differ; expecting
  // a miss for a string searched within an identical string would contradict
  // the match expectations in this test.)
  EXPECT_MISS_IGNORE_CASE(u"h    e l l o", u"h   e l l o");

  // The match is the occurrence starting at offset 4, not an earlier partial
  // overlap.
  EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  // Nothing can be found inside an empty text...
  EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string());

  // ...but an empty pattern matches at offset 0 with length 0.
  EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string", 0U,
                           0U);

  EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY", 0U,
                           18U);

  // --- Case sensitive ---
  EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string());

  EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string", 0U,
                         0U);

  EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY");

  if (must_override_locale) {
    SetICUDefaultLocale(original_locale.c_str());
  }
}
141
// Checks how accented Latin letters, in both precomposed and decomposed
// (base letter + combining mark) form, compare against each other and against
// their unaccented base letters, with and without case/accent sensitivity.
TEST(StringSearchTest, UnicodeLocaleIndependent) {
  // Unaccented letters.
  const std::u16string plain_e = u"e";
  const std::u16string plain_upper_e = u"E";
  const std::u16string plain_a = u"a";

  // Precomposed (single code unit) accented letters.
  const std::u16string e_acute = u"\u00e9";
  const std::u16string upper_e_acute = u"\u00c9";
  const std::u16string e_grave = u"\u00e8";
  const std::u16string upper_e_grave = u"\u00c8";
  const std::u16string a_acute = u"\u00e1";

  // Decomposed forms: base letter followed by a combining mark.
  const std::u16string e_acute_decomposed = u"e\u0301";
  const std::u16string upper_e_acute_decomposed = u"E\u0301";
  const std::u16string e_grave_decomposed = u"e\u0300";
  const std::u16string upper_e_grave_decomposed = u"E\u0300";
  const std::u16string a_acute_decomposed = u"a\u0301";

  // en_US_POSIX is swapped for en_US while the test runs and restored at the
  // end; any other default locale is left untouched.
  const std::string original_locale(uloc_getDefault());
  const bool must_override_locale = (original_locale == "en_US_POSIX");
  if (must_override_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Ignoring case and accents: every variant of "e" matches every other,
  // and the match spans the whole text being searched. ---

  // Base letter vs. accented letter, in both directions and both forms.
  EXPECT_MATCH_IGNORE_CASE(plain_e, e_acute, 0U, e_acute.size());
  EXPECT_MATCH_IGNORE_CASE(e_acute, plain_e, 0U, plain_e.size());
  EXPECT_MATCH_IGNORE_CASE(plain_e, e_acute_decomposed, 0U,
                           e_acute_decomposed.size());
  EXPECT_MATCH_IGNORE_CASE(e_acute_decomposed, plain_e, 0U, plain_e.size());

  // Precomposed vs. decomposed spelling of the same accent.
  EXPECT_MATCH_IGNORE_CASE(e_acute_decomposed, e_acute, 0U, e_acute.size());
  EXPECT_MATCH_IGNORE_CASE(e_acute, e_acute_decomposed, 0U,
                           e_acute_decomposed.size());

  // Different accents on the same base letter.
  EXPECT_MATCH_IGNORE_CASE(e_acute_decomposed, e_grave_decomposed, 0U,
                           e_grave_decomposed.size());
  EXPECT_MATCH_IGNORE_CASE(e_grave_decomposed, e_acute_decomposed, 0U,
                           e_acute_decomposed.size());
  EXPECT_MATCH_IGNORE_CASE(e_acute_decomposed, e_grave, 0U, e_grave.size());
  EXPECT_MATCH_IGNORE_CASE(e_grave, e_acute_decomposed, 0U,
                           e_acute_decomposed.size());

  // Upper-case patterns against lower-case text.
  EXPECT_MATCH_IGNORE_CASE(upper_e_acute, e_acute, 0U, e_acute.size());
  EXPECT_MATCH_IGNORE_CASE(upper_e_grave, e_acute, 0U, e_acute.size());
  EXPECT_MATCH_IGNORE_CASE(upper_e_acute_decomposed, e_grave, 0U,
                           e_grave.size());
  EXPECT_MATCH_IGNORE_CASE(upper_e_grave_decomposed, e_acute, 0U,
                           e_acute.size());
  EXPECT_MATCH_IGNORE_CASE(plain_upper_e, e_grave, 0U, e_grave.size());

  // A different base letter never matches, even with the same accent.
  EXPECT_MISS_IGNORE_CASE(a_acute, e_acute);
  EXPECT_MISS_IGNORE_CASE(a_acute_decomposed, e_acute_decomposed);

  // --- Sensitive search: accents and case both matter. ---

  // Base letter vs. accented letter no longer matches.
  EXPECT_MISS_SENSITIVE(plain_e, e_acute);
  EXPECT_MISS_SENSITIVE(e_acute, plain_e);
  EXPECT_MISS_SENSITIVE(plain_e, e_acute_decomposed);
  EXPECT_MISS_SENSITIVE(e_acute_decomposed, plain_e);

  // Precomposed and decomposed spellings of the same letter still match; the
  // reported length is that of the text searched (1 or 2 code units).
  EXPECT_MATCH_SENSITIVE(e_acute_decomposed, e_acute, 0U, 1U);
  EXPECT_MATCH_SENSITIVE(e_acute, e_acute_decomposed, 0U, 2U);

  // Different accents on the same base letter.
  EXPECT_MISS_SENSITIVE(e_acute_decomposed, e_grave_decomposed);
  EXPECT_MISS_SENSITIVE(e_grave_decomposed, e_acute_decomposed);
  EXPECT_MISS_SENSITIVE(e_acute_decomposed, e_grave);
  EXPECT_MISS_SENSITIVE(e_grave, e_acute_decomposed);

  // Upper-case patterns against lower-case text.
  EXPECT_MISS_SENSITIVE(upper_e_acute, e_acute);
  EXPECT_MISS_SENSITIVE(upper_e_grave, e_acute);
  EXPECT_MISS_SENSITIVE(upper_e_acute_decomposed, e_grave);
  EXPECT_MISS_SENSITIVE(upper_e_grave_decomposed, e_acute);
  EXPECT_MISS_SENSITIVE(plain_upper_e, e_grave);

  // Different base letters.
  EXPECT_MISS_SENSITIVE(a_acute, e_acute);
  EXPECT_MISS_SENSITIVE(a_acute_decomposed, e_acute_decomposed);

  // A string always matches itself.
  EXPECT_MATCH_SENSITIVE(a_acute_decomposed, a_acute_decomposed, 0U, 2U);

  if (must_override_locale) {
    SetICUDefaultLocale(original_locale.c_str());
  }
}
263
// Checks that the result of an accent-insensitive search can depend on the
// ICU default locale: "a" is found in a-with-ring under the initial default
// locale, but not once the default locale is switched to "da".
TEST(StringSearchTest, UnicodeLocaleDependent) {
  const std::u16string plain_a = u"a";
  const std::u16string a_ring = u"\u00e5";  // Precomposed a-with-ring.

  // Under the locale the test starts with, the two compare as a match.
  EXPECT_TRUE(
      StringSearchIgnoringCaseAndAccents(plain_a, a_ring, nullptr, nullptr));
  EXPECT_TRUE(StringSearch(plain_a, a_ring, nullptr, nullptr, false, true));

  // NOTE(review): unlike the other tests in this file, the raw pointer from
  // uloc_getDefault() is kept rather than copied into a std::string.
  // Presumably ICU keeps that storage valid across the locale change below --
  // confirm.
  const char* const saved_locale = uloc_getDefault();
  SetICUDefaultLocale("da");

  // With "da" as the default locale the same searches are expected to miss.
  EXPECT_FALSE(
      StringSearchIgnoringCaseAndAccents(plain_a, a_ring, nullptr, nullptr));
  EXPECT_FALSE(StringSearch(plain_a, a_ring, nullptr, nullptr, false, true));

  SetICUDefaultLocale(saved_locale);
}
285
// Checks that a backwards search reports the last occurrence of the pattern
// (offset 2 in a four-character text), and that case sensitivity is honoured
// when searching backwards.
TEST(StringSearchTest, SearchBackwards) {
  // en_US_POSIX is swapped for en_US while the test runs and restored at the
  // end; any other default locale is left untouched.
  const std::string original_locale(uloc_getDefault());
  const bool must_override_locale = (original_locale == "en_US_POSIX");
  if (must_override_locale) {
    SetICUDefaultLocale("en_US");
  }

  EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U);
  EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U);
  EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB");

  if (must_override_locale) {
    SetICUDefaultLocale(original_locale.c_str());
  }
}
299
// Checks that a single fixed-pattern searcher object can be reused to search
// several different texts, for both FixedPatternStringSearch and
// FixedPatternStringSearchIgnoringCaseAndAccents.
TEST(StringSearchTest, FixedPatternMultipleSearch) {
  // en_US_POSIX is swapped for en_US while the test runs and restored at the
  // end; any other default locale is left untouched.
  const std::string original_locale(uloc_getDefault());
  const bool must_override_locale = (original_locale == "en_US_POSIX");
  if (must_override_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Out-parameters shared by every Search() call below.
  size_t match_start = 0;
  size_t match_size = 0;

  // One "foo" searcher reused across several texts. It is constructed so that
  // "FOO" is a miss, i.e. it behaves case-sensitively.
  FixedPatternStringSearch foo_search(u"foo", true);
  EXPECT_TRUE(foo_search.Search(u"12foo34", &match_start, &match_size, true));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(3U, match_size);
  EXPECT_FALSE(foo_search.Search(u"bye", &match_start, &match_size, true));
  EXPECT_FALSE(foo_search.Search(u"FOO", &match_start, &match_size, true));
  // Same text searched with the last flag set and cleared: the first
  // occurrence (offset 0) and then the last occurrence (offset 6) is reported.
  EXPECT_TRUE(
      foo_search.Search(u"foobarfoo", &match_start, &match_size, true));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(3U, match_size);
  EXPECT_TRUE(
      foo_search.Search(u"foobarfoo", &match_start, &match_size, false));
  EXPECT_EQ(6U, match_start);
  EXPECT_EQ(3U, match_size);

  // One "hello" searcher that ignores case and accents, reused likewise.
  FixedPatternStringSearchIgnoringCaseAndAccents hello_search(u"hello");
  EXPECT_TRUE(hello_search.Search(u"12hello34", &match_start, &match_size));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(5U, match_size);
  EXPECT_FALSE(hello_search.Search(u"bye", &match_start, &match_size));
  EXPECT_TRUE(hello_search.Search(u"hELLo", &match_start, &match_size));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(5U, match_size);

  if (must_override_locale) {
    SetICUDefaultLocale(original_locale.c_str());
  }
}
336
// Checks that RepeatingStringSearch reports every occurrence of the pattern,
// in order, for both a case-sensitive and a case-insensitive search.
TEST(StringSearchTest, RepeatingStringSearch) {
  struct MatchResult {
    int match_index;
    int match_length;
  };

  // Swaps an en_US_POSIX default locale for en_US and restores the original
  // on scope exit. A destructor is used, rather than code at the end of the
  // test, because ASSERT_EQ below returns from the test body on failure, which
  // would otherwise leave the overridden locale in place for later tests.
  class ScopedPosixLocaleOverride {
   public:
    ScopedPosixLocaleOverride()
        : original_locale_(uloc_getDefault()),
          overridden_(original_locale_ == "en_US_POSIX") {
      if (overridden_) {
        SetICUDefaultLocale("en_US");
      }
    }
    ~ScopedPosixLocaleOverride() {
      if (overridden_) {
        SetICUDefaultLocale(original_locale_.c_str());
      }
    }
    ScopedPosixLocaleOverride(const ScopedPosixLocaleOverride&) = delete;
    ScopedPosixLocaleOverride& operator=(const ScopedPosixLocaleOverride&) =
        delete;

   private:
    // Declared in initialization order: |overridden_| depends on
    // |original_locale_|.
    const std::string original_locale_;
    const bool overridden_;
  };
  const ScopedPosixLocaleOverride locale_override;

  const char16_t kPattern[] = u"fox";
  const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox";

  // Runs one complete search over |kTarget| and returns every reported match
  // in the order NextMatchResult() produced them.
  const auto collect_matches = [&kPattern, &kTarget](bool case_sensitive) {
    RepeatingStringSearch searcher(kPattern, kTarget, case_sensitive);
    std::vector<MatchResult> matches;
    int match_index = 0;
    int match_length = 0;
    while (searcher.NextMatchResult(match_index, match_length)) {
      matches.push_back(
          {.match_index = match_index, .match_length = match_length});
    }
    return matches;
  };

  // Case sensitive: only the lower-case "fox" at offset 16 is expected.
  {
    const MatchResult kExpectation[] = {{16, 3}};
    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/true);

    // Fatal on mismatch so the loop below never indexes past |kExpectation|.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }

  // Case insensitive: the trailing "Fox" at offset 41 is expected as well.
  {
    const MatchResult kExpectation[] = {{16, 3}, {41, 3}};
    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/false);

    // Fatal on mismatch so the loop below never indexes past |kExpectation|.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }
}
394
395 } // namespace i18n
396 } // namespace base
397