// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker
5*6777b538SAndroid Build Coastguard Worker #include <stdint.h>
6*6777b538SAndroid Build Coastguard Worker
7*6777b538SAndroid Build Coastguard Worker #include "base/i18n/string_search.h"
8*6777b538SAndroid Build Coastguard Worker
9*6777b538SAndroid Build Coastguard Worker #include "base/check.h"
10*6777b538SAndroid Build Coastguard Worker #include "base/check_op.h"
11*6777b538SAndroid Build Coastguard Worker #include "third_party/icu/source/i18n/unicode/usearch.h"
12*6777b538SAndroid Build Coastguard Worker
13*6777b538SAndroid Build Coastguard Worker namespace base {
14*6777b538SAndroid Build Coastguard Worker namespace i18n {
15*6777b538SAndroid Build Coastguard Worker
FixedPatternStringSearch(const std::u16string & find_this,bool case_sensitive)16*6777b538SAndroid Build Coastguard Worker FixedPatternStringSearch::FixedPatternStringSearch(
17*6777b538SAndroid Build Coastguard Worker const std::u16string& find_this,
18*6777b538SAndroid Build Coastguard Worker bool case_sensitive)
19*6777b538SAndroid Build Coastguard Worker : find_this_(find_this) {
20*6777b538SAndroid Build Coastguard Worker // usearch_open requires a valid string argument to be searched, even if we
21*6777b538SAndroid Build Coastguard Worker // want to set it by usearch_setText afterwards. So, supplying a dummy text.
22*6777b538SAndroid Build Coastguard Worker const std::u16string& dummy = find_this_;
23*6777b538SAndroid Build Coastguard Worker
24*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
25*6777b538SAndroid Build Coastguard Worker search_ = usearch_open(find_this_.data(), find_this_.size(), dummy.data(),
26*6777b538SAndroid Build Coastguard Worker dummy.size(), uloc_getDefault(),
27*6777b538SAndroid Build Coastguard Worker nullptr, // breakiter
28*6777b538SAndroid Build Coastguard Worker &status);
29*6777b538SAndroid Build Coastguard Worker if (U_SUCCESS(status)) {
30*6777b538SAndroid Build Coastguard Worker // http://icu-project.org/apiref/icu4c40/ucol_8h.html#6a967f36248b0a1bc7654f538ee8ba96
31*6777b538SAndroid Build Coastguard Worker // Set comparison level to UCOL_PRIMARY to ignore secondary and tertiary
32*6777b538SAndroid Build Coastguard Worker // differences. Set comparison level to UCOL_TERTIARY to include all
33*6777b538SAndroid Build Coastguard Worker // comparison differences.
34*6777b538SAndroid Build Coastguard Worker // Diacritical differences on the same base letter represent a
35*6777b538SAndroid Build Coastguard Worker // secondary difference.
36*6777b538SAndroid Build Coastguard Worker // Uppercase and lowercase versions of the same character represents a
37*6777b538SAndroid Build Coastguard Worker // tertiary difference.
38*6777b538SAndroid Build Coastguard Worker UCollator* collator = usearch_getCollator(search_);
39*6777b538SAndroid Build Coastguard Worker ucol_setStrength(collator, case_sensitive ? UCOL_TERTIARY : UCOL_PRIMARY);
40*6777b538SAndroid Build Coastguard Worker usearch_reset(search_);
41*6777b538SAndroid Build Coastguard Worker }
42*6777b538SAndroid Build Coastguard Worker }
43*6777b538SAndroid Build Coastguard Worker
~FixedPatternStringSearch()44*6777b538SAndroid Build Coastguard Worker FixedPatternStringSearch::~FixedPatternStringSearch() {
45*6777b538SAndroid Build Coastguard Worker if (search_)
46*6777b538SAndroid Build Coastguard Worker usearch_close(search_.ExtractAsDangling());
47*6777b538SAndroid Build Coastguard Worker }
48*6777b538SAndroid Build Coastguard Worker
Search(const std::u16string & in_this,size_t * match_index,size_t * match_length,bool forward_search)49*6777b538SAndroid Build Coastguard Worker bool FixedPatternStringSearch::Search(const std::u16string& in_this,
50*6777b538SAndroid Build Coastguard Worker size_t* match_index,
51*6777b538SAndroid Build Coastguard Worker size_t* match_length,
52*6777b538SAndroid Build Coastguard Worker bool forward_search) {
53*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
54*6777b538SAndroid Build Coastguard Worker usearch_setText(search_, in_this.data(), in_this.size(), &status);
55*6777b538SAndroid Build Coastguard Worker
56*6777b538SAndroid Build Coastguard Worker // Default to basic substring search if usearch fails. According to
57*6777b538SAndroid Build Coastguard Worker // http://icu-project.org/apiref/icu4c/usearch_8h.html, usearch_open will fail
58*6777b538SAndroid Build Coastguard Worker // if either |find_this| or |in_this| are empty. In either case basic
59*6777b538SAndroid Build Coastguard Worker // substring search will give the correct return value.
60*6777b538SAndroid Build Coastguard Worker if (!U_SUCCESS(status)) {
61*6777b538SAndroid Build Coastguard Worker size_t index = in_this.find(find_this_);
62*6777b538SAndroid Build Coastguard Worker if (index == std::u16string::npos)
63*6777b538SAndroid Build Coastguard Worker return false;
64*6777b538SAndroid Build Coastguard Worker if (match_index)
65*6777b538SAndroid Build Coastguard Worker *match_index = index;
66*6777b538SAndroid Build Coastguard Worker if (match_length)
67*6777b538SAndroid Build Coastguard Worker *match_length = find_this_.size();
68*6777b538SAndroid Build Coastguard Worker return true;
69*6777b538SAndroid Build Coastguard Worker }
70*6777b538SAndroid Build Coastguard Worker
71*6777b538SAndroid Build Coastguard Worker int32_t index = forward_search ? usearch_first(search_, &status)
72*6777b538SAndroid Build Coastguard Worker : usearch_last(search_, &status);
73*6777b538SAndroid Build Coastguard Worker if (!U_SUCCESS(status) || index == USEARCH_DONE)
74*6777b538SAndroid Build Coastguard Worker return false;
75*6777b538SAndroid Build Coastguard Worker if (match_index)
76*6777b538SAndroid Build Coastguard Worker *match_index = static_cast<size_t>(index);
77*6777b538SAndroid Build Coastguard Worker if (match_length)
78*6777b538SAndroid Build Coastguard Worker *match_length = static_cast<size_t>(usearch_getMatchedLength(search_));
79*6777b538SAndroid Build Coastguard Worker return true;
80*6777b538SAndroid Build Coastguard Worker }
81*6777b538SAndroid Build Coastguard Worker
82*6777b538SAndroid Build Coastguard Worker FixedPatternStringSearchIgnoringCaseAndAccents::
FixedPatternStringSearchIgnoringCaseAndAccents(const std::u16string & find_this)83*6777b538SAndroid Build Coastguard Worker FixedPatternStringSearchIgnoringCaseAndAccents(
84*6777b538SAndroid Build Coastguard Worker const std::u16string& find_this)
85*6777b538SAndroid Build Coastguard Worker : base_search_(find_this, /*case_sensitive=*/false) {}
86*6777b538SAndroid Build Coastguard Worker
Search(const std::u16string & in_this,size_t * match_index,size_t * match_length)87*6777b538SAndroid Build Coastguard Worker bool FixedPatternStringSearchIgnoringCaseAndAccents::Search(
88*6777b538SAndroid Build Coastguard Worker const std::u16string& in_this,
89*6777b538SAndroid Build Coastguard Worker size_t* match_index,
90*6777b538SAndroid Build Coastguard Worker size_t* match_length) {
91*6777b538SAndroid Build Coastguard Worker return base_search_.Search(in_this, match_index, match_length,
92*6777b538SAndroid Build Coastguard Worker /*forward_search=*/true);
93*6777b538SAndroid Build Coastguard Worker }
94*6777b538SAndroid Build Coastguard Worker
StringSearchIgnoringCaseAndAccents(const std::u16string & find_this,const std::u16string & in_this,size_t * match_index,size_t * match_length)95*6777b538SAndroid Build Coastguard Worker bool StringSearchIgnoringCaseAndAccents(const std::u16string& find_this,
96*6777b538SAndroid Build Coastguard Worker const std::u16string& in_this,
97*6777b538SAndroid Build Coastguard Worker size_t* match_index,
98*6777b538SAndroid Build Coastguard Worker size_t* match_length) {
99*6777b538SAndroid Build Coastguard Worker return FixedPatternStringSearchIgnoringCaseAndAccents(find_this).Search(
100*6777b538SAndroid Build Coastguard Worker in_this, match_index, match_length);
101*6777b538SAndroid Build Coastguard Worker }
102*6777b538SAndroid Build Coastguard Worker
StringSearch(const std::u16string & find_this,const std::u16string & in_this,size_t * match_index,size_t * match_length,bool case_sensitive,bool forward_search)103*6777b538SAndroid Build Coastguard Worker bool StringSearch(const std::u16string& find_this,
104*6777b538SAndroid Build Coastguard Worker const std::u16string& in_this,
105*6777b538SAndroid Build Coastguard Worker size_t* match_index,
106*6777b538SAndroid Build Coastguard Worker size_t* match_length,
107*6777b538SAndroid Build Coastguard Worker bool case_sensitive,
108*6777b538SAndroid Build Coastguard Worker bool forward_search) {
109*6777b538SAndroid Build Coastguard Worker return FixedPatternStringSearch(find_this, case_sensitive)
110*6777b538SAndroid Build Coastguard Worker .Search(in_this, match_index, match_length, forward_search);
111*6777b538SAndroid Build Coastguard Worker }
112*6777b538SAndroid Build Coastguard Worker
RepeatingStringSearch(const std::u16string & find_this,const std::u16string & in_this,bool case_sensitive)113*6777b538SAndroid Build Coastguard Worker RepeatingStringSearch::RepeatingStringSearch(const std::u16string& find_this,
114*6777b538SAndroid Build Coastguard Worker const std::u16string& in_this,
115*6777b538SAndroid Build Coastguard Worker bool case_sensitive)
116*6777b538SAndroid Build Coastguard Worker : find_this_(find_this), in_this_(in_this) {
117*6777b538SAndroid Build Coastguard Worker std::string locale = uloc_getDefault();
118*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
119*6777b538SAndroid Build Coastguard Worker search_ = usearch_open(find_this_.data(), find_this_.size(), in_this_.data(),
120*6777b538SAndroid Build Coastguard Worker in_this_.size(), locale.data(), /*breakiter=*/nullptr,
121*6777b538SAndroid Build Coastguard Worker &status);
122*6777b538SAndroid Build Coastguard Worker DCHECK(U_SUCCESS(status));
123*6777b538SAndroid Build Coastguard Worker if (U_SUCCESS(status)) {
124*6777b538SAndroid Build Coastguard Worker // http://icu-project.org/apiref/icu4c40/ucol_8h.html#6a967f36248b0a1bc7654f538ee8ba96
125*6777b538SAndroid Build Coastguard Worker // Set comparison level to UCOL_PRIMARY to ignore secondary and tertiary
126*6777b538SAndroid Build Coastguard Worker // differences. Set comparison level to UCOL_TERTIARY to include all
127*6777b538SAndroid Build Coastguard Worker // comparison differences.
128*6777b538SAndroid Build Coastguard Worker // Diacritical differences on the same base letter represent a
129*6777b538SAndroid Build Coastguard Worker // secondary difference.
130*6777b538SAndroid Build Coastguard Worker // Uppercase and lowercase versions of the same character represents a
131*6777b538SAndroid Build Coastguard Worker // tertiary difference.
132*6777b538SAndroid Build Coastguard Worker UCollator* collator = usearch_getCollator(search_);
133*6777b538SAndroid Build Coastguard Worker ucol_setStrength(collator, case_sensitive ? UCOL_TERTIARY : UCOL_PRIMARY);
134*6777b538SAndroid Build Coastguard Worker usearch_reset(search_);
135*6777b538SAndroid Build Coastguard Worker }
136*6777b538SAndroid Build Coastguard Worker }
137*6777b538SAndroid Build Coastguard Worker
~RepeatingStringSearch()138*6777b538SAndroid Build Coastguard Worker RepeatingStringSearch::~RepeatingStringSearch() {
139*6777b538SAndroid Build Coastguard Worker if (search_)
140*6777b538SAndroid Build Coastguard Worker usearch_close(search_.ExtractAsDangling());
141*6777b538SAndroid Build Coastguard Worker }
142*6777b538SAndroid Build Coastguard Worker
NextMatchResult(int & match_index,int & match_length)143*6777b538SAndroid Build Coastguard Worker bool RepeatingStringSearch::NextMatchResult(int& match_index,
144*6777b538SAndroid Build Coastguard Worker int& match_length) {
145*6777b538SAndroid Build Coastguard Worker UErrorCode status = U_ZERO_ERROR;
146*6777b538SAndroid Build Coastguard Worker const int match_start = usearch_next(search_, &status);
147*6777b538SAndroid Build Coastguard Worker if (U_FAILURE(status) || match_start == USEARCH_DONE)
148*6777b538SAndroid Build Coastguard Worker return false;
149*6777b538SAndroid Build Coastguard Worker DCHECK(U_SUCCESS(status));
150*6777b538SAndroid Build Coastguard Worker match_index = match_start;
151*6777b538SAndroid Build Coastguard Worker match_length = usearch_getMatchedLength(search_);
152*6777b538SAndroid Build Coastguard Worker return true;
153*6777b538SAndroid Build Coastguard Worker }
154*6777b538SAndroid Build Coastguard Worker
155*6777b538SAndroid Build Coastguard Worker } // namespace i18n
156*6777b538SAndroid Build Coastguard Worker } // namespace base
157