1*8b6cd535SAndroid Build Coastguard Worker // Copyright (C) 2022 Google LLC
2*8b6cd535SAndroid Build Coastguard Worker //
3*8b6cd535SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*8b6cd535SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*8b6cd535SAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*8b6cd535SAndroid Build Coastguard Worker //
7*8b6cd535SAndroid Build Coastguard Worker // http://www.apache.org/licenses/LICENSE-2.0
8*8b6cd535SAndroid Build Coastguard Worker //
9*8b6cd535SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*8b6cd535SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*8b6cd535SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*8b6cd535SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*8b6cd535SAndroid Build Coastguard Worker // limitations under the License.
14*8b6cd535SAndroid Build Coastguard Worker
15*8b6cd535SAndroid Build Coastguard Worker #include <algorithm>
16*8b6cd535SAndroid Build Coastguard Worker #include <cstdint>
17*8b6cd535SAndroid Build Coastguard Worker #include <iterator>
18*8b6cd535SAndroid Build Coastguard Worker #include <memory>
19*8b6cd535SAndroid Build Coastguard Worker #include <string>
20*8b6cd535SAndroid Build Coastguard Worker #include <string_view>
21*8b6cd535SAndroid Build Coastguard Worker #include <utility>
22*8b6cd535SAndroid Build Coastguard Worker #include <vector>
23*8b6cd535SAndroid Build Coastguard Worker
24*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h"
25*8b6cd535SAndroid Build Coastguard Worker #include "gmock/gmock.h"
26*8b6cd535SAndroid Build Coastguard Worker #include "gtest/gtest.h"
27*8b6cd535SAndroid Build Coastguard Worker #include "icing/feature-flags.h"
28*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/filesystem.h"
29*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/portable-file-backed-proto-log.h"
30*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/embed/embedding-index.h"
31*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/index.h"
32*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/numeric/dummy-numeric-index.h"
33*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/numeric/numeric-index.h"
34*8b6cd535SAndroid Build Coastguard Worker #include "icing/jni/jni-cache.h"
35*8b6cd535SAndroid Build Coastguard Worker #include "icing/legacy/index/icing-filesystem.h"
36*8b6cd535SAndroid Build Coastguard Worker #include "icing/portable/platform.h"
37*8b6cd535SAndroid Build Coastguard Worker #include "icing/proto/schema.pb.h"
38*8b6cd535SAndroid Build Coastguard Worker #include "icing/query/query-processor.h"
39*8b6cd535SAndroid Build Coastguard Worker #include "icing/query/query-results.h"
40*8b6cd535SAndroid Build Coastguard Worker #include "icing/query/query-terms.h"
41*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/schema-store.h"
42*8b6cd535SAndroid Build Coastguard Worker #include "icing/store/document-store.h"
43*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/common-matchers.h"
44*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/fake-clock.h"
45*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/jni-test-helpers.h"
46*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/test-data.h"
47*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/test-feature-flags.h"
48*8b6cd535SAndroid Build Coastguard Worker #include "icing/testing/tmp-directory.h"
49*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/language-segmenter-factory.h"
50*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/language-segmenter.h"
51*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/token.h"
52*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/tokenizer-factory.h"
53*8b6cd535SAndroid Build Coastguard Worker #include "icing/tokenization/tokenizer.h"
54*8b6cd535SAndroid Build Coastguard Worker #include "icing/transform/normalizer-factory.h"
55*8b6cd535SAndroid Build Coastguard Worker #include "icing/transform/normalizer.h"
56*8b6cd535SAndroid Build Coastguard Worker #include "icing/util/icu-data-file-helper.h"
57*8b6cd535SAndroid Build Coastguard Worker #include "icing/util/status-macros.h"
58*8b6cd535SAndroid Build Coastguard Worker #include "unicode/uloc.h"
59*8b6cd535SAndroid Build Coastguard Worker
60*8b6cd535SAndroid Build Coastguard Worker namespace icing {
61*8b6cd535SAndroid Build Coastguard Worker namespace lib {
62*8b6cd535SAndroid Build Coastguard Worker
63*8b6cd535SAndroid Build Coastguard Worker namespace {
64*8b6cd535SAndroid Build Coastguard Worker
65*8b6cd535SAndroid Build Coastguard Worker using ::testing::ElementsAre;
66*8b6cd535SAndroid Build Coastguard Worker using ::testing::UnorderedElementsAre;
67*8b6cd535SAndroid Build Coastguard Worker
68*8b6cd535SAndroid Build Coastguard Worker // This test exists to ensure that the different tokenizers treat different
69*8b6cd535SAndroid Build Coastguard Worker // segments of text in the same manner.
70*8b6cd535SAndroid Build Coastguard Worker class CombinedTokenizerTest : public ::testing::Test {
71*8b6cd535SAndroid Build Coastguard Worker protected:
CombinedTokenizerTest()72*8b6cd535SAndroid Build Coastguard Worker CombinedTokenizerTest()
73*8b6cd535SAndroid Build Coastguard Worker : test_dir_(GetTestTempDir() + "/icing"),
74*8b6cd535SAndroid Build Coastguard Worker store_dir_(test_dir_ + "/store"),
75*8b6cd535SAndroid Build Coastguard Worker schema_store_dir_(test_dir_ + "/schema_store"),
76*8b6cd535SAndroid Build Coastguard Worker index_dir_(test_dir_ + "/index"),
77*8b6cd535SAndroid Build Coastguard Worker numeric_index_dir_(test_dir_ + "/numeric_index"),
78*8b6cd535SAndroid Build Coastguard Worker embedding_index_dir_(test_dir_ + "/embedding_index") {}
79*8b6cd535SAndroid Build Coastguard Worker
SetUp()80*8b6cd535SAndroid Build Coastguard Worker void SetUp() override {
81*8b6cd535SAndroid Build Coastguard Worker feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
82*8b6cd535SAndroid Build Coastguard Worker filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
83*8b6cd535SAndroid Build Coastguard Worker filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
84*8b6cd535SAndroid Build Coastguard Worker filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
85*8b6cd535SAndroid Build Coastguard Worker filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
86*8b6cd535SAndroid Build Coastguard Worker if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
87*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK(
88*8b6cd535SAndroid Build Coastguard Worker // File generated via icu_data_file rule in //icing/BUILD.
89*8b6cd535SAndroid Build Coastguard Worker icu_data_file_helper::SetUpIcuDataFile(
90*8b6cd535SAndroid Build Coastguard Worker GetTestFilePath("icing/icu.dat")));
91*8b6cd535SAndroid Build Coastguard Worker }
92*8b6cd535SAndroid Build Coastguard Worker jni_cache_ = GetTestJniCache();
93*8b6cd535SAndroid Build Coastguard Worker
94*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
95*8b6cd535SAndroid Build Coastguard Worker schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_,
96*8b6cd535SAndroid Build Coastguard Worker &fake_clock_, feature_flags_.get()));
97*8b6cd535SAndroid Build Coastguard Worker
98*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
99*8b6cd535SAndroid Build Coastguard Worker DocumentStore::CreateResult create_result,
100*8b6cd535SAndroid Build Coastguard Worker DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
101*8b6cd535SAndroid Build Coastguard Worker schema_store_.get(), feature_flags_.get(),
102*8b6cd535SAndroid Build Coastguard Worker /*force_recovery_and_revalidate_documents=*/false,
103*8b6cd535SAndroid Build Coastguard Worker /*pre_mapping_fbv=*/false,
104*8b6cd535SAndroid Build Coastguard Worker /*use_persistent_hash_map=*/false,
105*8b6cd535SAndroid Build Coastguard Worker PortableFileBackedProtoLog<
106*8b6cd535SAndroid Build Coastguard Worker DocumentWrapper>::kDefaultCompressionLevel,
107*8b6cd535SAndroid Build Coastguard Worker /*initialize_stats=*/nullptr));
108*8b6cd535SAndroid Build Coastguard Worker document_store_ = std::move(create_result.document_store);
109*8b6cd535SAndroid Build Coastguard Worker
110*8b6cd535SAndroid Build Coastguard Worker Index::Options options(index_dir_,
111*8b6cd535SAndroid Build Coastguard Worker /*index_merge_size=*/1024 * 1024,
112*8b6cd535SAndroid Build Coastguard Worker /*lite_index_sort_at_indexing=*/true,
113*8b6cd535SAndroid Build Coastguard Worker /*lite_index_sort_size=*/1024 * 8);
114*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
115*8b6cd535SAndroid Build Coastguard Worker index_, Index::Create(options, &filesystem_, &icing_filesystem_));
116*8b6cd535SAndroid Build Coastguard Worker // TODO(b/249829533): switch to use persistent numeric index.
117*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
118*8b6cd535SAndroid Build Coastguard Worker numeric_index_,
119*8b6cd535SAndroid Build Coastguard Worker DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
120*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
121*8b6cd535SAndroid Build Coastguard Worker embedding_index_,
122*8b6cd535SAndroid Build Coastguard Worker EmbeddingIndex::Create(&filesystem_, embedding_index_dir_, &fake_clock_,
123*8b6cd535SAndroid Build Coastguard Worker feature_flags_.get()));
124*8b6cd535SAndroid Build Coastguard Worker
125*8b6cd535SAndroid Build Coastguard Worker language_segmenter_factory::SegmenterOptions segmenter_options(
126*8b6cd535SAndroid Build Coastguard Worker ULOC_US, jni_cache_.get());
127*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
128*8b6cd535SAndroid Build Coastguard Worker lang_segmenter_,
129*8b6cd535SAndroid Build Coastguard Worker language_segmenter_factory::Create(std::move(segmenter_options)));
130*8b6cd535SAndroid Build Coastguard Worker
131*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
132*8b6cd535SAndroid Build Coastguard Worker /*max_term_byte_size=*/1000));
133*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
134*8b6cd535SAndroid Build Coastguard Worker query_processor_,
135*8b6cd535SAndroid Build Coastguard Worker QueryProcessor::Create(
136*8b6cd535SAndroid Build Coastguard Worker index_.get(), numeric_index_.get(), embedding_index_.get(),
137*8b6cd535SAndroid Build Coastguard Worker lang_segmenter_.get(), normalizer_.get(), document_store_.get(),
138*8b6cd535SAndroid Build Coastguard Worker schema_store_.get(), /*join_children_fetcher=*/nullptr,
139*8b6cd535SAndroid Build Coastguard Worker &fake_clock_, feature_flags_.get()));
140*8b6cd535SAndroid Build Coastguard Worker }
141*8b6cd535SAndroid Build Coastguard Worker
GetQueryTerms(std::string_view query)142*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<std::vector<std::string>> GetQueryTerms(
143*8b6cd535SAndroid Build Coastguard Worker std::string_view query) {
144*8b6cd535SAndroid Build Coastguard Worker SearchSpecProto search_spec;
145*8b6cd535SAndroid Build Coastguard Worker search_spec.set_query(std::string(query));
146*8b6cd535SAndroid Build Coastguard Worker search_spec.set_term_match_type(TermMatchType::PREFIX);
147*8b6cd535SAndroid Build Coastguard Worker ICING_ASSIGN_OR_RETURN(
148*8b6cd535SAndroid Build Coastguard Worker QueryResults parsed_query,
149*8b6cd535SAndroid Build Coastguard Worker query_processor_->ParseSearch(
150*8b6cd535SAndroid Build Coastguard Worker search_spec, ScoringSpecProto::RankingStrategy::NONE,
151*8b6cd535SAndroid Build Coastguard Worker /*current_time_ms=*/0, /*search_stats=*/nullptr));
152*8b6cd535SAndroid Build Coastguard Worker
153*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> query_terms;
154*8b6cd535SAndroid Build Coastguard Worker const SectionRestrictQueryTermsMap& query_terms_map =
155*8b6cd535SAndroid Build Coastguard Worker parsed_query.query_terms;
156*8b6cd535SAndroid Build Coastguard Worker for (const auto& [section_id, terms] : query_terms_map) {
157*8b6cd535SAndroid Build Coastguard Worker std::copy(terms.begin(), terms.end(), std::back_inserter(query_terms));
158*8b6cd535SAndroid Build Coastguard Worker }
159*8b6cd535SAndroid Build Coastguard Worker return query_terms;
160*8b6cd535SAndroid Build Coastguard Worker }
161*8b6cd535SAndroid Build Coastguard Worker
162*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FeatureFlags> feature_flags_;
163*8b6cd535SAndroid Build Coastguard Worker Filesystem filesystem_;
164*8b6cd535SAndroid Build Coastguard Worker const std::string test_dir_;
165*8b6cd535SAndroid Build Coastguard Worker const std::string store_dir_;
166*8b6cd535SAndroid Build Coastguard Worker const std::string schema_store_dir_;
167*8b6cd535SAndroid Build Coastguard Worker
168*8b6cd535SAndroid Build Coastguard Worker IcingFilesystem icing_filesystem_;
169*8b6cd535SAndroid Build Coastguard Worker const std::string index_dir_;
170*8b6cd535SAndroid Build Coastguard Worker const std::string numeric_index_dir_;
171*8b6cd535SAndroid Build Coastguard Worker const std::string embedding_index_dir_;
172*8b6cd535SAndroid Build Coastguard Worker
173*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<const JniCache> jni_cache_;
174*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<LanguageSegmenter> lang_segmenter_;
175*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<QueryProcessor> query_processor_;
176*8b6cd535SAndroid Build Coastguard Worker
177*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<Index> index_;
178*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<NumericIndex<int64_t>> numeric_index_;
179*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<EmbeddingIndex> embedding_index_;
180*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<Normalizer> normalizer_;
181*8b6cd535SAndroid Build Coastguard Worker FakeClock fake_clock_;
182*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<SchemaStore> schema_store_;
183*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<DocumentStore> document_store_;
184*8b6cd535SAndroid Build Coastguard Worker };
185*8b6cd535SAndroid Build Coastguard Worker
GetTokenTerms(const std::vector<Token> & tokens)186*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
187*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> terms;
188*8b6cd535SAndroid Build Coastguard Worker terms.reserve(tokens.size());
189*8b6cd535SAndroid Build Coastguard Worker for (const Token& token : tokens) {
190*8b6cd535SAndroid Build Coastguard Worker if (token.type == Token::Type::REGULAR) {
191*8b6cd535SAndroid Build Coastguard Worker terms.push_back(std::string(token.text));
192*8b6cd535SAndroid Build Coastguard Worker }
193*8b6cd535SAndroid Build Coastguard Worker }
194*8b6cd535SAndroid Build Coastguard Worker return terms;
195*8b6cd535SAndroid Build Coastguard Worker }
196*8b6cd535SAndroid Build Coastguard Worker
197*8b6cd535SAndroid Build Coastguard Worker } // namespace
198*8b6cd535SAndroid Build Coastguard Worker
// Verifies that a special (non-ASCII) character is kept as its own term by
// both the indexing tokenizer and the query parser, while the '!' and '?'
// punctuation is dropped.
TEST_F(CombinedTokenizerTest, SpecialCharacters) {
  // U+1F60A, written as UTF-8 byte escapes so that the character cannot be
  // silently dropped by tools that mishandle non-BMP code points. The previous
  // text of this test started with a bare space while still expecting a
  // leading "" term, which whitespace and punctuation cannot produce.
  // NOTE(review): the exact original character is assumed - confirm.
  const std::string kEmoji = "\xF0\x9F\x98\x8A";
  const std::string kText = kEmoji + " Hello! Goodbye?";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // The tokens view into kText, which outlives them.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre(kEmoji, "Hello", "Goodbye"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
                             GetQueryTerms(kText));
  // NOTE: The query parser will also normalize query terms
  EXPECT_THAT(query_terms, UnorderedElementsAre(kEmoji, "hello", "goodbye"));
}
216*8b6cd535SAndroid Build Coastguard Worker
// Verifies that nested and adjacent parentheses never end up in the terms
// produced by either the indexing tokenizer or the query parser.
TEST_F(CombinedTokenizerTest, Parentheses) {
  constexpr std::string_view kParenText = "((paren1)(paren2) (last paren))";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Indexing side: terms are expected in document order.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kParenText));
  EXPECT_THAT(GetTokenTerms(tokens),
              ElementsAre("paren1", "paren2", "last", "paren"));

  // Query side: same terms, order not significant.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kParenText));
  EXPECT_THAT(parsed_terms,
              UnorderedElementsAre("paren1", "paren2", "last", "paren"));
}
234*8b6cd535SAndroid Build Coastguard Worker
// Verifies that a leading '-' is stripped from each word by the indexing
// tokenizer, and that the query parser yields the same terms when the '-' is
// backslash-escaped in the query text.
TEST_F(CombinedTokenizerTest, Negation) {
  constexpr std::string_view kDocumentText = "-foo -bar -baz";
  // Each '-' is escaped with a backslash in the query (presumably so that the
  // parser treats it as plain text rather than as an operator).
  constexpr std::string_view kEscapedQuery = "\\-foo \\-bar \\-baz";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kDocumentText));
  EXPECT_THAT(GetTokenTerms(tokens), ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kEscapedQuery));
  EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo", "bar", "baz"));
}
253*8b6cd535SAndroid Build Coastguard Worker
254*8b6cd535SAndroid Build Coastguard Worker // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest,Colons)255*8b6cd535SAndroid Build Coastguard Worker TEST_F(CombinedTokenizerTest, Colons) {
256*8b6cd535SAndroid Build Coastguard Worker const std::string_view kText = ":foo: :bar baz:";
257*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
258*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<Tokenizer> indexing_tokenizer,
259*8b6cd535SAndroid Build Coastguard Worker tokenizer_factory::CreateIndexingTokenizer(
260*8b6cd535SAndroid Build Coastguard Worker StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
261*8b6cd535SAndroid Build Coastguard Worker
262*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
263*8b6cd535SAndroid Build Coastguard Worker indexing_tokenizer->TokenizeAll(kText));
264*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
265*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
266*8b6cd535SAndroid Build Coastguard Worker
267*8b6cd535SAndroid Build Coastguard Worker const std::string_view kQueryText = "\\:foo\\: \\:bar baz\\:";
268*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
269*8b6cd535SAndroid Build Coastguard Worker GetQueryTerms(kQueryText));
270*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar", "baz"));
271*8b6cd535SAndroid Build Coastguard Worker }
272*8b6cd535SAndroid Build Coastguard Worker
273*8b6cd535SAndroid Build Coastguard Worker // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest,ColonsPropertyRestricts)274*8b6cd535SAndroid Build Coastguard Worker TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
275*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(
276*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<Tokenizer> indexing_tokenizer,
277*8b6cd535SAndroid Build Coastguard Worker tokenizer_factory::CreateIndexingTokenizer(
278*8b6cd535SAndroid Build Coastguard Worker StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
279*8b6cd535SAndroid Build Coastguard Worker
280*8b6cd535SAndroid Build Coastguard Worker if (GetIcuTokenizationVersion() >= 72) {
281*8b6cd535SAndroid Build Coastguard Worker // In ICU 72+ and above, ':' are no longer considered word connectors.
282*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kText = "foo:bar";
283*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
284*8b6cd535SAndroid Build Coastguard Worker indexing_tokenizer->TokenizeAll(kText));
285*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
286*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar"));
287*8b6cd535SAndroid Build Coastguard Worker
288*8b6cd535SAndroid Build Coastguard Worker const std::string_view kQueryText = "foo\\:bar";
289*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
290*8b6cd535SAndroid Build Coastguard Worker GetQueryTerms(kQueryText));
291*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar"));
292*8b6cd535SAndroid Build Coastguard Worker
293*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kText2 = "foo:bar:baz";
294*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
295*8b6cd535SAndroid Build Coastguard Worker indexing_tokenizer->TokenizeAll(kText2));
296*8b6cd535SAndroid Build Coastguard Worker indexing_terms = GetTokenTerms(indexing_tokens);
297*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
298*8b6cd535SAndroid Build Coastguard Worker
299*8b6cd535SAndroid Build Coastguard Worker const std::string_view kQueryText2 = "foo\\:bar\\:baz";
300*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(query_terms,
301*8b6cd535SAndroid Build Coastguard Worker GetQueryTerms(kQueryText2));
302*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar", "baz"));
303*8b6cd535SAndroid Build Coastguard Worker } else {
304*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kText = "foo:bar";
305*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kQueryText = "foo\\:bar";
306*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
307*8b6cd535SAndroid Build Coastguard Worker indexing_tokenizer->TokenizeAll(kText));
308*8b6cd535SAndroid Build Coastguard Worker std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
309*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
310*8b6cd535SAndroid Build Coastguard Worker
311*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
312*8b6cd535SAndroid Build Coastguard Worker GetQueryTerms(kQueryText));
313*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(query_terms, UnorderedElementsAre("foo:bar"));
314*8b6cd535SAndroid Build Coastguard Worker
315*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kText2 = "foo:bar:baz";
316*8b6cd535SAndroid Build Coastguard Worker constexpr std::string_view kQueryText2 = "foo\\:bar\\:baz";
317*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
318*8b6cd535SAndroid Build Coastguard Worker indexing_tokenizer->TokenizeAll(kText2));
319*8b6cd535SAndroid Build Coastguard Worker indexing_terms = GetTokenTerms(indexing_tokens);
320*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
321*8b6cd535SAndroid Build Coastguard Worker
322*8b6cd535SAndroid Build Coastguard Worker ICING_ASSERT_OK_AND_ASSIGN(query_terms, GetQueryTerms(kQueryText2));
323*8b6cd535SAndroid Build Coastguard Worker EXPECT_THAT(query_terms, UnorderedElementsAre("foo:bar:baz"));
324*8b6cd535SAndroid Build Coastguard Worker }
325*8b6cd535SAndroid Build Coastguard Worker }
326*8b6cd535SAndroid Build Coastguard Worker
// Verifies that '?', '!' and '&' are dropped by both the indexing tokenizer and
// the query parser, leaving only the words.
TEST_F(CombinedTokenizerTest, Punctuation) {
  constexpr std::string_view kPunctuatedText = "Who? What!? Why & How";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Indexing side keeps the original casing.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kPunctuatedText));
  EXPECT_THAT(GetTokenTerms(tokens),
              ElementsAre("Who", "What", "Why", "How"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kPunctuatedText));
  // NOTE: The query parser will also normalize query terms
  EXPECT_THAT(parsed_terms,
              UnorderedElementsAre("who", "what", "why", "how"));
}
344*8b6cd535SAndroid Build Coastguard Worker
345*8b6cd535SAndroid Build Coastguard Worker } // namespace lib
346*8b6cd535SAndroid Build Coastguard Worker } // namespace icing
347