1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <algorithm>
16 #include <cstdint>
17 #include <iterator>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "gmock/gmock.h"
26 #include "gtest/gtest.h"
27 #include "icing/feature-flags.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/portable-file-backed-proto-log.h"
30 #include "icing/index/embed/embedding-index.h"
31 #include "icing/index/index.h"
32 #include "icing/index/numeric/dummy-numeric-index.h"
33 #include "icing/index/numeric/numeric-index.h"
34 #include "icing/jni/jni-cache.h"
35 #include "icing/legacy/index/icing-filesystem.h"
36 #include "icing/portable/platform.h"
37 #include "icing/proto/schema.pb.h"
38 #include "icing/query/query-processor.h"
39 #include "icing/query/query-results.h"
40 #include "icing/query/query-terms.h"
41 #include "icing/schema/schema-store.h"
42 #include "icing/store/document-store.h"
43 #include "icing/testing/common-matchers.h"
44 #include "icing/testing/fake-clock.h"
45 #include "icing/testing/jni-test-helpers.h"
46 #include "icing/testing/test-data.h"
47 #include "icing/testing/test-feature-flags.h"
48 #include "icing/testing/tmp-directory.h"
49 #include "icing/tokenization/language-segmenter-factory.h"
50 #include "icing/tokenization/language-segmenter.h"
51 #include "icing/tokenization/token.h"
52 #include "icing/tokenization/tokenizer-factory.h"
53 #include "icing/tokenization/tokenizer.h"
54 #include "icing/transform/normalizer-factory.h"
55 #include "icing/transform/normalizer.h"
56 #include "icing/util/icu-data-file-helper.h"
57 #include "icing/util/status-macros.h"
58 #include "unicode/uloc.h"
59
60 namespace icing {
61 namespace lib {
62
63 namespace {
64
65 using ::testing::ElementsAre;
66 using ::testing::UnorderedElementsAre;
67
68 // This test exists to ensure that the different tokenizers treat different
69 // segments of text in the same manner.
class CombinedTokenizerTest : public ::testing::Test {
 protected:
  // Derives every working directory from a fresh per-test temp root so that
  // each test run starts with a clean on-disk state.
  CombinedTokenizerTest()
      : test_dir_(GetTestTempDir() + "/icing"),
        store_dir_(test_dir_ + "/store"),
        schema_store_dir_(test_dir_ + "/schema_store"),
        index_dir_(test_dir_ + "/index"),
        numeric_index_dir_(test_dir_ + "/numeric_index"),
        embedding_index_dir_(test_dir_ + "/embedding_index") {}

  // Builds the full query-processing stack (schema store, document store,
  // term/numeric/embedding indices, segmenter, normalizer, query processor).
  // NOTE: initialization order matters — later components hold raw pointers
  // to the ones created before them, and all of them outlive the test body.
  void SetUp() override {
    feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
    // Wipe any leftovers from a previous run, then recreate the directories
    // the components below expect to exist.
    filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
    filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
    filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
    filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
    // ICU-based tokenization needs its data file; the CFString/ReverseJni
    // platforms segment text without it.
    if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
      ICING_ASSERT_OK(
          // File generated via icu_data_file rule in //icing/BUILD.
          icu_data_file_helper::SetUpIcuDataFile(
              GetTestFilePath("icing/icu.dat")));
    }
    jni_cache_ = GetTestJniCache();

    ICING_ASSERT_OK_AND_ASSIGN(
        schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_,
                                           &fake_clock_, feature_flags_.get()));

    ICING_ASSERT_OK_AND_ASSIGN(
        DocumentStore::CreateResult create_result,
        DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
                              schema_store_.get(), feature_flags_.get(),
                              /*force_recovery_and_revalidate_documents=*/false,
                              /*pre_mapping_fbv=*/false,
                              /*use_persistent_hash_map=*/false,
                              PortableFileBackedProtoLog<
                                  DocumentWrapper>::kDefaultCompressionLevel,
                              /*initialize_stats=*/nullptr));
    document_store_ = std::move(create_result.document_store);

    Index::Options options(index_dir_,
                           /*index_merge_size=*/1024 * 1024,
                           /*lite_index_sort_at_indexing=*/true,
                           /*lite_index_sort_size=*/1024 * 8);
    ICING_ASSERT_OK_AND_ASSIGN(
        index_, Index::Create(options, &filesystem_, &icing_filesystem_));
    // TODO(b/249829533): switch to use persistent numeric index.
    ICING_ASSERT_OK_AND_ASSIGN(
        numeric_index_,
        DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
    ICING_ASSERT_OK_AND_ASSIGN(
        embedding_index_,
        EmbeddingIndex::Create(&filesystem_, embedding_index_dir_, &fake_clock_,
                               feature_flags_.get()));

    language_segmenter_factory::SegmenterOptions segmenter_options(
        ULOC_US, jni_cache_.get());
    ICING_ASSERT_OK_AND_ASSIGN(
        lang_segmenter_,
        language_segmenter_factory::Create(std::move(segmenter_options)));

    ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
                                                /*max_term_byte_size=*/1000));
    // The query processor borrows raw pointers to everything built above.
    ICING_ASSERT_OK_AND_ASSIGN(
        query_processor_,
        QueryProcessor::Create(
            index_.get(), numeric_index_.get(), embedding_index_.get(),
            lang_segmenter_.get(), normalizer_.get(), document_store_.get(),
            schema_store_.get(), /*join_children_fetcher=*/nullptr,
            &fake_clock_, feature_flags_.get()));
  }

  // Parses `query` with PREFIX term matching and returns all query terms,
  // flattened across the per-section entries of the parsed result. Term
  // order within the returned vector is not guaranteed (tests use
  // UnorderedElementsAre accordingly). Terms come back normalized by the
  // query parser.
  libtextclassifier3::StatusOr<std::vector<std::string>> GetQueryTerms(
      std::string_view query) {
    SearchSpecProto search_spec;
    search_spec.set_query(std::string(query));
    search_spec.set_term_match_type(TermMatchType::PREFIX);
    ICING_ASSIGN_OR_RETURN(
        QueryResults parsed_query,
        query_processor_->ParseSearch(
            search_spec, ScoringSpecProto::RankingStrategy::NONE,
            /*current_time_ms=*/0, /*search_stats=*/nullptr));

    std::vector<std::string> query_terms;
    const SectionRestrictQueryTermsMap& query_terms_map =
        parsed_query.query_terms;
    for (const auto& [section_id, terms] : query_terms_map) {
      std::copy(terms.begin(), terms.end(), std::back_inserter(query_terms));
    }
    return query_terms;
  }

  std::unique_ptr<FeatureFlags> feature_flags_;
  Filesystem filesystem_;
  const std::string test_dir_;
  const std::string store_dir_;
  const std::string schema_store_dir_;

  IcingFilesystem icing_filesystem_;
  const std::string index_dir_;
  const std::string numeric_index_dir_;
  const std::string embedding_index_dir_;

  std::unique_ptr<const JniCache> jni_cache_;
  std::unique_ptr<LanguageSegmenter> lang_segmenter_;
  std::unique_ptr<QueryProcessor> query_processor_;

  std::unique_ptr<Index> index_;
  std::unique_ptr<NumericIndex<int64_t>> numeric_index_;
  std::unique_ptr<EmbeddingIndex> embedding_index_;
  std::unique_ptr<Normalizer> normalizer_;
  FakeClock fake_clock_;
  std::unique_ptr<SchemaStore> schema_store_;
  std::unique_ptr<DocumentStore> document_store_;
};
185
GetTokenTerms(const std::vector<Token> & tokens)186 std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
187 std::vector<std::string> terms;
188 terms.reserve(tokens.size());
189 for (const Token& token : tokens) {
190 if (token.type == Token::Type::REGULAR) {
191 terms.push_back(std::string(token.text));
192 }
193 }
194 return terms;
195 }
196
197 } // namespace
198
TEST_F(CombinedTokenizerTest, SpecialCharacters) {
  const std::string_view kText = " Hello! Goodbye?";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Indexing side: tokenize and keep only the REGULAR tokens' text.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  std::vector<std::string> terms_from_indexing = GetTokenTerms(tokens);
  EXPECT_THAT(terms_from_indexing, ElementsAre("", "Hello", "Goodbye"));

  // Query side: same term boundaries are expected, but the query parser
  // also normalizes the terms (hence the lowercased expectations).
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> terms_from_query,
                             GetQueryTerms(kText));
  EXPECT_THAT(terms_from_query, UnorderedElementsAre("", "hello", "goodbye"));
}
216
TEST_F(CombinedTokenizerTest, Parentheses) {
  const std::string_view kText = "((paren1)(paren2) (last paren))";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Parentheses act purely as separators for the indexing tokenizer.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  std::vector<std::string> terms_from_indexing = GetTokenTerms(tokens);
  EXPECT_THAT(terms_from_indexing,
              ElementsAre("paren1", "paren2", "last", "paren"));

  // The query side must produce the same set of terms.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> terms_from_query,
                             GetQueryTerms(kText));
  EXPECT_THAT(terms_from_query,
              UnorderedElementsAre("paren1", "paren2", "last", "paren"));
}
234
TEST_F(CombinedTokenizerTest, Negation) {
  const std::string_view kText = "-foo -bar -baz";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // The indexing tokenizer drops the leading '-' and keeps the bare terms.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  std::vector<std::string> terms_from_indexing = GetTokenTerms(tokens);
  EXPECT_THAT(terms_from_indexing, ElementsAre("foo", "bar", "baz"));

  // On the query side the '-' must be escaped so the parser yields the same
  // terms instead of interpreting it as an operator.
  const std::string_view kQueryText = "\\-foo \\-bar \\-baz";
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> terms_from_query,
                             GetQueryTerms(kQueryText));
  EXPECT_THAT(terms_from_query, UnorderedElementsAre("foo", "bar", "baz"));
}
253
254 // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest, Colons) {
  const std::string_view kText = ":foo: :bar baz:";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Leading/trailing colons are stripped by the indexing tokenizer.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  std::vector<std::string> terms_from_indexing = GetTokenTerms(tokens);
  EXPECT_THAT(terms_from_indexing, ElementsAre("foo", "bar", "baz"));

  // Colons must be escaped in the query so they are not parsed as the
  // property-restrict separator; the resulting terms should still match.
  const std::string_view kQueryText = "\\:foo\\: \\:bar baz\\:";
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> terms_from_query,
                             GetQueryTerms(kQueryText));
  EXPECT_THAT(terms_from_query, UnorderedElementsAre("foo", "bar", "baz"));
}
272
273 // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  if (GetIcuTokenizationVersion() >= 72) {
    // In ICU 72+ and above, ':' are no longer considered word connectors, so
    // colon-joined text splits into separate terms on both sides.
    constexpr std::string_view kText = "foo:bar";
    ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                               plain_tokenizer->TokenizeAll(kText));
    std::vector<std::string> terms = GetTokenTerms(tokens);
    EXPECT_THAT(terms, ElementsAre("foo", "bar"));

    const std::string_view kQueryText = "foo\\:bar";
    ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                               GetQueryTerms(kQueryText));
    EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo", "bar"));

    constexpr std::string_view kText2 = "foo:bar:baz";
    ICING_ASSERT_OK_AND_ASSIGN(tokens, plain_tokenizer->TokenizeAll(kText2));
    terms = GetTokenTerms(tokens);
    EXPECT_THAT(terms, ElementsAre("foo", "bar", "baz"));

    const std::string_view kQueryText2 = "foo\\:bar\\:baz";
    ICING_ASSERT_OK_AND_ASSIGN(parsed_terms, GetQueryTerms(kQueryText2));
    EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo", "bar", "baz"));
  } else {
    // Pre-72 ICU treats ':' as a word connector, so the colon-joined text
    // stays a single term on both sides.
    constexpr std::string_view kText = "foo:bar";
    constexpr std::string_view kQueryText = "foo\\:bar";
    ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                               plain_tokenizer->TokenizeAll(kText));
    std::vector<std::string> terms = GetTokenTerms(tokens);
    EXPECT_THAT(terms, ElementsAre("foo:bar"));

    ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                               GetQueryTerms(kQueryText));
    EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo:bar"));

    constexpr std::string_view kText2 = "foo:bar:baz";
    constexpr std::string_view kQueryText2 = "foo\\:bar\\:baz";
    ICING_ASSERT_OK_AND_ASSIGN(tokens, plain_tokenizer->TokenizeAll(kText2));
    terms = GetTokenTerms(tokens);
    EXPECT_THAT(terms, ElementsAre("foo:bar:baz"));

    ICING_ASSERT_OK_AND_ASSIGN(parsed_terms, GetQueryTerms(kQueryText2));
    EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo:bar:baz"));
  }
}
326
TEST_F(CombinedTokenizerTest, Punctuation) {
  const std::string_view kText = "Who? What!? Why & How";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Punctuation and '&' are stripped; only the words remain.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  std::vector<std::string> terms_from_indexing = GetTokenTerms(tokens);
  EXPECT_THAT(terms_from_indexing, ElementsAre("Who", "What", "Why", "How"));

  // Query side matches, modulo the normalization (lowercasing) the query
  // parser applies to its terms.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> terms_from_query,
                             GetQueryTerms(kText));
  EXPECT_THAT(terms_from_query,
              UnorderedElementsAre("who", "what", "why", "how"));
}
344
345 } // namespace lib
346 } // namespace icing
347