xref: /aosp_15_r20/external/icing/icing/tokenization/combined-tokenizer_test.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include <algorithm>
16 #include <cstdint>
17 #include <iterator>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "gmock/gmock.h"
26 #include "gtest/gtest.h"
27 #include "icing/feature-flags.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/portable-file-backed-proto-log.h"
30 #include "icing/index/embed/embedding-index.h"
31 #include "icing/index/index.h"
32 #include "icing/index/numeric/dummy-numeric-index.h"
33 #include "icing/index/numeric/numeric-index.h"
34 #include "icing/jni/jni-cache.h"
35 #include "icing/legacy/index/icing-filesystem.h"
36 #include "icing/portable/platform.h"
37 #include "icing/proto/schema.pb.h"
38 #include "icing/query/query-processor.h"
39 #include "icing/query/query-results.h"
40 #include "icing/query/query-terms.h"
41 #include "icing/schema/schema-store.h"
42 #include "icing/store/document-store.h"
43 #include "icing/testing/common-matchers.h"
44 #include "icing/testing/fake-clock.h"
45 #include "icing/testing/jni-test-helpers.h"
46 #include "icing/testing/test-data.h"
47 #include "icing/testing/test-feature-flags.h"
48 #include "icing/testing/tmp-directory.h"
49 #include "icing/tokenization/language-segmenter-factory.h"
50 #include "icing/tokenization/language-segmenter.h"
51 #include "icing/tokenization/token.h"
52 #include "icing/tokenization/tokenizer-factory.h"
53 #include "icing/tokenization/tokenizer.h"
54 #include "icing/transform/normalizer-factory.h"
55 #include "icing/transform/normalizer.h"
56 #include "icing/util/icu-data-file-helper.h"
57 #include "icing/util/status-macros.h"
58 #include "unicode/uloc.h"
59 
60 namespace icing {
61 namespace lib {
62 
63 namespace {
64 
65 using ::testing::ElementsAre;
66 using ::testing::UnorderedElementsAre;
67 
68 // This test exists to ensure that the different tokenizers treat different
69 // segments of text in the same manner.
70 class CombinedTokenizerTest : public ::testing::Test {
71  protected:
CombinedTokenizerTest()72   CombinedTokenizerTest()
73       : test_dir_(GetTestTempDir() + "/icing"),
74         store_dir_(test_dir_ + "/store"),
75         schema_store_dir_(test_dir_ + "/schema_store"),
76         index_dir_(test_dir_ + "/index"),
77         numeric_index_dir_(test_dir_ + "/numeric_index"),
78         embedding_index_dir_(test_dir_ + "/embedding_index") {}
79 
SetUp()80   void SetUp() override {
81     feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
82     filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
83     filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
84     filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
85     filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
86     if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
87       ICING_ASSERT_OK(
88           // File generated via icu_data_file rule in //icing/BUILD.
89           icu_data_file_helper::SetUpIcuDataFile(
90               GetTestFilePath("icing/icu.dat")));
91     }
92     jni_cache_ = GetTestJniCache();
93 
94     ICING_ASSERT_OK_AND_ASSIGN(
95         schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_,
96                                            &fake_clock_, feature_flags_.get()));
97 
98     ICING_ASSERT_OK_AND_ASSIGN(
99         DocumentStore::CreateResult create_result,
100         DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
101                               schema_store_.get(), feature_flags_.get(),
102                               /*force_recovery_and_revalidate_documents=*/false,
103                               /*pre_mapping_fbv=*/false,
104                               /*use_persistent_hash_map=*/false,
105                               PortableFileBackedProtoLog<
106                                   DocumentWrapper>::kDefaultCompressionLevel,
107                               /*initialize_stats=*/nullptr));
108     document_store_ = std::move(create_result.document_store);
109 
110     Index::Options options(index_dir_,
111                            /*index_merge_size=*/1024 * 1024,
112                            /*lite_index_sort_at_indexing=*/true,
113                            /*lite_index_sort_size=*/1024 * 8);
114     ICING_ASSERT_OK_AND_ASSIGN(
115         index_, Index::Create(options, &filesystem_, &icing_filesystem_));
116     // TODO(b/249829533): switch to use persistent numeric index.
117     ICING_ASSERT_OK_AND_ASSIGN(
118         numeric_index_,
119         DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
120     ICING_ASSERT_OK_AND_ASSIGN(
121         embedding_index_,
122         EmbeddingIndex::Create(&filesystem_, embedding_index_dir_, &fake_clock_,
123                                feature_flags_.get()));
124 
125     language_segmenter_factory::SegmenterOptions segmenter_options(
126         ULOC_US, jni_cache_.get());
127     ICING_ASSERT_OK_AND_ASSIGN(
128         lang_segmenter_,
129         language_segmenter_factory::Create(std::move(segmenter_options)));
130 
131     ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
132                                                 /*max_term_byte_size=*/1000));
133     ICING_ASSERT_OK_AND_ASSIGN(
134         query_processor_,
135         QueryProcessor::Create(
136             index_.get(), numeric_index_.get(), embedding_index_.get(),
137             lang_segmenter_.get(), normalizer_.get(), document_store_.get(),
138             schema_store_.get(), /*join_children_fetcher=*/nullptr,
139             &fake_clock_, feature_flags_.get()));
140   }
141 
GetQueryTerms(std::string_view query)142   libtextclassifier3::StatusOr<std::vector<std::string>> GetQueryTerms(
143       std::string_view query) {
144     SearchSpecProto search_spec;
145     search_spec.set_query(std::string(query));
146     search_spec.set_term_match_type(TermMatchType::PREFIX);
147     ICING_ASSIGN_OR_RETURN(
148         QueryResults parsed_query,
149         query_processor_->ParseSearch(
150             search_spec, ScoringSpecProto::RankingStrategy::NONE,
151             /*current_time_ms=*/0, /*search_stats=*/nullptr));
152 
153     std::vector<std::string> query_terms;
154     const SectionRestrictQueryTermsMap& query_terms_map =
155         parsed_query.query_terms;
156     for (const auto& [section_id, terms] : query_terms_map) {
157       std::copy(terms.begin(), terms.end(), std::back_inserter(query_terms));
158     }
159     return query_terms;
160   }
161 
162   std::unique_ptr<FeatureFlags> feature_flags_;
163   Filesystem filesystem_;
164   const std::string test_dir_;
165   const std::string store_dir_;
166   const std::string schema_store_dir_;
167 
168   IcingFilesystem icing_filesystem_;
169   const std::string index_dir_;
170   const std::string numeric_index_dir_;
171   const std::string embedding_index_dir_;
172 
173   std::unique_ptr<const JniCache> jni_cache_;
174   std::unique_ptr<LanguageSegmenter> lang_segmenter_;
175   std::unique_ptr<QueryProcessor> query_processor_;
176 
177   std::unique_ptr<Index> index_;
178   std::unique_ptr<NumericIndex<int64_t>> numeric_index_;
179   std::unique_ptr<EmbeddingIndex> embedding_index_;
180   std::unique_ptr<Normalizer> normalizer_;
181   FakeClock fake_clock_;
182   std::unique_ptr<SchemaStore> schema_store_;
183   std::unique_ptr<DocumentStore> document_store_;
184 };
185 
GetTokenTerms(const std::vector<Token> & tokens)186 std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
187   std::vector<std::string> terms;
188   terms.reserve(tokens.size());
189   for (const Token& token : tokens) {
190     if (token.type == Token::Type::REGULAR) {
191       terms.push_back(std::string(token.text));
192     }
193   }
194   return terms;
195 }
196 
197 }  // namespace
198 
// A non-BMP emoji (a single code point that needs four bytes in UTF-8) must
// come out as exactly one term from both the indexing tokenizer and the query
// parser, while trailing '!' and '?' are dropped from the surrounding words.
TEST_F(CombinedTokenizerTest, SpecialCharacters) {
  const std::string_view kText = "😊 Hello! Goodbye?";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
                             GetQueryTerms(kText));
  // NOTE: The query parser will also normalize query terms
  EXPECT_THAT(query_terms, UnorderedElementsAre("😊", "hello", "goodbye"));
}
216 
// Nested and adjacent parentheses must not leak into the terms: both the
// indexing tokenizer and the query parser are expected to yield only the
// words that the parentheses enclose.
TEST_F(CombinedTokenizerTest, Parentheses) {
  constexpr std::string_view kText = "((paren1)(paren2) (last paren))";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Indexing side: token order is part of the expectation.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  EXPECT_THAT(GetTokenTerms(tokens),
              ElementsAre("paren1", "paren2", "last", "paren"));

  // Query side: the same text is parsed as a query; order is not checked.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kText));
  EXPECT_THAT(parsed_terms,
              UnorderedElementsAre("paren1", "paren2", "last", "paren"));
}
234 
// A leading '-' must be dropped from each word by the indexing tokenizer, and
// the query parser must yield the same bare words when the '-' is escaped.
TEST_F(CombinedTokenizerTest, Negation) {
  constexpr std::string_view kText = "-foo -bar -baz";
  // The query form escapes every '-' with a backslash.
  // NOTE(review): presumably an unescaped '-' would be read as a query
  // operator rather than as text - confirm against the query grammar.
  constexpr std::string_view kQueryText = "\\-foo \\-bar \\-baz";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  EXPECT_THAT(GetTokenTerms(tokens), ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kQueryText));
  EXPECT_THAT(parsed_terms, UnorderedElementsAre("foo", "bar", "baz"));
}
253 
254 // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest,Colons)255 TEST_F(CombinedTokenizerTest, Colons) {
256   const std::string_view kText = ":foo: :bar baz:";
257   ICING_ASSERT_OK_AND_ASSIGN(
258       std::unique_ptr<Tokenizer> indexing_tokenizer,
259       tokenizer_factory::CreateIndexingTokenizer(
260           StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
261 
262   ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
263                              indexing_tokenizer->TokenizeAll(kText));
264   std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
265   EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
266 
267   const std::string_view kQueryText = "\\:foo\\: \\:bar baz\\:";
268   ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
269                              GetQueryTerms(kQueryText));
270   EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar", "baz"));
271 }
272 
273 // TODO(b/254874614): Handle colon word breaks in ICU 72+
TEST_F(CombinedTokenizerTest,ColonsPropertyRestricts)274 TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
275   ICING_ASSERT_OK_AND_ASSIGN(
276       std::unique_ptr<Tokenizer> indexing_tokenizer,
277       tokenizer_factory::CreateIndexingTokenizer(
278           StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
279 
280   if (GetIcuTokenizationVersion() >= 72) {
281     // In ICU 72+ and above, ':' are no longer considered word connectors.
282     constexpr std::string_view kText = "foo:bar";
283     ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
284                                indexing_tokenizer->TokenizeAll(kText));
285     std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
286     EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar"));
287 
288     const std::string_view kQueryText = "foo\\:bar";
289     ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
290                               GetQueryTerms(kQueryText));
291     EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar"));
292 
293     constexpr std::string_view kText2 = "foo:bar:baz";
294     ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
295                                indexing_tokenizer->TokenizeAll(kText2));
296     indexing_terms = GetTokenTerms(indexing_tokens);
297     EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
298 
299     const std::string_view kQueryText2 = "foo\\:bar\\:baz";
300     ICING_ASSERT_OK_AND_ASSIGN(query_terms,
301                               GetQueryTerms(kQueryText2));
302     EXPECT_THAT(query_terms, UnorderedElementsAre("foo", "bar", "baz"));
303   } else {
304     constexpr std::string_view kText = "foo:bar";
305     constexpr std::string_view kQueryText = "foo\\:bar";
306     ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
307                                indexing_tokenizer->TokenizeAll(kText));
308     std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
309     EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
310 
311     ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> query_terms,
312                                GetQueryTerms(kQueryText));
313     EXPECT_THAT(query_terms, UnorderedElementsAre("foo:bar"));
314 
315     constexpr std::string_view kText2 = "foo:bar:baz";
316     constexpr std::string_view kQueryText2 = "foo\\:bar\\:baz";
317     ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
318                                indexing_tokenizer->TokenizeAll(kText2));
319     indexing_terms = GetTokenTerms(indexing_tokens);
320     EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
321 
322     ICING_ASSERT_OK_AND_ASSIGN(query_terms, GetQueryTerms(kQueryText2));
323     EXPECT_THAT(query_terms, UnorderedElementsAre("foo:bar:baz"));
324   }
325 }
326 
// Sentence punctuation ('?', '!', '&') must never surface as a term; only the
// words around it are expected from the indexing tokenizer and the query
// parser.
TEST_F(CombinedTokenizerTest, Punctuation) {
  constexpr std::string_view kText = "Who? What!? Why & How";

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> plain_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  // Indexing keeps the original casing.
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> tokens,
                             plain_tokenizer->TokenizeAll(kText));
  EXPECT_THAT(GetTokenTerms(tokens), ElementsAre("Who", "What", "Why", "How"));

  // NOTE: The query parser will also normalize query terms
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string> parsed_terms,
                             GetQueryTerms(kText));
  EXPECT_THAT(parsed_terms, UnorderedElementsAre("who", "what", "why", "how"));
}
344 
345 }  // namespace lib
346 }  // namespace icing
347