xref: /aosp_15_r20/external/icing/icing/query/suggestion-processor.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/suggestion-processor.h"
16 
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/feature-flags.h"
30 #include "icing/index/embed/embedding-index.h"
31 #include "icing/index/index.h"
32 #include "icing/index/iterator/doc-hit-info-iterator.h"
33 #include "icing/index/numeric/numeric-index.h"
34 #include "icing/index/term-metadata.h"
35 #include "icing/proto/search.pb.h"
36 #include "icing/query/query-processor.h"
37 #include "icing/query/query-results.h"
38 #include "icing/schema/schema-store.h"
39 #include "icing/schema/section.h"
40 #include "icing/store/document-filter-data.h"
41 #include "icing/store/document-id.h"
42 #include "icing/store/document-store.h"
43 #include "icing/store/namespace-id.h"
44 #include "icing/store/suggestion-result-checker-impl.h"
45 #include "icing/tokenization/language-segmenter.h"
46 #include "icing/transform/normalizer.h"
47 #include "icing/util/clock.h"
48 #include "icing/util/status-macros.h"
49 
50 namespace icing {
51 namespace lib {
52 
53 libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
Create(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock,const FeatureFlags * feature_flags)54 SuggestionProcessor::Create(Index* index,
55                             const NumericIndex<int64_t>* numeric_index,
56                             const EmbeddingIndex* embedding_index,
57                             const LanguageSegmenter* language_segmenter,
58                             const Normalizer* normalizer,
59                             const DocumentStore* document_store,
60                             const SchemaStore* schema_store, const Clock* clock,
61                             const FeatureFlags* feature_flags) {
62   ICING_RETURN_ERROR_IF_NULL(index);
63   ICING_RETURN_ERROR_IF_NULL(numeric_index);
64   ICING_RETURN_ERROR_IF_NULL(embedding_index);
65   ICING_RETURN_ERROR_IF_NULL(language_segmenter);
66   ICING_RETURN_ERROR_IF_NULL(normalizer);
67   ICING_RETURN_ERROR_IF_NULL(document_store);
68   ICING_RETURN_ERROR_IF_NULL(schema_store);
69   ICING_RETURN_ERROR_IF_NULL(clock);
70   ICING_RETURN_ERROR_IF_NULL(feature_flags);
71 
72   return std::unique_ptr<SuggestionProcessor>(new SuggestionProcessor(
73       index, numeric_index, embedding_index, language_segmenter, normalizer,
74       document_store, schema_store, clock, feature_flags));
75 }
76 
77 libtextclassifier3::StatusOr<
78     std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>>
PopulateDocumentIdFilters(const DocumentStore * document_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<NamespaceId> & namespace_ids)79 PopulateDocumentIdFilters(
80     const DocumentStore* document_store,
81     const icing::lib::SuggestionSpecProto& suggestion_spec,
82     const std::unordered_set<NamespaceId>& namespace_ids) {
83   std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
84       document_id_filter_map;
85   document_id_filter_map.reserve(suggestion_spec.document_uri_filters_size());
86   for (const NamespaceDocumentUriGroup& namespace_document_uri_group :
87        suggestion_spec.document_uri_filters()) {
88     auto namespace_id_or = document_store->GetNamespaceId(
89         namespace_document_uri_group.namespace_());
90     if (!namespace_id_or.ok()) {
91       // The current namespace doesn't exist.
92       continue;
93     }
94     NamespaceId namespace_id = namespace_id_or.ValueOrDie();
95     if (!namespace_ids.empty() &&
96         namespace_ids.find(namespace_id) == namespace_ids.end()) {
97       // The current namespace doesn't appear in the namespace filter.
98       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
99           "The namespace : ", namespace_document_uri_group.namespace_(),
100           " appears in the document uri filter, but doesn't appear in the "
101           "namespace filter."));
102     }
103 
104     if (namespace_document_uri_group.document_uris().empty()) {
105       // Client should use namespace filter to filter out all document under
106       // a namespace.
107       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
108           "The namespace : ", namespace_document_uri_group.namespace_(),
109           " has empty document uri in the document uri filter. Please use the "
110           "namespace filter to exclude a namespace instead of the document uri "
111           "filter."));
112     }
113 
114     // Translate namespace document Uris into document_ids
115     std::unordered_set<DocumentId> target_document_ids;
116     target_document_ids.reserve(
117         namespace_document_uri_group.document_uris_size());
118     for (std::string_view document_uri :
119          namespace_document_uri_group.document_uris()) {
120       auto document_id_or = document_store->GetDocumentId(
121           namespace_document_uri_group.namespace_(), document_uri);
122       if (!document_id_or.ok()) {
123         continue;
124       }
125       target_document_ids.insert(document_id_or.ValueOrDie());
126     }
127     document_id_filter_map.insert({namespace_id, target_document_ids});
128   }
129   return document_id_filter_map;
130 }
131 
132 libtextclassifier3::StatusOr<std::unordered_map<SchemaTypeId, SectionIdMask>>
PopulatePropertyFilters(const SchemaStore * schema_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<SchemaTypeId> & schema_type_ids)133 PopulatePropertyFilters(
134     const SchemaStore* schema_store,
135     const icing::lib::SuggestionSpecProto& suggestion_spec,
136     const std::unordered_set<SchemaTypeId>& schema_type_ids) {
137   std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map;
138   property_filter_map.reserve(suggestion_spec.type_property_filters_size());
139   for (const TypePropertyMask& type_field_mask :
140        suggestion_spec.type_property_filters()) {
141     auto schema_type_id_or =
142         schema_store->GetSchemaTypeId(type_field_mask.schema_type());
143     if (!schema_type_id_or.ok()) {
144       // The current schema doesn't exist
145       continue;
146     }
147     SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
148 
149     if (!schema_type_ids.empty() &&
150         schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
151       // The current schema type doesn't appear in the schema type filter.
152       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
153           "The schema : ", type_field_mask.schema_type(),
154           " appears in the property filter, but doesn't appear in the schema"
155           " type filter."));
156     }
157 
158     if (type_field_mask.paths().empty()) {
159       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
160           "The schema type : ", type_field_mask.schema_type(),
161           " has empty path in the property filter. Please use the schema type"
162           " filter to exclude a schema type instead of the property filter."));
163     }
164 
165     // Translate property paths into section id mask
166     SectionIdMask section_mask = kSectionIdMaskNone;
167     auto section_metadata_list_or =
168         schema_store->GetSectionMetadata(type_field_mask.schema_type());
169     if (!section_metadata_list_or.ok()) {
170       // The current schema doesn't has section metadata.
171       continue;
172     }
173     std::unordered_set<std::string> target_property_paths;
174     target_property_paths.reserve(type_field_mask.paths_size());
175     for (const std::string& target_property_path : type_field_mask.paths()) {
176       target_property_paths.insert(target_property_path);
177     }
178     const std::vector<SectionMetadata>* section_metadata_list =
179         section_metadata_list_or.ValueOrDie();
180     for (const SectionMetadata& section_metadata : *section_metadata_list) {
181       if (target_property_paths.find(section_metadata.path) !=
182           target_property_paths.end()) {
183         section_mask |= UINT64_C(1) << section_metadata.id;
184       }
185     }
186     property_filter_map.insert({schema_type_id, section_mask});
187   }
188   return property_filter_map;
189 }
190 
191 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
QuerySuggestions(const icing::lib::SuggestionSpecProto & suggestion_spec,int64_t current_time_ms)192 SuggestionProcessor::QuerySuggestions(
193     const icing::lib::SuggestionSpecProto& suggestion_spec,
194     int64_t current_time_ms) {
195   // We use query tokenizer to tokenize the give prefix, and we only use the
196   // last token to be the suggestion prefix.
197 
198   // Populate target namespace filter.
199   std::unordered_set<NamespaceId> namespace_ids;
200   namespace_ids.reserve(suggestion_spec.namespace_filters_size());
201   for (std::string_view name_space : suggestion_spec.namespace_filters()) {
202     auto namespace_id_or = document_store_.GetNamespaceId(name_space);
203     if (!namespace_id_or.ok()) {
204       // The current namespace doesn't exist.
205       continue;
206     }
207     namespace_ids.insert(namespace_id_or.ValueOrDie());
208   }
209   if (namespace_ids.empty() && !suggestion_spec.namespace_filters().empty()) {
210     // None of desired namespace exists, we should return directly.
211     return std::vector<TermMetadata>();
212   }
213 
214   // Populate target document id filter.
215   auto document_id_filter_map_or = PopulateDocumentIdFilters(
216       &document_store_, suggestion_spec, namespace_ids);
217   if (!document_id_filter_map_or.ok()) {
218     return std::move(document_id_filter_map_or).status();
219   }
220 
221   std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
222       document_id_filter_map = document_id_filter_map_or.ValueOrDie();
223   if (document_id_filter_map.empty() &&
224       !suggestion_spec.document_uri_filters().empty()) {
225     // None of desired DocumentId exists, we should return directly.
226     return std::vector<TermMetadata>();
227   }
228 
229   // Populate target schema type filter.
230   std::unordered_set<SchemaTypeId> schema_type_ids;
231   schema_type_ids.reserve(suggestion_spec.schema_type_filters_size());
232   for (std::string_view schema_type : suggestion_spec.schema_type_filters()) {
233     auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
234     if (!schema_type_id_or.ok()) {
235       continue;
236     }
237     schema_type_ids.insert(schema_type_id_or.ValueOrDie());
238   }
239   if (schema_type_ids.empty() &&
240       !suggestion_spec.schema_type_filters().empty()) {
241     // None of desired schema type exists, we should return directly.
242     return std::vector<TermMetadata>();
243   }
244 
245   // Populate target properties filter.
246   auto property_filter_map_or =
247       PopulatePropertyFilters(&schema_store_, suggestion_spec, schema_type_ids);
248   if (!property_filter_map_or.ok()) {
249     return std::move(property_filter_map_or).status();
250   }
251   std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map =
252       property_filter_map_or.ValueOrDie();
253 
254   ICING_ASSIGN_OR_RETURN(
255       std::unique_ptr<QueryProcessor> query_processor,
256       QueryProcessor::Create(
257           &index_, &numeric_index_, &embedding_index_, &language_segmenter_,
258           &normalizer_, &document_store_, &schema_store_,
259           /*join_children_fetcher=*/nullptr, &clock_, &feature_flags_));
260 
261   SearchSpecProto search_spec;
262   search_spec.set_query(suggestion_spec.prefix());
263   search_spec.set_term_match_type(
264       suggestion_spec.scoring_spec().scoring_match_type());
265   for (const PropertyProto::VectorProto& vector :
266        suggestion_spec.embedding_query_vectors()) {
267     *search_spec.add_embedding_query_vectors() = vector;
268   }
269   search_spec.set_embedding_query_metric_type(
270       suggestion_spec.embedding_query_metric_type());
271   for (const std::string& query_parameter_string :
272        suggestion_spec.query_parameter_strings()) {
273     search_spec.add_query_parameter_strings(query_parameter_string);
274   }
275   for (const std::string& enabled_feature :
276        suggestion_spec.enabled_features()) {
277     search_spec.add_enabled_features(enabled_feature);
278   }
279   ICING_ASSIGN_OR_RETURN(
280       QueryResults query_results,
281       query_processor->ParseSearch(search_spec,
282                                    ScoringSpecProto::RankingStrategy::NONE,
283                                    current_time_ms));
284 
285   ICING_ASSIGN_OR_RETURN(
286       DocHitInfoIterator::TrimmedNode trimmed_node,
287       std::move(*query_results.root_iterator).TrimRightMostNode());
288 
289   // If the position of the last token is not the end of the prefix, it means
290   // there should be some operator tokens after it and are ignored by the
291   // tokenizer.
292   bool is_last_token =
293       trimmed_node.term_start_index_ + trimmed_node.unnormalized_term_length_ >=
294       suggestion_spec.prefix().length();
295 
296   if (!is_last_token || trimmed_node.term_.empty()) {
297     // We don't have a valid last token, return early.
298     return std::vector<TermMetadata>();
299   }
300 
301   // Populate the search base in document ids.
302   // Suggestions are only generated for the very last term,
303   // trimmed_node.iterator_ tracks search results for all previous terms. If it
304   // is null means there is no pervious term and we are generating suggetion for
305   // a single term.
306   std::unordered_set<DocumentId> search_base;
307   if (trimmed_node.iterator_ != nullptr) {
308     while (trimmed_node.iterator_->Advance().ok()) {
309       search_base.insert(trimmed_node.iterator_->doc_hit_info().document_id());
310     }
311     if (search_base.empty()) {
312       // Nothing matches the previous terms in the query. There are no valid
313       // suggestions to make, we should return directly.
314       return std::vector<TermMetadata>();
315     }
316   }
317 
318   // Create result checker based on given filters.
319   SuggestionResultCheckerImpl suggestion_result_checker_impl(
320       &document_store_, &schema_store_, std::move(namespace_ids),
321       std::move(document_id_filter_map), std::move(schema_type_ids),
322       std::move(property_filter_map), std::move(trimmed_node.target_section_),
323       std::move(search_base), current_time_ms);
324   // TODO(b/228240987) support generate suggestion and append suffix for advance
325   // query and function call.
326   std::string query_prefix =
327       suggestion_spec.prefix().substr(0, trimmed_node.term_start_index_);
328   // Run suggestion based on given SuggestionSpec.
329   // Normalize token text to lowercase since all tokens in the lexicon are
330   // lowercase.
331   ICING_ASSIGN_OR_RETURN(
332       std::vector<TermMetadata> terms,
333       index_.FindTermsByPrefix(
334           trimmed_node.term_, suggestion_spec.num_to_return(),
335           suggestion_spec.scoring_spec().scoring_match_type(),
336           suggestion_spec.scoring_spec().rank_by(),
337           &suggestion_result_checker_impl));
338   for (TermMetadata& term : terms) {
339     term.content = query_prefix + term.content;
340   }
341   return terms;
342 }
343 
SuggestionProcessor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock,const FeatureFlags * feature_flags)344 SuggestionProcessor::SuggestionProcessor(
345     Index* index, const NumericIndex<int64_t>* numeric_index,
346     const EmbeddingIndex* embedding_index,
347     const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
348     const DocumentStore* document_store, const SchemaStore* schema_store,
349     const Clock* clock, const FeatureFlags* feature_flags)
350     : index_(*index),
351       numeric_index_(*numeric_index),
352       embedding_index_(*embedding_index),
353       language_segmenter_(*language_segmenter),
354       normalizer_(*normalizer),
355       document_store_(*document_store),
356       schema_store_(*schema_store),
357       clock_(*clock),
358       feature_flags_(*feature_flags) {}
359 
360 }  // namespace lib
361 }  // namespace icing
362