1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/suggestion-processor.h"
16
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/feature-flags.h"
30 #include "icing/index/embed/embedding-index.h"
31 #include "icing/index/index.h"
32 #include "icing/index/iterator/doc-hit-info-iterator.h"
33 #include "icing/index/numeric/numeric-index.h"
34 #include "icing/index/term-metadata.h"
35 #include "icing/proto/search.pb.h"
36 #include "icing/query/query-processor.h"
37 #include "icing/query/query-results.h"
38 #include "icing/schema/schema-store.h"
39 #include "icing/schema/section.h"
40 #include "icing/store/document-filter-data.h"
41 #include "icing/store/document-id.h"
42 #include "icing/store/document-store.h"
43 #include "icing/store/namespace-id.h"
44 #include "icing/store/suggestion-result-checker-impl.h"
45 #include "icing/tokenization/language-segmenter.h"
46 #include "icing/transform/normalizer.h"
47 #include "icing/util/clock.h"
48 #include "icing/util/status-macros.h"
49
50 namespace icing {
51 namespace lib {
52
53 libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
Create(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock,const FeatureFlags * feature_flags)54 SuggestionProcessor::Create(Index* index,
55 const NumericIndex<int64_t>* numeric_index,
56 const EmbeddingIndex* embedding_index,
57 const LanguageSegmenter* language_segmenter,
58 const Normalizer* normalizer,
59 const DocumentStore* document_store,
60 const SchemaStore* schema_store, const Clock* clock,
61 const FeatureFlags* feature_flags) {
62 ICING_RETURN_ERROR_IF_NULL(index);
63 ICING_RETURN_ERROR_IF_NULL(numeric_index);
64 ICING_RETURN_ERROR_IF_NULL(embedding_index);
65 ICING_RETURN_ERROR_IF_NULL(language_segmenter);
66 ICING_RETURN_ERROR_IF_NULL(normalizer);
67 ICING_RETURN_ERROR_IF_NULL(document_store);
68 ICING_RETURN_ERROR_IF_NULL(schema_store);
69 ICING_RETURN_ERROR_IF_NULL(clock);
70 ICING_RETURN_ERROR_IF_NULL(feature_flags);
71
72 return std::unique_ptr<SuggestionProcessor>(new SuggestionProcessor(
73 index, numeric_index, embedding_index, language_segmenter, normalizer,
74 document_store, schema_store, clock, feature_flags));
75 }
76
77 libtextclassifier3::StatusOr<
78 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>>
PopulateDocumentIdFilters(const DocumentStore * document_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<NamespaceId> & namespace_ids)79 PopulateDocumentIdFilters(
80 const DocumentStore* document_store,
81 const icing::lib::SuggestionSpecProto& suggestion_spec,
82 const std::unordered_set<NamespaceId>& namespace_ids) {
83 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
84 document_id_filter_map;
85 document_id_filter_map.reserve(suggestion_spec.document_uri_filters_size());
86 for (const NamespaceDocumentUriGroup& namespace_document_uri_group :
87 suggestion_spec.document_uri_filters()) {
88 auto namespace_id_or = document_store->GetNamespaceId(
89 namespace_document_uri_group.namespace_());
90 if (!namespace_id_or.ok()) {
91 // The current namespace doesn't exist.
92 continue;
93 }
94 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
95 if (!namespace_ids.empty() &&
96 namespace_ids.find(namespace_id) == namespace_ids.end()) {
97 // The current namespace doesn't appear in the namespace filter.
98 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
99 "The namespace : ", namespace_document_uri_group.namespace_(),
100 " appears in the document uri filter, but doesn't appear in the "
101 "namespace filter."));
102 }
103
104 if (namespace_document_uri_group.document_uris().empty()) {
105 // Client should use namespace filter to filter out all document under
106 // a namespace.
107 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
108 "The namespace : ", namespace_document_uri_group.namespace_(),
109 " has empty document uri in the document uri filter. Please use the "
110 "namespace filter to exclude a namespace instead of the document uri "
111 "filter."));
112 }
113
114 // Translate namespace document Uris into document_ids
115 std::unordered_set<DocumentId> target_document_ids;
116 target_document_ids.reserve(
117 namespace_document_uri_group.document_uris_size());
118 for (std::string_view document_uri :
119 namespace_document_uri_group.document_uris()) {
120 auto document_id_or = document_store->GetDocumentId(
121 namespace_document_uri_group.namespace_(), document_uri);
122 if (!document_id_or.ok()) {
123 continue;
124 }
125 target_document_ids.insert(document_id_or.ValueOrDie());
126 }
127 document_id_filter_map.insert({namespace_id, target_document_ids});
128 }
129 return document_id_filter_map;
130 }
131
132 libtextclassifier3::StatusOr<std::unordered_map<SchemaTypeId, SectionIdMask>>
PopulatePropertyFilters(const SchemaStore * schema_store,const icing::lib::SuggestionSpecProto & suggestion_spec,const std::unordered_set<SchemaTypeId> & schema_type_ids)133 PopulatePropertyFilters(
134 const SchemaStore* schema_store,
135 const icing::lib::SuggestionSpecProto& suggestion_spec,
136 const std::unordered_set<SchemaTypeId>& schema_type_ids) {
137 std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map;
138 property_filter_map.reserve(suggestion_spec.type_property_filters_size());
139 for (const TypePropertyMask& type_field_mask :
140 suggestion_spec.type_property_filters()) {
141 auto schema_type_id_or =
142 schema_store->GetSchemaTypeId(type_field_mask.schema_type());
143 if (!schema_type_id_or.ok()) {
144 // The current schema doesn't exist
145 continue;
146 }
147 SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
148
149 if (!schema_type_ids.empty() &&
150 schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
151 // The current schema type doesn't appear in the schema type filter.
152 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
153 "The schema : ", type_field_mask.schema_type(),
154 " appears in the property filter, but doesn't appear in the schema"
155 " type filter."));
156 }
157
158 if (type_field_mask.paths().empty()) {
159 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
160 "The schema type : ", type_field_mask.schema_type(),
161 " has empty path in the property filter. Please use the schema type"
162 " filter to exclude a schema type instead of the property filter."));
163 }
164
165 // Translate property paths into section id mask
166 SectionIdMask section_mask = kSectionIdMaskNone;
167 auto section_metadata_list_or =
168 schema_store->GetSectionMetadata(type_field_mask.schema_type());
169 if (!section_metadata_list_or.ok()) {
170 // The current schema doesn't has section metadata.
171 continue;
172 }
173 std::unordered_set<std::string> target_property_paths;
174 target_property_paths.reserve(type_field_mask.paths_size());
175 for (const std::string& target_property_path : type_field_mask.paths()) {
176 target_property_paths.insert(target_property_path);
177 }
178 const std::vector<SectionMetadata>* section_metadata_list =
179 section_metadata_list_or.ValueOrDie();
180 for (const SectionMetadata& section_metadata : *section_metadata_list) {
181 if (target_property_paths.find(section_metadata.path) !=
182 target_property_paths.end()) {
183 section_mask |= UINT64_C(1) << section_metadata.id;
184 }
185 }
186 property_filter_map.insert({schema_type_id, section_mask});
187 }
188 return property_filter_map;
189 }
190
191 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
QuerySuggestions(const icing::lib::SuggestionSpecProto & suggestion_spec,int64_t current_time_ms)192 SuggestionProcessor::QuerySuggestions(
193 const icing::lib::SuggestionSpecProto& suggestion_spec,
194 int64_t current_time_ms) {
195 // We use query tokenizer to tokenize the give prefix, and we only use the
196 // last token to be the suggestion prefix.
197
198 // Populate target namespace filter.
199 std::unordered_set<NamespaceId> namespace_ids;
200 namespace_ids.reserve(suggestion_spec.namespace_filters_size());
201 for (std::string_view name_space : suggestion_spec.namespace_filters()) {
202 auto namespace_id_or = document_store_.GetNamespaceId(name_space);
203 if (!namespace_id_or.ok()) {
204 // The current namespace doesn't exist.
205 continue;
206 }
207 namespace_ids.insert(namespace_id_or.ValueOrDie());
208 }
209 if (namespace_ids.empty() && !suggestion_spec.namespace_filters().empty()) {
210 // None of desired namespace exists, we should return directly.
211 return std::vector<TermMetadata>();
212 }
213
214 // Populate target document id filter.
215 auto document_id_filter_map_or = PopulateDocumentIdFilters(
216 &document_store_, suggestion_spec, namespace_ids);
217 if (!document_id_filter_map_or.ok()) {
218 return std::move(document_id_filter_map_or).status();
219 }
220
221 std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
222 document_id_filter_map = document_id_filter_map_or.ValueOrDie();
223 if (document_id_filter_map.empty() &&
224 !suggestion_spec.document_uri_filters().empty()) {
225 // None of desired DocumentId exists, we should return directly.
226 return std::vector<TermMetadata>();
227 }
228
229 // Populate target schema type filter.
230 std::unordered_set<SchemaTypeId> schema_type_ids;
231 schema_type_ids.reserve(suggestion_spec.schema_type_filters_size());
232 for (std::string_view schema_type : suggestion_spec.schema_type_filters()) {
233 auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
234 if (!schema_type_id_or.ok()) {
235 continue;
236 }
237 schema_type_ids.insert(schema_type_id_or.ValueOrDie());
238 }
239 if (schema_type_ids.empty() &&
240 !suggestion_spec.schema_type_filters().empty()) {
241 // None of desired schema type exists, we should return directly.
242 return std::vector<TermMetadata>();
243 }
244
245 // Populate target properties filter.
246 auto property_filter_map_or =
247 PopulatePropertyFilters(&schema_store_, suggestion_spec, schema_type_ids);
248 if (!property_filter_map_or.ok()) {
249 return std::move(property_filter_map_or).status();
250 }
251 std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map =
252 property_filter_map_or.ValueOrDie();
253
254 ICING_ASSIGN_OR_RETURN(
255 std::unique_ptr<QueryProcessor> query_processor,
256 QueryProcessor::Create(
257 &index_, &numeric_index_, &embedding_index_, &language_segmenter_,
258 &normalizer_, &document_store_, &schema_store_,
259 /*join_children_fetcher=*/nullptr, &clock_, &feature_flags_));
260
261 SearchSpecProto search_spec;
262 search_spec.set_query(suggestion_spec.prefix());
263 search_spec.set_term_match_type(
264 suggestion_spec.scoring_spec().scoring_match_type());
265 for (const PropertyProto::VectorProto& vector :
266 suggestion_spec.embedding_query_vectors()) {
267 *search_spec.add_embedding_query_vectors() = vector;
268 }
269 search_spec.set_embedding_query_metric_type(
270 suggestion_spec.embedding_query_metric_type());
271 for (const std::string& query_parameter_string :
272 suggestion_spec.query_parameter_strings()) {
273 search_spec.add_query_parameter_strings(query_parameter_string);
274 }
275 for (const std::string& enabled_feature :
276 suggestion_spec.enabled_features()) {
277 search_spec.add_enabled_features(enabled_feature);
278 }
279 ICING_ASSIGN_OR_RETURN(
280 QueryResults query_results,
281 query_processor->ParseSearch(search_spec,
282 ScoringSpecProto::RankingStrategy::NONE,
283 current_time_ms));
284
285 ICING_ASSIGN_OR_RETURN(
286 DocHitInfoIterator::TrimmedNode trimmed_node,
287 std::move(*query_results.root_iterator).TrimRightMostNode());
288
289 // If the position of the last token is not the end of the prefix, it means
290 // there should be some operator tokens after it and are ignored by the
291 // tokenizer.
292 bool is_last_token =
293 trimmed_node.term_start_index_ + trimmed_node.unnormalized_term_length_ >=
294 suggestion_spec.prefix().length();
295
296 if (!is_last_token || trimmed_node.term_.empty()) {
297 // We don't have a valid last token, return early.
298 return std::vector<TermMetadata>();
299 }
300
301 // Populate the search base in document ids.
302 // Suggestions are only generated for the very last term,
303 // trimmed_node.iterator_ tracks search results for all previous terms. If it
304 // is null means there is no pervious term and we are generating suggetion for
305 // a single term.
306 std::unordered_set<DocumentId> search_base;
307 if (trimmed_node.iterator_ != nullptr) {
308 while (trimmed_node.iterator_->Advance().ok()) {
309 search_base.insert(trimmed_node.iterator_->doc_hit_info().document_id());
310 }
311 if (search_base.empty()) {
312 // Nothing matches the previous terms in the query. There are no valid
313 // suggestions to make, we should return directly.
314 return std::vector<TermMetadata>();
315 }
316 }
317
318 // Create result checker based on given filters.
319 SuggestionResultCheckerImpl suggestion_result_checker_impl(
320 &document_store_, &schema_store_, std::move(namespace_ids),
321 std::move(document_id_filter_map), std::move(schema_type_ids),
322 std::move(property_filter_map), std::move(trimmed_node.target_section_),
323 std::move(search_base), current_time_ms);
324 // TODO(b/228240987) support generate suggestion and append suffix for advance
325 // query and function call.
326 std::string query_prefix =
327 suggestion_spec.prefix().substr(0, trimmed_node.term_start_index_);
328 // Run suggestion based on given SuggestionSpec.
329 // Normalize token text to lowercase since all tokens in the lexicon are
330 // lowercase.
331 ICING_ASSIGN_OR_RETURN(
332 std::vector<TermMetadata> terms,
333 index_.FindTermsByPrefix(
334 trimmed_node.term_, suggestion_spec.num_to_return(),
335 suggestion_spec.scoring_spec().scoring_match_type(),
336 suggestion_spec.scoring_spec().rank_by(),
337 &suggestion_result_checker_impl));
338 for (TermMetadata& term : terms) {
339 term.content = query_prefix + term.content;
340 }
341 return terms;
342 }
343
SuggestionProcessor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const LanguageSegmenter * language_segmenter,const Normalizer * normalizer,const DocumentStore * document_store,const SchemaStore * schema_store,const Clock * clock,const FeatureFlags * feature_flags)344 SuggestionProcessor::SuggestionProcessor(
345 Index* index, const NumericIndex<int64_t>* numeric_index,
346 const EmbeddingIndex* embedding_index,
347 const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
348 const DocumentStore* document_store, const SchemaStore* schema_store,
349 const Clock* clock, const FeatureFlags* feature_flags)
350 : index_(*index),
351 numeric_index_(*numeric_index),
352 embedding_index_(*embedding_index),
353 language_segmenter_(*language_segmenter),
354 normalizer_(*normalizer),
355 document_store_(*document_store),
356 schema_store_(*schema_store),
357 clock_(*clock),
358 feature_flags_(*feature_flags) {}
359
360 } // namespace lib
361 } // namespace icing
362