xref: /aosp_15_r20/external/icing/icing/icing-search-engine.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/icing-search-engine.h"
16 
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <functional>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/annotate.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/mutex.h"
34 #include "icing/absl_ports/str_cat.h"
35 #include "icing/feature-flags.h"
36 #include "icing/file/destructible-file.h"
37 #include "icing/file/file-backed-proto.h"
38 #include "icing/file/filesystem.h"
39 #include "icing/file/version-util.h"
40 #include "icing/index/data-indexing-handler.h"
41 #include "icing/index/embed/embedding-index.h"
42 #include "icing/index/embedding-indexing-handler.h"
43 #include "icing/index/hit/doc-hit-info.h"
44 #include "icing/index/index-processor.h"
45 #include "icing/index/index.h"
46 #include "icing/index/integer-section-indexing-handler.h"
47 #include "icing/index/iterator/doc-hit-info-iterator.h"
48 #include "icing/index/numeric/integer-index.h"
49 #include "icing/index/term-indexing-handler.h"
50 #include "icing/index/term-metadata.h"
51 #include "icing/jni/jni-cache.h"
52 #include "icing/join/join-children-fetcher.h"
53 #include "icing/join/join-processor.h"
54 #include "icing/join/qualified-id-join-index-impl-v2.h"
55 #include "icing/join/qualified-id-join-index-impl-v3.h"
56 #include "icing/join/qualified-id-join-index.h"
57 #include "icing/join/qualified-id-join-indexing-handler.h"
58 #include "icing/legacy/index/icing-filesystem.h"
59 #include "icing/performance-configuration.h"
60 #include "icing/portable/endian.h"
61 #include "icing/proto/blob.pb.h"
62 #include "icing/proto/debug.pb.h"
63 #include "icing/proto/document.pb.h"
64 #include "icing/proto/initialize.pb.h"
65 #include "icing/proto/internal/optimize.pb.h"
66 #include "icing/proto/logging.pb.h"
67 #include "icing/proto/optimize.pb.h"
68 #include "icing/proto/persist.pb.h"
69 #include "icing/proto/reset.pb.h"
70 #include "icing/proto/schema.pb.h"
71 #include "icing/proto/scoring.pb.h"
72 #include "icing/proto/search.pb.h"
73 #include "icing/proto/status.pb.h"
74 #include "icing/proto/storage.pb.h"
75 #include "icing/proto/term.pb.h"
76 #include "icing/proto/usage.pb.h"
77 #include "icing/query/advanced_query_parser/lexer.h"
78 #include "icing/query/query-features.h"
79 #include "icing/query/query-processor.h"
80 #include "icing/query/query-results.h"
81 #include "icing/query/suggestion-processor.h"
82 #include "icing/result/page-result.h"
83 #include "icing/result/projection-tree.h"
84 #include "icing/result/projector.h"
85 #include "icing/result/result-adjustment-info.h"
86 #include "icing/result/result-retriever-v2.h"
87 #include "icing/result/result-state-manager.h"
88 #include "icing/schema/schema-store.h"
89 #include "icing/scoring/advanced_scoring/score-expression.h"
90 #include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
91 #include "icing/scoring/scored-document-hit.h"
92 #include "icing/scoring/scored-document-hits-ranker.h"
93 #include "icing/scoring/scoring-processor.h"
94 #include "icing/store/blob-store.h"
95 #include "icing/store/document-id.h"
96 #include "icing/store/document-store.h"
97 #include "icing/tokenization/language-segmenter-factory.h"
98 #include "icing/transform/normalizer-factory.h"
99 #include "icing/util/clock.h"
100 #include "icing/util/data-loss.h"
101 #include "icing/util/logging.h"
102 #include "icing/util/status-macros.h"
103 #include "icing/util/tokenized-document.h"
104 #include "unicode/uloc.h"
105 #include <google/protobuf/repeated_field.h>
106 
107 namespace icing {
108 namespace lib {
109 
110 namespace {
111 
// Names of the subdirectories and marker files used by the engine. The
// Make*Path() helpers in this file join most of them onto the base directory
// as "<base_dir>/<name>".
constexpr std::string_view kDocumentSubfolderName = "document_dir";
constexpr std::string_view kBlobSubfolderName = "blob_dir";
constexpr std::string_view kIndexSubfolderName = "index_dir";
constexpr std::string_view kIntegerIndexSubfolderName = "integer_index_dir";
constexpr std::string_view kQualifiedIdJoinIndexSubfolderName =
    "qualified_id_join_index_dir";
constexpr std::string_view kEmbeddingIndexSubfolderName = "embedding_index_dir";
constexpr std::string_view kSchemaSubfolderName = "schema_dir";
// Marker files: see MakeSetSchemaMarkerFilePath() and MakeInitMarkerFilePath().
constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker";
constexpr std::string_view kInitMarkerFilename = "init_marker";
// NOTE(review): not referenced in the visible part of this file; presumably
// also lives under the base directory - confirm against its user.
constexpr std::string_view kOptimizeStatusFilename = "optimize_status";

// The maximum number of unsuccessful initialization attempts from the current
// state that we will tolerate before deleting all data and starting from a
// fresh state.
constexpr int kMaxUnsuccessfulInitAttempts = 5;
128 
129 // A pair that holds namespace and type.
// A pair that holds namespace and type. Used as the key of unordered
// containers together with NamespaceTypePairHasher.
struct NamespaceTypePair {
  std::string namespace_;
  std::string type;

  // Two pairs are equal only when both the namespace and the type match.
  bool operator==(const NamespaceTypePair& other) const {
    return namespace_ == other.namespace_ && type == other.type;
  }
};

// Hash functor for NamespaceTypePair.
struct NamespaceTypePairHasher {
  // Combines the hashes of both fields in an order-dependent way.
  //
  // A plain XOR of the two hashes is symmetric, so {a, b} and {b, a} would
  // always collide and every pair with namespace_ == type would hash to 0.
  // The mix below (same shape as boost::hash_combine) avoids both problems.
  std::size_t operator()(const NamespaceTypePair& pair) const {
    const std::size_t namespace_hash =
        std::hash<std::string>()(pair.namespace_);
    const std::size_t type_hash = std::hash<std::string>()(pair.type);
    return namespace_hash ^ (type_hash + 0x9e3779b9u + (namespace_hash << 6) +
                             (namespace_hash >> 2));
  }
};
145 
ValidateResultSpec(const DocumentStore * document_store,const ResultSpecProto & result_spec)146 libtextclassifier3::Status ValidateResultSpec(
147     const DocumentStore* document_store, const ResultSpecProto& result_spec) {
148   if (result_spec.num_per_page() < 0) {
149     return absl_ports::InvalidArgumentError(
150         "ResultSpecProto.num_per_page cannot be negative.");
151   }
152   if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
153     return absl_ports::InvalidArgumentError(
154         "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
155         "non-positive.");
156   }
157   if (result_spec.max_joined_children_per_parent_to_return() < 0) {
158     return absl_ports::InvalidArgumentError(
159         "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
160         "negative.");
161   }
162   if (result_spec.num_to_score() <= 0) {
163     return absl_ports::InvalidArgumentError(
164         "ResultSpecProto.num_to_score cannot be non-positive.");
165   }
166   // Validate ResultGroupings.
167   std::unordered_set<int32_t> unique_entry_ids;
168   ResultSpecProto::ResultGroupingType result_grouping_type =
169       result_spec.result_group_type();
170   for (const ResultSpecProto::ResultGrouping& result_grouping :
171        result_spec.result_groupings()) {
172     if (result_grouping.max_results() <= 0) {
173       return absl_ports::InvalidArgumentError(
174           "Cannot specify a result grouping with max results <= 0.");
175     }
176     for (const ResultSpecProto::ResultGrouping::Entry& entry :
177          result_grouping.entry_groupings()) {
178       const std::string& name_space = entry.namespace_();
179       const std::string& schema = entry.schema();
180       auto entry_id_or = document_store->GetResultGroupingEntryId(
181           result_grouping_type, name_space, schema);
182       if (!entry_id_or.ok()) {
183         continue;
184       }
185       int32_t entry_id = entry_id_or.ValueOrDie();
186       if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
187         return absl_ports::InvalidArgumentError(
188             "Entry Ids must be unique across result groups.");
189       }
190       unique_entry_ids.insert(entry_id);
191     }
192   }
193   return libtextclassifier3::Status::OK;
194 }
195 
ValidateSearchSpec(const SearchSpecProto & search_spec,const PerformanceConfiguration & configuration)196 libtextclassifier3::Status ValidateSearchSpec(
197     const SearchSpecProto& search_spec,
198     const PerformanceConfiguration& configuration) {
199   if (search_spec.query().size() > configuration.max_query_length) {
200     return absl_ports::InvalidArgumentError(
201         absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
202                            "allowed query length: ",
203                            std::to_string(configuration.max_query_length)));
204   }
205   // Check that no unknown features have been enabled in the search spec.
206   std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
207   for (const Feature feature : search_spec.enabled_features()) {
208     if (query_features_set.find(feature) == query_features_set.end()) {
209       return absl_ports::InvalidArgumentError(
210           absl_ports::StrCat("Unknown feature in "
211                              "SearchSpecProto.enabled_features: ",
212                              feature));
213     }
214   }
215   return libtextclassifier3::Status::OK;
216 }
217 
ValidateSuggestionSpec(const SuggestionSpecProto & suggestion_spec,const PerformanceConfiguration & configuration)218 libtextclassifier3::Status ValidateSuggestionSpec(
219     const SuggestionSpecProto& suggestion_spec,
220     const PerformanceConfiguration& configuration) {
221   if (suggestion_spec.prefix().empty()) {
222     return absl_ports::InvalidArgumentError(
223         absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
224   }
225   if (suggestion_spec.scoring_spec().scoring_match_type() ==
226       TermMatchType::UNKNOWN) {
227     return absl_ports::InvalidArgumentError(
228         absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
229   }
230   if (suggestion_spec.num_to_return() <= 0) {
231     return absl_ports::InvalidArgumentError(absl_ports::StrCat(
232         "SuggestionSpecProto.num_to_return must be positive."));
233   }
234   if (suggestion_spec.prefix().size() > configuration.max_query_length) {
235     return absl_ports::InvalidArgumentError(
236         absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
237                            "maximum allowed prefix length: ",
238                            std::to_string(configuration.max_query_length)));
239   }
240   return libtextclassifier3::Status::OK;
241 }
242 
ValidateScoringSpec(const ScoringSpecProto & scoring_spec)243 libtextclassifier3::Status ValidateScoringSpec(
244     const ScoringSpecProto& scoring_spec) {
245   std::unordered_set<std::string> alias_schema_types;
246   for (const SchemaTypeAliasMapProto& alias_map_proto :
247        scoring_spec.schema_type_alias_map_protos()) {
248     if (alias_map_proto.alias_schema_type().empty()) {
249       return absl_ports::InvalidArgumentError(
250           "SchemaTypeAliasMapProto contains alias_schema_type with empty "
251           "string");
252     }
253     if (alias_map_proto.schema_types().empty()) {
254       return absl_ports::InvalidArgumentError(
255           absl_ports::StrCat("SchemaTypeAliasMapProto contains empty "
256                              "schema_types for alias_schema_type: ",
257                              alias_map_proto.alias_schema_type()));
258     }
259     if (alias_schema_types.find(alias_map_proto.alias_schema_type()) !=
260         alias_schema_types.end()) {
261       return absl_ports::InvalidArgumentError(
262           absl_ports::StrCat("SchemaTypeAliasMapProto contains multiple "
263                              "entries with the same alias_schema_type: ",
264                              alias_map_proto.alias_schema_type()));
265     }
266     alias_schema_types.insert(alias_map_proto.alias_schema_type());
267   }
268   return libtextclassifier3::Status::OK;
269 }
270 
// Returns a copy of `child_scoring_spec` with every schema type alias map of
// `parent_scoring_spec` appended after the child's own alias maps. Neither
// input is modified.
ScoringSpecProto CopyParentSchemaTypeAliasMapToChild(
    const ScoringSpecProto& parent_scoring_spec,
    const ScoringSpecProto& child_scoring_spec) {
  // The child is received by const reference, so it can only be copied;
  // wrapping it in std::move would not turn this into a move.
  ScoringSpecProto new_child_scoring_spec = child_scoring_spec;
  for (const SchemaTypeAliasMapProto& alias_map_proto :
       parent_scoring_spec.schema_type_alias_map_protos()) {
    *new_child_scoring_spec.add_schema_type_alias_map_protos() =
        alias_map_proto;
  }
  return new_child_scoring_spec;
}
282 
283 libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
CreateQualifiedIdJoinIndex(const Filesystem & filesystem,std::string qualified_id_join_index_dir,const IcingSearchEngineOptions & options,const FeatureFlags & feature_flags)284 CreateQualifiedIdJoinIndex(const Filesystem& filesystem,
285                            std::string qualified_id_join_index_dir,
286                            const IcingSearchEngineOptions& options,
287                            const FeatureFlags& feature_flags) {
288   if (options.enable_qualified_id_join_index_v3_and_delete_propagate_from()) {
289     return QualifiedIdJoinIndexImplV3::Create(
290         filesystem, std::move(qualified_id_join_index_dir), feature_flags);
291   } else {
292     // V2
293     return QualifiedIdJoinIndexImplV2::Create(
294         filesystem, std::move(qualified_id_join_index_dir),
295         options.pre_mapping_fbv());
296   }
297 }
298 
299 // Document store files are in a standalone subfolder for easier file
300 // management. We can delete and recreate the subfolder and not touch/affect
301 // anything else.
MakeDocumentDirectoryPath(const std::string & base_dir)302 std::string MakeDocumentDirectoryPath(const std::string& base_dir) {
303   return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName);
304 }
305 
MakeBlobDirectoryPath(const std::string & base_dir)306 std::string MakeBlobDirectoryPath(const std::string& base_dir) {
307   return absl_ports::StrCat(base_dir, "/", kBlobSubfolderName);
308 }
309 
310 // Makes a temporary folder path for the document store which will be used
311 // during full optimization.
MakeDocumentTemporaryDirectoryPath(const std::string & base_dir)312 std::string MakeDocumentTemporaryDirectoryPath(const std::string& base_dir) {
313   return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName,
314                             "_optimize_tmp");
315 }
316 
317 // Index files are in a standalone subfolder because for easier file management.
318 // We can delete and recreate the subfolder and not touch/affect anything
319 // else.
MakeIndexDirectoryPath(const std::string & base_dir)320 std::string MakeIndexDirectoryPath(const std::string& base_dir) {
321   return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
322 }
323 
324 // Working path for integer index. Integer index is derived from
325 // PersistentStorage and it will take full ownership of this working path,
326 // including creation/deletion. See PersistentStorage for more details about
327 // working path.
MakeIntegerIndexWorkingPath(const std::string & base_dir)328 std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
329   return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
330 }
331 
332 // Working path for qualified id join index. It is derived from
333 // PersistentStorage and it will take full ownership of this working path,
334 // including creation/deletion. See PersistentStorage for more details about
335 // working path.
MakeQualifiedIdJoinIndexWorkingPath(const std::string & base_dir)336 std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
337   return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
338 }
339 
340 // Working path for embedding index.
MakeEmbeddingIndexWorkingPath(const std::string & base_dir)341 std::string MakeEmbeddingIndexWorkingPath(const std::string& base_dir) {
342   return absl_ports::StrCat(base_dir, "/", kEmbeddingIndexSubfolderName);
343 }
344 
345 // SchemaStore files are in a standalone subfolder for easier file management.
346 // We can delete and recreate the subfolder and not touch/affect anything
347 // else.
MakeSchemaDirectoryPath(const std::string & base_dir)348 std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
349   return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
350 }
351 
MakeSetSchemaMarkerFilePath(const std::string & base_dir)352 std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
353   return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
354 }
355 
MakeInitMarkerFilePath(const std::string & base_dir)356 std::string MakeInitMarkerFilePath(const std::string& base_dir) {
357   return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
358 }
359 
TransformStatus(const libtextclassifier3::Status & internal_status,StatusProto * status_proto)360 void TransformStatus(const libtextclassifier3::Status& internal_status,
361                      StatusProto* status_proto) {
362   StatusProto::Code code;
363   if (!internal_status.ok()) {
364     ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
365                        << ", Message: " << internal_status.error_message();
366   }
367   switch (internal_status.CanonicalCode()) {
368     case libtextclassifier3::StatusCode::OK:
369       code = StatusProto::OK;
370       break;
371     case libtextclassifier3::StatusCode::DATA_LOSS:
372       code = StatusProto::WARNING_DATA_LOSS;
373       break;
374     case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
375       code = StatusProto::INVALID_ARGUMENT;
376       break;
377     case libtextclassifier3::StatusCode::NOT_FOUND:
378       code = StatusProto::NOT_FOUND;
379       break;
380     case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
381       code = StatusProto::FAILED_PRECONDITION;
382       break;
383     case libtextclassifier3::StatusCode::ABORTED:
384       code = StatusProto::ABORTED;
385       break;
386     case libtextclassifier3::StatusCode::INTERNAL:
387       // TODO(b/147699081): Cleanup our internal use of INTERNAL since it
388       // doesn't match with what it *should* indicate as described in
389       // go/icing-library-apis.
390       code = StatusProto::INTERNAL;
391       break;
392     case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
393       // TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
394       // (e.g. if the document log is full). And we use RESOURCE_EXHAUSTED
395       // internally to indicate other resources are exhausted (e.g.
396       // DocHitInfos) - although none of these are exposed through the API.
397       // Consider separating the two cases out more clearly.
398       code = StatusProto::OUT_OF_SPACE;
399       break;
400     case libtextclassifier3::StatusCode::ALREADY_EXISTS:
401       code = StatusProto::ALREADY_EXISTS;
402       break;
403     case libtextclassifier3::StatusCode::CANCELLED:
404       [[fallthrough]];
405     case libtextclassifier3::StatusCode::UNKNOWN:
406       [[fallthrough]];
407     case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
408       [[fallthrough]];
409     case libtextclassifier3::StatusCode::PERMISSION_DENIED:
410       [[fallthrough]];
411     case libtextclassifier3::StatusCode::OUT_OF_RANGE:
412       [[fallthrough]];
413     case libtextclassifier3::StatusCode::UNIMPLEMENTED:
414       [[fallthrough]];
415     case libtextclassifier3::StatusCode::UNAVAILABLE:
416       [[fallthrough]];
417     case libtextclassifier3::StatusCode::UNAUTHENTICATED:
418       // Other internal status codes aren't supported externally yet. If it
419       // should be supported, add another switch-case above.
420       ICING_LOG(ERROR) << "Internal status code "
421                        << internal_status.error_code()
422                        << " not supported in the external API";
423       code = StatusProto::UNKNOWN;
424       break;
425   }
426   status_proto->set_code(code);
427   status_proto->set_message(internal_status.error_message());
428 }
429 
RetrieveAndAddDocumentInfo(const DocumentStore * document_store,DeleteByQueryResultProto & result_proto,std::unordered_map<NamespaceTypePair,DeleteByQueryResultProto::DocumentGroupInfo *,NamespaceTypePairHasher> & info_map,DocumentId document_id)430 libtextclassifier3::Status RetrieveAndAddDocumentInfo(
431     const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
432     std::unordered_map<NamespaceTypePair,
433                        DeleteByQueryResultProto::DocumentGroupInfo*,
434                        NamespaceTypePairHasher>& info_map,
435     DocumentId document_id) {
436   ICING_ASSIGN_OR_RETURN(DocumentProto document,
437                          document_store->Get(document_id));
438   NamespaceTypePair key = {document.namespace_(), document.schema()};
439   auto iter = info_map.find(key);
440   if (iter == info_map.end()) {
441     auto entry = result_proto.add_deleted_documents();
442     entry->set_namespace_(std::move(document.namespace_()));
443     entry->set_schema(std::move(document.schema()));
444     entry->add_uris(std::move(document.uri()));
445     info_map[key] = entry;
446   } else {
447     iter->second->add_uris(std::move(document.uri()));
448   }
449   return libtextclassifier3::Status::OK;
450 }
451 
ShouldRebuildIndex(const OptimizeStatsProto & optimize_stats,float optimize_rebuild_index_threshold)452 bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
453                         float optimize_rebuild_index_threshold) {
454   int num_invalid_documents = optimize_stats.num_deleted_documents() +
455                               optimize_stats.num_expired_documents();
456   return num_invalid_documents >= optimize_stats.num_original_documents() *
457                                       optimize_rebuild_index_threshold;
458 }
459 
ScoringExpressionHasRelevanceScoreFunction(std::string_view scoring_expression)460 libtextclassifier3::StatusOr<bool> ScoringExpressionHasRelevanceScoreFunction(
461     std::string_view scoring_expression) {
462   // TODO(b/261474063) The Lexer will be called again when creating the
463   // AdvancedScorer instance. Consider refactoring the code to allow the Lexer
464   // to be called only once.
465   Lexer lexer(scoring_expression, Lexer::Language::SCORING);
466   ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
467                          std::move(lexer).ExtractTokens());
468   for (const Lexer::LexerToken& token : lexer_tokens) {
469     if (token.type == Lexer::TokenType::FUNCTION_NAME &&
470         token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
471       return true;
472     }
473   }
474   return false;
475 }
476 
477 // Useful method to get RankingStrategy if advanced scoring is enabled. When the
478 // "RelevanceScore" function is used in the advanced scoring expression,
479 // RankingStrategy will be treated as RELEVANCE_SCORE in order to prepare the
480 // necessary information needed for calculating relevance score.
481 libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
GetRankingStrategyFromScoringSpec(const ScoringSpecProto & scoring_spec)482 GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
483   if (scoring_spec.advanced_scoring_expression().empty() &&
484       scoring_spec.additional_advanced_scoring_expressions().empty()) {
485     return scoring_spec.rank_by();
486   }
487 
488   ICING_ASSIGN_OR_RETURN(bool has_relevance_score_function,
489                          ScoringExpressionHasRelevanceScoreFunction(
490                              scoring_spec.advanced_scoring_expression()));
491   if (has_relevance_score_function) {
492     return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
493   }
494   for (std::string_view additional_scoring_expression :
495        scoring_spec.additional_advanced_scoring_expressions()) {
496     ICING_ASSIGN_OR_RETURN(has_relevance_score_function,
497                            ScoringExpressionHasRelevanceScoreFunction(
498                                additional_scoring_expression));
499     if (has_relevance_score_function) {
500       return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
501     }
502   }
503   return ScoringSpecProto::RankingStrategy::NONE;
504 }
505 
506 }  // namespace
507 
// Convenience constructor: delegates to the dependency-taking constructor
// with freshly created Filesystem, IcingFilesystem and Clock instances and
// forwards `jni_cache`.
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
                                     std::unique_ptr<const JniCache> jni_cache)
    : IcingSearchEngine(options, std::make_unique<Filesystem>(),
                        std::make_unique<IcingFilesystem>(),
                        std::make_unique<Clock>(), std::move(jni_cache)) {}
513 
// Stores `options` and takes ownership of the supplied filesystem, clock and
// JNI cache objects. Apart from logging the base directory, no work is done
// here.
//
// NOTE(review): feature_flags_ is built from options_ (the member, not the
// moved-from parameter), so options_ must be declared before feature_flags_
// in the class - confirm in the header.
IcingSearchEngine::IcingSearchEngine(
    IcingSearchEngineOptions options,
    std::unique_ptr<const Filesystem> filesystem,
    std::unique_ptr<const IcingFilesystem> icing_filesystem,
    std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
    : options_(std::move(options)),
      feature_flags_(options_.enable_scorable_properties(),
                     options_.enable_embedding_quantization(),
                     options_.enable_repeated_field_joins()),
      filesystem_(std::move(filesystem)),
      icing_filesystem_(std::move(icing_filesystem)),
      clock_(std::move(clock)),
      jni_cache_(std::move(jni_cache)) {
  ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
}
529 
~IcingSearchEngine()530 IcingSearchEngine::~IcingSearchEngine() {
531   if (initialized_) {
532     if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
533       ICING_LOG(ERROR)
534           << "Error persisting to disk in IcingSearchEngine destructor";
535     }
536   }
537 }
538 
Initialize()539 InitializeResultProto IcingSearchEngine::Initialize() {
540   // This method does both read and write so we need a writer lock. Using two
541   // locks (reader and writer) has the chance to be interrupted during
542   // switching.
543   absl_ports::unique_lock l(&mutex_);
544   return InternalInitialize();
545 }
546 
// Releases every component owned by the engine by resetting its owning
// pointer. The order of the statements below is significant; do not reorder
// them.
void IcingSearchEngine::ResetMembers() {
  // Reset all members in the reverse order of their initialization to ensure
  // the dependencies are not violated.
  embedding_index_.reset();
  qualified_id_join_index_.reset();
  integer_index_.reset();
  index_.reset();
  normalizer_.reset();
  language_segmenter_.reset();
  blob_store_.reset();
  result_state_manager_.reset();
  document_store_.reset();
  schema_store_.reset();
}
561 
// Tracks consecutive failed initialization attempts through the init marker
// file and wipes all data once too many have accumulated.
//
// Steps:
//   1. Reads the attempt counter from the marker file, if the file existed
//      before this call. The counter is stored in the form produced by
//      GHostToNetworkL.
//   2. If the counter exceeds kMaxUnsuccessfulInitAttempts: resets all
//      members, deletes and recreates the base directory, reopens the marker
//      file and restarts counting from zero.
//   3. Reports the number of previous failures via `initialize_stats`.
//   4. Writes the incremented counter back and syncs it to disk.
//
// Returns:
//   DATA_LOSS if the base directory was cleared in step 2
//   INTERNAL if deleting/creating the base directory or writing/syncing the
//     marker file fails
//   Otherwise the default-constructed status (presumably OK - confirm against
//     libtextclassifier3::Status)
libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
    InitializeStatsProto* initialize_stats) {
  // Check to see if the marker file exists and if we've already passed our max
  // number of init attempts.
  std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
  bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
  int network_init_attempts = 0;
  int host_init_attempts = 0;

  // Read the number of previous failed init attempts from the file. If it
  // fails, then just assume the value is zero (the most likely reason for
  // failure would be non-existence because the last init was successful
  // anyways).
  std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
      filesystem_->OpenForWrite(marker_filepath.c_str()));
  libtextclassifier3::Status status;
  if (file_exists &&
      filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
                         sizeof(network_init_attempts), /*offset=*/0)) {
    host_init_attempts = GNetworkToHostL(network_init_attempts);
    if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
      // We've tried and failed to init too many times. We need to throw
      // everything out and start from scratch.
      ResetMembers();
      // Release the marker file descriptor before its directory is deleted.
      marker_file_fd.reset();

      // Delete the entire base directory.
      if (!filesystem_->DeleteDirectoryRecursively(
              options_.base_dir().c_str())) {
        return absl_ports::InternalError("Failed to delete icing base dir!");
      }

      // Create the base directory again and reopen marker file.
      if (!filesystem_->CreateDirectoryRecursively(
              options_.base_dir().c_str())) {
        return absl_ports::InternalError("Failed to create icing base dir!");
      }

      marker_file_fd = std::make_unique<ScopedFd>(
          filesystem_->OpenForWrite(marker_filepath.c_str()));

      status = absl_ports::DataLossError(
          "Encountered failed initialization limit. Cleared all data.");
      host_init_attempts = 0;
    }
  }

  // Use network_init_attempts here because we might have set host_init_attempts
  // to 0 if it exceeded the max threshold.
  initialize_stats->set_num_previous_init_failures(
      GNetworkToHostL(network_init_attempts));

  ++host_init_attempts;
  network_init_attempts = GHostToNetworkL(host_init_attempts);
  // Write the updated number of attempts before we get started.
  if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
                           &network_init_attempts,
                           sizeof(network_init_attempts)) ||
      !filesystem_->DataSync(marker_file_fd->get())) {
    return absl_ports::InternalError(
        "Failed to write and sync init marker file");
  }

  return status;
}
627 
InternalInitialize()628 InitializeResultProto IcingSearchEngine::InternalInitialize() {
629   ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
630                 << options_.base_dir();
631 
632   // Measure the latency of the initialization process.
633   std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
634 
635   InitializeResultProto result_proto;
636   StatusProto* result_status = result_proto.mutable_status();
637   InitializeStatsProto* initialize_stats =
638       result_proto.mutable_initialize_stats();
639   if (initialized_) {
640     // Already initialized.
641     result_status->set_code(StatusProto::OK);
642     initialize_stats->set_latency_ms(
643         initialize_timer->GetElapsedMilliseconds());
644     initialize_stats->set_num_documents(document_store_->num_documents());
645     return result_proto;
646   }
647 
648   // Now go ahead and try to initialize.
649   libtextclassifier3::Status status = InitializeMembers(initialize_stats);
650   if (status.ok() || absl_ports::IsDataLoss(status)) {
651     // We successfully initialized. We should delete the init marker file to
652     // indicate a successful init.
653     std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
654     if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
655       status = absl_ports::InternalError("Failed to delete init marker file!");
656     } else {
657       initialized_ = true;
658     }
659   }
660   TransformStatus(status, result_status);
661   initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
662   return result_proto;
663 }
664 
// Initializes every member the engine needs, in dependency order: base
// directory, init marker bookkeeping, version/flag compatibility handling,
// schema store, language segmenter, normalizer, then the blob store (when
// enabled), the document store and the indices, and finally the result state
// manager.
//
// initialize_stats must be non-null; it is handed to the sub-initializers and
// receives the recovery/restoration causes chosen below.
//
// Returns early with the first error that is neither OK nor DATA_LOSS.
// Otherwise returns the CheckInitMarkerFile() status when that is not OK
// (i.e. DATA_LOSS), else the index initialization status (OK or DATA_LOSS).
InitializeMembers(InitializeStatsProto * initialize_stats)665 libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
666     InitializeStatsProto* initialize_stats) {
667   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
668   // Make sure the base directory exists
669   if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
670     return absl_ports::InternalError(absl_ports::StrCat(
671         "Could not create directory: ", options_.base_dir()));
672   }
673 
674   // Check to see if the marker file exists and if we've already passed our max
675   // number of init attempts.
676   libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
      // DATA_LOSS from the marker check is not fatal here: initialization keeps
      // going and the DATA_LOSS status is returned at the very end.
677   if (!status.ok() && !absl_ports::IsDataLoss(status)) {
678     return status;
679   }
680 
681   // Do version and flags compatibility check
682   // Read version file, determine the state change and rebuild derived files if
683   // needed.
684   const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
685   ICING_ASSIGN_OR_RETURN(
686       IcingSearchEngineVersionProto stored_version_proto,
687       version_util::ReadVersion(
688           *filesystem_, /*version_file_dir=*/options_.base_dir(), index_dir));
689   version_util::VersionInfo stored_version_info =
690       version_util::GetVersionInfoFromProto(stored_version_proto);
691   version_util::StateChange version_state_change =
692       version_util::GetVersionStateChange(stored_version_info);
693 
694   // Construct icing's current version proto based on the current code version
695   IcingSearchEngineVersionProto current_version_proto;
696   current_version_proto.set_version(version_util::kVersion);
      // max_version never decreases: keep the larger of the stored maximum and
      // the version of the running code.
697   current_version_proto.set_max_version(
698       std::max(stored_version_info.max_version, version_util::kVersion));
699   version_util::AddEnabledFeatures(options_, &current_version_proto);
700 
701   // Step 1: Perform schema migration if needed. This is a no-op if the schema
702   // is fully compatible with the current version.
      // The schema database migration is only requested when the stored version
      // requires it AND the enable_schema_database option is on.
703   bool perform_schema_database_migration =
704       version_util::SchemaDatabaseMigrationRequired(stored_version_proto) &&
705       options_.enable_schema_database();
706   ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
707       filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
708       version_state_change, version_util::kVersion,
709       perform_schema_database_migration));
710 
711   // Step 2: Discard derived files that need to be rebuilt
712   version_util::DerivedFilesRebuildResult required_derived_files_rebuild =
713       version_util::CalculateRequiredDerivedFilesRebuild(stored_version_proto,
714                                                          current_version_proto);
715   ICING_RETURN_IF_ERROR(DiscardDerivedFiles(required_derived_files_rebuild));
716 
717   // Step 3: update version files. We need to update both the V1 and V2
718   // version files.
719   ICING_RETURN_IF_ERROR(version_util::WriteV1Version(
720       *filesystem_, /*version_file_dir=*/options_.base_dir(),
721       version_util::GetVersionInfoFromProto(current_version_proto)));
      // current_version_proto is moved from here and must not be read again
      // below.
722   ICING_RETURN_IF_ERROR(version_util::WriteV2Version(
723       *filesystem_, /*version_file_dir=*/options_.base_dir(),
724       std::make_unique<IcingSearchEngineVersionProto>(
725           std::move(current_version_proto))));
726 
727   ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
728 
729   // TODO(b/156383798) : Resolve how to specify the locale.
730   language_segmenter_factory::SegmenterOptions segmenter_options(
731       ULOC_US, jni_cache_.get());
732   TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
733                                                 std::move(segmenter_options)));
734 
735   TC3_ASSIGN_OR_RETURN(normalizer_,
736                        normalizer_factory::Create(options_.max_token_length()));
737 
      // Path of the set-schema marker file (distinct from the init marker file
      // handled by CheckInitMarkerFile above).
738   std::string marker_filepath =
739       MakeSetSchemaMarkerFilePath(options_.base_dir());
740 
      // Exactly one of four paths runs below:
      //   1. no schema is stored            -> wipe and recreate everything
      //   2. set-schema marker file present -> rebuild all indices
      //   3. version state not compatible   -> force document recovery
      //   4. otherwise                      -> regular initialization
      // Each path initializes the blob store (if enabled), the document store
      // and the indices, and leaves the index result in index_init_status.
741   libtextclassifier3::Status index_init_status;
742   if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
743     // The schema was either lost or never set before. Wipe out the doc store
744     // and index directories and initialize them from scratch.
745     const std::string doc_store_dir =
746         MakeDocumentDirectoryPath(options_.base_dir());
747     const std::string integer_index_dir =
748         MakeIntegerIndexWorkingPath(options_.base_dir());
749     const std::string qualified_id_join_index_dir =
750         MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
751     const std::string embedding_index_dir =
752         MakeEmbeddingIndexWorkingPath(options_.base_dir());
753     const std::string blob_store_dir =
754         MakeBlobDirectoryPath(options_.base_dir());
755 
        // The deletions short-circuit: the first failure stops the chain and a
        // single INTERNAL error naming all directories is returned.
756     if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
757         !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
758         !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
759         !QualifiedIdJoinIndex::Discard(*filesystem_,
760                                        qualified_id_join_index_dir)
761              .ok() ||
762         !EmbeddingIndex::Discard(*filesystem_, embedding_index_dir).ok() ||
763         !filesystem_->DeleteDirectoryRecursively(blob_store_dir.c_str())) {
764       return absl_ports::InternalError(absl_ports::StrCat(
765           "Could not delete directories: ", index_dir, ", ", integer_index_dir,
766           ", ", qualified_id_join_index_dir, ", ", embedding_index_dir, ", ",
767           blob_store_dir, " and ", doc_store_dir));
768     }
        // Recreate the stores on top of the directories that were just removed.
769     if (options_.enable_blob_store()) {
770       ICING_RETURN_IF_ERROR(
771           InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
772                               options_.blob_store_compression_level()));
773     }
774     ICING_ASSIGN_OR_RETURN(
775         bool document_store_derived_files_regenerated,
776         InitializeDocumentStore(
777             /*force_recovery_and_revalidate_documents=*/false,
778             initialize_stats));
779     index_init_status = InitializeIndex(
780         document_store_derived_files_regenerated, initialize_stats);
781     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
782       return index_init_status;
783     }
784   } else if (filesystem_->FileExists(marker_filepath.c_str())) {
785     // If the marker file is still around then something wonky happened when we
786     // last tried to set the schema.
787     //
788     // Since we're going to rebuild all indices in this case, the return value
789     // of InitializeDocumentStore (document_store_derived_files_regenerated) is
790     // unused.
791     if (options_.enable_blob_store()) {
792       ICING_RETURN_IF_ERROR(
793           InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
794                               options_.blob_store_compression_level()));
795     }
796     ICING_RETURN_IF_ERROR(InitializeDocumentStore(
797         /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
798 
799     // We're going to need to build the index from scratch. So just delete its
800     // directory now.
801     // Discard index directory and instantiate a new one.
802     Index::Options index_options(index_dir, options_.index_merge_size(),
803                                  /*lite_index_sort_at_indexing=*/true,
804                                  options_.lite_index_sort_size());
805     if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
806         !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
807       return absl_ports::InternalError(
808           absl_ports::StrCat("Could not recreate directory: ", index_dir));
809     }
810     ICING_ASSIGN_OR_RETURN(index_,
811                            Index::Create(index_options, filesystem_.get(),
812                                          icing_filesystem_.get()));
813 
814     // Discard integer index directory and instantiate a new one.
815     std::string integer_index_dir =
816         MakeIntegerIndexWorkingPath(options_.base_dir());
817     ICING_RETURN_IF_ERROR(
818         IntegerIndex::Discard(*filesystem_, integer_index_dir));
819     ICING_ASSIGN_OR_RETURN(
820         integer_index_,
821         IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
822                              options_.integer_index_bucket_split_threshold(),
823                              options_.pre_mapping_fbv()));
824 
825     // Discard qualified id join index directory and instantiate a new one.
826     std::string qualified_id_join_index_dir =
827         MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
828     ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
829         *filesystem_, qualified_id_join_index_dir));
830     ICING_ASSIGN_OR_RETURN(
831         qualified_id_join_index_,
832         CreateQualifiedIdJoinIndex(*filesystem_,
833                                    std::move(qualified_id_join_index_dir),
834                                    options_, feature_flags_));
835 
836     // Discard embedding index directory and instantiate a new one.
837     std::string embedding_index_dir =
838         MakeEmbeddingIndexWorkingPath(options_.base_dir());
839     ICING_RETURN_IF_ERROR(
840         EmbeddingIndex::Discard(*filesystem_, embedding_index_dir));
841     ICING_ASSIGN_OR_RETURN(
842         embedding_index_,
843         EmbeddingIndex::Create(filesystem_.get(), embedding_index_dir,
844                                clock_.get(), &feature_flags_));
845 
        // All four indices are freshly created at this point; time how long it
        // takes to repopulate them.
846     std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
847     IndexRestorationResult restore_result = RestoreIndexIfNeeded();
848     index_init_status = std::move(restore_result.status);
849     // DATA_LOSS means that we have successfully initialized and re-added
850     // content to the index. Some indexed content was lost, but otherwise the
851     // index is in a valid state and can be queried.
852     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
853       return index_init_status;
854     }
855 
856     // Delete the marker file to indicate that everything is now in sync with
857     // whatever changes were made to the schema.
        // NOTE(review): the DeleteFile() result is ignored here, so a failed
        // deletion leaves the marker on disk without any error being reported -
        // confirm that this best-effort behavior is intended.
858     filesystem_->DeleteFile(marker_filepath.c_str());
859 
860     initialize_stats->set_index_restoration_latency_ms(
861         restore_timer->GetElapsedMilliseconds());
862     initialize_stats->set_index_restoration_cause(
863         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
864     initialize_stats->set_integer_index_restoration_cause(
865         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
866     initialize_stats->set_qualified_id_join_index_restoration_cause(
867         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
868     initialize_stats->set_embedding_index_restoration_cause(
869         InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
870   } else if (version_state_change != version_util::StateChange::kCompatible) {
        // The stored version state is not compatible with the running code:
        // force document recovery/revalidation and attribute every recovery
        // and restoration cause to VERSION_CHANGED.
871     if (options_.enable_blob_store()) {
872       ICING_RETURN_IF_ERROR(
873           InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
874                               options_.blob_store_compression_level()));
875     }
876     ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated,
877                            InitializeDocumentStore(
878                                /*force_recovery_and_revalidate_documents=*/true,
879                                initialize_stats));
880     index_init_status = InitializeIndex(
881         document_store_derived_files_regenerated, initialize_stats);
882     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
883       return index_init_status;
884     }
885 
886     initialize_stats->set_schema_store_recovery_cause(
887         InitializeStatsProto::VERSION_CHANGED);
888     initialize_stats->set_document_store_recovery_cause(
889         InitializeStatsProto::VERSION_CHANGED);
890     initialize_stats->set_index_restoration_cause(
891         InitializeStatsProto::VERSION_CHANGED);
892     initialize_stats->set_integer_index_restoration_cause(
893         InitializeStatsProto::VERSION_CHANGED);
894     initialize_stats->set_qualified_id_join_index_restoration_cause(
895         InitializeStatsProto::VERSION_CHANGED);
896     initialize_stats->set_embedding_index_restoration_cause(
897         InitializeStatsProto::VERSION_CHANGED);
898   } else {
        // Regular path: a schema exists, no set-schema marker file is present
        // and the version state is compatible, so documents are not forcibly
        // recovered.
899     if (options_.enable_blob_store()) {
900       ICING_RETURN_IF_ERROR(
901           InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
902                               options_.blob_store_compression_level()));
903     }
904     ICING_ASSIGN_OR_RETURN(
905         bool document_store_derived_files_regenerated,
906         InitializeDocumentStore(
907             /*force_recovery_and_revalidate_documents=*/false,
908             initialize_stats));
909     index_init_status = InitializeIndex(
910         document_store_derived_files_regenerated, initialize_stats);
911     if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
912       return index_init_status;
913     }
914 
915     // Set recovery cause to FEATURE_FLAG_CHANGED according to the calculated
916     // required_derived_files_rebuild
        // These assignments run after the initializers above, so they replace
        // any cause those calls wrote to the same stats fields.
917     if (required_derived_files_rebuild
918             .needs_document_store_derived_files_rebuild) {
919       initialize_stats->set_document_store_recovery_cause(
920           InitializeStatsProto::FEATURE_FLAG_CHANGED);
921     }
922     if (required_derived_files_rebuild
923             .needs_schema_store_derived_files_rebuild) {
924       initialize_stats->set_schema_store_recovery_cause(
925           InitializeStatsProto::FEATURE_FLAG_CHANGED);
926     }
927     if (required_derived_files_rebuild.needs_term_index_rebuild) {
928       initialize_stats->set_index_restoration_cause(
929           InitializeStatsProto::FEATURE_FLAG_CHANGED);
930     }
931     if (required_derived_files_rebuild.needs_integer_index_rebuild) {
932       initialize_stats->set_integer_index_restoration_cause(
933           InitializeStatsProto::FEATURE_FLAG_CHANGED);
934     }
935     if (required_derived_files_rebuild.needs_qualified_id_join_index_rebuild) {
936       initialize_stats->set_qualified_id_join_index_restoration_cause(
937           InitializeStatsProto::FEATURE_FLAG_CHANGED);
938     }
939     if (required_derived_files_rebuild.needs_embedding_index_rebuild) {
940       initialize_stats->set_embedding_index_restoration_cause(
941           InitializeStatsProto::FEATURE_FLAG_CHANGED);
942     }
943   }
944 
      // A DATA_LOSS reported by the init marker check takes precedence;
      // otherwise surface the index initialization result (OK or DATA_LOSS).
945   if (status.ok()) {
946     status = index_init_status;
947   }
948 
      // Built last: it is constructed from *document_store_, which every path
      // above has initialized by the time execution reaches this point.
949   result_state_manager_ = std::make_unique<ResultStateManager>(
950       performance_configuration_.max_num_total_hits, *document_store_);
951 
952   return status;
953 }
954 
InitializeSchemaStore(InitializeStatsProto * initialize_stats)955 libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
956     InitializeStatsProto* initialize_stats) {
957   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
958 
959   const std::string schema_store_dir =
960       MakeSchemaDirectoryPath(options_.base_dir());
961   // Make sure the sub-directory exists
962   if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
963     return absl_ports::InternalError(
964         absl_ports::StrCat("Could not create directory: ", schema_store_dir));
965   }
966   ICING_ASSIGN_OR_RETURN(
967       schema_store_,
968       SchemaStore::Create(filesystem_.get(), schema_store_dir, clock_.get(),
969                           &feature_flags_, options_.enable_schema_database(),
970                           initialize_stats));
971 
972   return libtextclassifier3::Status::OK;
973 }
974 
InitializeDocumentStore(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)975 libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore(
976     bool force_recovery_and_revalidate_documents,
977     InitializeStatsProto* initialize_stats) {
978   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
979 
980   const std::string document_dir =
981       MakeDocumentDirectoryPath(options_.base_dir());
982   // Make sure the sub-directory exists
983   if (!filesystem_->CreateDirectoryRecursively(document_dir.c_str())) {
984     return absl_ports::InternalError(
985         absl_ports::StrCat("Could not create directory: ", document_dir));
986   }
987   ICING_ASSIGN_OR_RETURN(
988       DocumentStore::CreateResult create_result,
989       DocumentStore::Create(
990           filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
991           &feature_flags_, force_recovery_and_revalidate_documents,
992           /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
993           options_.compression_level(), initialize_stats));
994   document_store_ = std::move(create_result.document_store);
995   return create_result.derived_files_regenerated;
996 }
997 
InitializeBlobStore(int32_t orphan_blob_time_to_live_ms,int32_t blob_store_compression_level)998 libtextclassifier3::Status IcingSearchEngine::InitializeBlobStore(
999     int32_t orphan_blob_time_to_live_ms, int32_t blob_store_compression_level) {
1000   std::string blob_dir = MakeBlobDirectoryPath(options_.base_dir());
1001   // Make sure the sub-directory exists
1002   if (!filesystem_->CreateDirectoryRecursively(blob_dir.c_str())) {
1003     return absl_ports::InternalError(
1004         absl_ports::StrCat("Could not create directory: ", blob_dir));
1005   }
1006 
1007   ICING_ASSIGN_OR_RETURN(
1008       auto blob_store_or,
1009       BlobStore::Create(filesystem_.get(), blob_dir, clock_.get(),
1010                         orphan_blob_time_to_live_ms,
1011                         blob_store_compression_level));
1012   blob_store_ = std::make_unique<BlobStore>(std::move(blob_store_or));
1013   return libtextclassifier3::Status::OK;
1014 }
1015 
InitializeIndex(bool document_store_derived_files_regenerated,InitializeStatsProto * initialize_stats)1016 libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
1017     bool document_store_derived_files_regenerated,
1018     InitializeStatsProto* initialize_stats) {
1019   ICING_RETURN_ERROR_IF_NULL(initialize_stats);
1020 
1021   const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
1022   // Make sure the sub-directory exists
1023   if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
1024     return absl_ports::InternalError(
1025         absl_ports::StrCat("Could not create directory: ", index_dir));
1026   }
1027   Index::Options index_options(index_dir, options_.index_merge_size(),
1028                                /*lite_index_sort_at_indexing=*/true,
1029                                options_.lite_index_sort_size());
1030 
1031   // Term index
1032   InitializeStatsProto::RecoveryCause index_recovery_cause;
1033   auto index_or =
1034       Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
1035   if (!index_or.ok()) {
1036     if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
1037         !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
1038       return absl_ports::InternalError(
1039           absl_ports::StrCat("Could not recreate directory: ", index_dir));
1040     }
1041 
1042     index_recovery_cause = InitializeStatsProto::IO_ERROR;
1043 
1044     // Try recreating it from scratch and re-indexing everything.
1045     ICING_ASSIGN_OR_RETURN(index_,
1046                            Index::Create(index_options, filesystem_.get(),
1047                                          icing_filesystem_.get()));
1048   } else {
1049     // Index was created fine.
1050     index_ = std::move(index_or).ValueOrDie();
1051     // If a recover does have to happen, then it must be because the index is
1052     // out of sync with the document store.
1053     index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1054   }
1055 
1056   // Integer index
1057   std::string integer_index_dir =
1058       MakeIntegerIndexWorkingPath(options_.base_dir());
1059   InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
1060   auto integer_index_or =
1061       IntegerIndex::Create(*filesystem_, integer_index_dir,
1062                            options_.integer_index_bucket_split_threshold(),
1063                            options_.pre_mapping_fbv());
1064   if (!integer_index_or.ok()) {
1065     ICING_RETURN_IF_ERROR(
1066         IntegerIndex::Discard(*filesystem_, integer_index_dir));
1067 
1068     integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1069 
1070     // Try recreating it from scratch and re-indexing everything.
1071     ICING_ASSIGN_OR_RETURN(
1072         integer_index_,
1073         IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
1074                              options_.integer_index_bucket_split_threshold(),
1075                              options_.pre_mapping_fbv()));
1076   } else {
1077     // Integer index was created fine.
1078     integer_index_ = std::move(integer_index_or).ValueOrDie();
1079     // If a recover does have to happen, then it must be because the index is
1080     // out of sync with the document store.
1081     integer_index_recovery_cause =
1082         InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1083   }
1084 
1085   // Qualified id join index
1086   std::string qualified_id_join_index_dir =
1087       MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
1088   InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
1089   if (document_store_derived_files_regenerated &&
1090       !options_.enable_qualified_id_join_index_v3_and_delete_propagate_from()) {
1091     // V2 qualified id join index depends on document store derived files, so we
1092     // have to rebuild it from scratch if
1093     // document_store_derived_files_regenerated is true.
1094     ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1095         *filesystem_, qualified_id_join_index_dir));
1096 
1097     ICING_ASSIGN_OR_RETURN(
1098         qualified_id_join_index_,
1099         CreateQualifiedIdJoinIndex(*filesystem_,
1100                                    std::move(qualified_id_join_index_dir),
1101                                    options_, feature_flags_));
1102 
1103     qualified_id_join_index_recovery_cause =
1104         InitializeStatsProto::DEPENDENCIES_CHANGED;
1105   } else {
1106     auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex(
1107         *filesystem_, qualified_id_join_index_dir, options_, feature_flags_);
1108     if (!qualified_id_join_index_or.ok()) {
1109       ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1110           *filesystem_, qualified_id_join_index_dir));
1111 
1112       qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1113 
1114       // Try recreating it from scratch and rebuild everything.
1115       ICING_ASSIGN_OR_RETURN(
1116           qualified_id_join_index_,
1117           CreateQualifiedIdJoinIndex(*filesystem_,
1118                                      std::move(qualified_id_join_index_dir),
1119                                      options_, feature_flags_));
1120     } else {
1121       // Qualified id join index was created fine.
1122       qualified_id_join_index_ =
1123           std::move(qualified_id_join_index_or).ValueOrDie();
1124       // If a recover does have to happen, then it must be because the index is
1125       // out of sync with the document store.
1126       qualified_id_join_index_recovery_cause =
1127           InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1128     }
1129   }
1130 
1131   // Embedding index
1132   const std::string embedding_dir =
1133       MakeEmbeddingIndexWorkingPath(options_.base_dir());
1134   InitializeStatsProto::RecoveryCause embedding_index_recovery_cause;
1135   auto embedding_index_or = EmbeddingIndex::Create(
1136       filesystem_.get(), embedding_dir, clock_.get(), &feature_flags_);
1137   if (!embedding_index_or.ok()) {
1138     ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(*filesystem_, embedding_dir));
1139 
1140     embedding_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1141 
1142     // Try recreating it from scratch and re-indexing everything.
1143     ICING_ASSIGN_OR_RETURN(
1144         embedding_index_,
1145         EmbeddingIndex::Create(filesystem_.get(), embedding_dir, clock_.get(),
1146                                &feature_flags_));
1147   } else {
1148     // Embedding index was created fine.
1149     embedding_index_ = std::move(embedding_index_or).ValueOrDie();
1150     // If a recover does have to happen, then it must be because the index is
1151     // out of sync with the document store.
1152     embedding_index_recovery_cause =
1153         InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1154   }
1155 
1156   std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
1157   IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1158   if (restore_result.index_needed_restoration ||
1159       restore_result.integer_index_needed_restoration ||
1160       restore_result.qualified_id_join_index_needed_restoration) {
1161     initialize_stats->set_index_restoration_latency_ms(
1162         restore_timer->GetElapsedMilliseconds());
1163 
1164     if (restore_result.index_needed_restoration) {
1165       initialize_stats->set_index_restoration_cause(index_recovery_cause);
1166     }
1167     if (restore_result.integer_index_needed_restoration) {
1168       initialize_stats->set_integer_index_restoration_cause(
1169           integer_index_recovery_cause);
1170     }
1171     if (restore_result.qualified_id_join_index_needed_restoration) {
1172       initialize_stats->set_qualified_id_join_index_restoration_cause(
1173           qualified_id_join_index_recovery_cause);
1174     }
1175     if (restore_result.embedding_index_needed_restoration) {
1176       initialize_stats->set_embedding_index_restoration_cause(
1177           embedding_index_recovery_cause);
1178     }
1179   }
1180   return restore_result.status;
1181 }
1182 
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents)1183 SetSchemaResultProto IcingSearchEngine::SetSchema(
1184     const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
1185   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
1186 }
1187 
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)1188 SetSchemaResultProto IcingSearchEngine::SetSchema(
1189     SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
1190   ICING_VLOG(1) << "Setting new Schema";
1191 
1192   SetSchemaResultProto result_proto;
1193   StatusProto* result_status = result_proto.mutable_status();
1194 
1195   absl_ports::unique_lock l(&mutex_);
1196   ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
1197     result_proto.set_latency_ms(t);
1198   });
1199   if (!initialized_) {
1200     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1201     result_status->set_message("IcingSearchEngine has not been initialized!");
1202     return result_proto;
1203   }
1204 
1205   auto lost_previous_schema_or = LostPreviousSchema();
1206   if (!lost_previous_schema_or.ok()) {
1207     TransformStatus(lost_previous_schema_or.status(), result_status);
1208     return result_proto;
1209   }
1210   bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
1211 
1212   std::string marker_filepath =
1213       MakeSetSchemaMarkerFilePath(options_.base_dir());
1214   // Create the marker file indicating that we are going to apply a schema
1215   // change. No need to write anything to the marker file - its existence is the
1216   // only thing that matters. The marker file is used to indicate if we
1217   // encountered a crash or a power loss while updating the schema and other
1218   // files. So set it up to be deleted as long as we return from this function.
1219   DestructibleFile marker_file(marker_filepath, filesystem_.get());
1220 
1221   auto set_schema_result_or = schema_store_->SetSchema(
1222       std::move(new_schema), ignore_errors_and_delete_documents,
1223       options_.allow_circular_schema_definitions());
1224   if (!set_schema_result_or.ok()) {
1225     TransformStatus(set_schema_result_or.status(), result_status);
1226     return result_proto;
1227   }
1228   SchemaStore::SetSchemaResult set_schema_result =
1229       std::move(set_schema_result_or).ValueOrDie();
1230 
1231   for (const std::string& deleted_type :
1232        set_schema_result.schema_types_deleted_by_name) {
1233     result_proto.add_deleted_schema_types(deleted_type);
1234   }
1235 
1236   for (const std::string& incompatible_type :
1237        set_schema_result.schema_types_incompatible_by_name) {
1238     result_proto.add_incompatible_schema_types(incompatible_type);
1239   }
1240 
1241   for (const std::string& new_type :
1242        set_schema_result.schema_types_new_by_name) {
1243     result_proto.add_new_schema_types(std::move(new_type));
1244   }
1245 
1246   for (const std::string& compatible_type :
1247        set_schema_result.schema_types_changed_fully_compatible_by_name) {
1248     result_proto.add_fully_compatible_changed_schema_types(
1249         std::move(compatible_type));
1250   }
1251 
1252   bool index_incompatible =
1253       !set_schema_result.schema_types_index_incompatible_by_name.empty();
1254   for (const std::string& index_incompatible_type :
1255        set_schema_result.schema_types_index_incompatible_by_name) {
1256     result_proto.add_index_incompatible_changed_schema_types(
1257         std::move(index_incompatible_type));
1258   }
1259 
1260   // Join index is incompatible and needs rebuild if:
1261   // - Any schema type is join incompatible.
1262   // - OR existing schema type id assignment has changed, since join index
1263   //   stores schema type id (+ joinable property path) as a key to group join
1264   //   data.
1265   bool join_incompatible =
1266       !set_schema_result.schema_types_join_incompatible_by_name.empty() ||
1267       !set_schema_result.old_schema_type_ids_changed.empty();
1268   for (const std::string& join_incompatible_type :
1269        set_schema_result.schema_types_join_incompatible_by_name) {
1270     result_proto.add_join_incompatible_changed_schema_types(
1271         std::move(join_incompatible_type));
1272   }
1273 
1274   libtextclassifier3::Status status;
1275   if (set_schema_result.success) {
1276     if (lost_previous_schema) {
1277       // No previous schema to calculate a diff against. We have to go through
1278       // and revalidate all the Documents in the DocumentStore
1279       status = document_store_->UpdateSchemaStore(schema_store_.get());
1280       if (!status.ok()) {
1281         TransformStatus(status, result_status);
1282         return result_proto;
1283       }
1284     } else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
1285                !set_schema_result.schema_types_incompatible_by_id.empty() ||
1286                !set_schema_result.schema_types_deleted_by_id.empty()) {
1287       status = document_store_->OptimizedUpdateSchemaStore(schema_store_.get(),
1288                                                            set_schema_result);
1289       if (!status.ok()) {
1290         TransformStatus(status, result_status);
1291         return result_proto;
1292       }
1293     }
1294 
1295     if (lost_previous_schema || index_incompatible) {
1296       // Clears search indices
1297       status = ClearSearchIndices();
1298       if (!status.ok()) {
1299         TransformStatus(status, result_status);
1300         return result_proto;
1301       }
1302     }
1303 
1304     if (lost_previous_schema || join_incompatible) {
1305       // Clears join indices
1306       status = ClearJoinIndices();
1307       if (!status.ok()) {
1308         TransformStatus(status, result_status);
1309         return result_proto;
1310       }
1311     }
1312 
1313     if (lost_previous_schema || index_incompatible || join_incompatible) {
1314       IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1315       // DATA_LOSS means that we have successfully re-added content to the
1316       // index. Some indexed content was lost, but otherwise the index is in a
1317       // valid state and can be queried.
1318       if (!restore_result.status.ok() &&
1319           !absl_ports::IsDataLoss(restore_result.status)) {
1320         TransformStatus(status, result_status);
1321         return result_proto;
1322       }
1323     }
1324 
1325     if (feature_flags_.enable_scorable_properties()) {
1326       if (!set_schema_result.schema_types_scorable_property_inconsistent_by_id
1327                .empty()) {
1328         status = document_store_->RegenerateScorablePropertyCache(
1329             set_schema_result
1330                 .schema_types_scorable_property_inconsistent_by_id);
1331         if (!status.ok()) {
1332           TransformStatus(status, result_status);
1333           return result_proto;
1334         }
1335       }
1336     }
1337 
1338     result_status->set_code(StatusProto::OK);
1339   } else {
1340     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1341     result_status->set_message("Schema is incompatible.");
1342   }
1343 
1344   return result_proto;
1345 }
1346 
GetSchema()1347 GetSchemaResultProto IcingSearchEngine::GetSchema() {
1348   GetSchemaResultProto result_proto;
1349   StatusProto* result_status = result_proto.mutable_status();
1350 
1351   absl_ports::shared_lock l(&mutex_);
1352   if (!initialized_) {
1353     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1354     result_status->set_message("IcingSearchEngine has not been initialized!");
1355     return result_proto;
1356   }
1357 
1358   auto schema_or = schema_store_->GetSchema();
1359   if (!schema_or.ok()) {
1360     TransformStatus(schema_or.status(), result_status);
1361     return result_proto;
1362   }
1363 
1364   result_status->set_code(StatusProto::OK);
1365   *result_proto.mutable_schema() = *std::move(schema_or).ValueOrDie();
1366   return result_proto;
1367 }
1368 
GetSchema(std::string_view database)1369 GetSchemaResultProto IcingSearchEngine::GetSchema(std::string_view database) {
1370   GetSchemaResultProto result_proto;
1371   StatusProto* result_status = result_proto.mutable_status();
1372 
1373   absl_ports::shared_lock l(&mutex_);
1374   if (!initialized_) {
1375     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1376     result_status->set_message("IcingSearchEngine has not been initialized!");
1377     return result_proto;
1378   }
1379 
1380   libtextclassifier3::StatusOr<SchemaProto> schema =
1381       schema_store_->GetSchema(std::string(database));
1382   if (!schema.ok()) {
1383     TransformStatus(schema.status(), result_status);
1384     return result_proto;
1385   }
1386 
1387   result_status->set_code(StatusProto::OK);
1388   *result_proto.mutable_schema() = std::move(schema).ValueOrDie();
1389   return result_proto;
1390 }
1391 
GetSchemaType(std::string_view schema_type)1392 GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
1393     std::string_view schema_type) {
1394   GetSchemaTypeResultProto result_proto;
1395   StatusProto* result_status = result_proto.mutable_status();
1396 
1397   absl_ports::shared_lock l(&mutex_);
1398   if (!initialized_) {
1399     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1400     result_status->set_message("IcingSearchEngine has not been initialized!");
1401     return result_proto;
1402   }
1403 
1404   auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
1405   if (!type_config_or.ok()) {
1406     TransformStatus(type_config_or.status(), result_status);
1407     return result_proto;
1408   }
1409 
1410   result_status->set_code(StatusProto::OK);
1411   *result_proto.mutable_schema_type_config() = *(type_config_or.ValueOrDie());
1412   return result_proto;
1413 }
1414 
Put(const DocumentProto & document)1415 PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
1416   return Put(DocumentProto(document));
1417 }
1418 
Put(DocumentProto && document)1419 PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
1420   ICING_VLOG(1) << "Writing document to document store";
1421 
1422   PutResultProto result_proto;
1423   StatusProto* result_status = result_proto.mutable_status();
1424   PutDocumentStatsProto* put_document_stats =
1425       result_proto.mutable_put_document_stats();
1426   ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
1427     put_document_stats->set_latency_ms(t);
1428   });
1429 
1430   // Lock must be acquired before validation because the DocumentStore uses
1431   // the schema file to validate, and the schema could be changed in
1432   // SetSchema() which is protected by the same mutex.
1433   absl_ports::unique_lock l(&mutex_);
1434   if (!initialized_) {
1435     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1436     result_status->set_message("IcingSearchEngine has not been initialized!");
1437     return result_proto;
1438   }
1439 
1440   auto tokenized_document_or = TokenizedDocument::Create(
1441       schema_store_.get(), language_segmenter_.get(), std::move(document));
1442   if (!tokenized_document_or.ok()) {
1443     TransformStatus(tokenized_document_or.status(), result_status);
1444     return result_proto;
1445   }
1446   TokenizedDocument tokenized_document(
1447       std::move(tokenized_document_or).ValueOrDie());
1448 
1449   auto put_result_or = document_store_->Put(
1450       tokenized_document.document(), tokenized_document.num_string_tokens(),
1451       put_document_stats);
1452   if (!put_result_or.ok()) {
1453     TransformStatus(put_result_or.status(), result_status);
1454     return result_proto;
1455   }
1456   DocumentId old_document_id = put_result_or.ValueOrDie().old_document_id;
1457   DocumentId document_id = put_result_or.ValueOrDie().new_document_id;
1458   result_proto.set_was_replacement(
1459       put_result_or.ValueOrDie().was_replacement());
1460 
1461   auto data_indexing_handlers_or = CreateDataIndexingHandlers();
1462   if (!data_indexing_handlers_or.ok()) {
1463     TransformStatus(data_indexing_handlers_or.status(), result_status);
1464     return result_proto;
1465   }
1466   IndexProcessor index_processor(
1467       std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
1468 
1469   auto index_status = index_processor.IndexDocument(
1470       tokenized_document, document_id, old_document_id, put_document_stats);
1471   // Getting an internal error from the index could possibly mean that the index
1472   // is broken. Try to rebuild them to recover.
1473   if (absl_ports::IsInternal(index_status)) {
1474     ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
1475                         "rebuild the index!\n"
1476                      << index_status.error_message();
1477     index_status = ClearAllIndices();
1478     if (index_status.ok()) {
1479       index_status = RestoreIndexIfNeeded().status;
1480       if (!index_status.ok()) {
1481         ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
1482                             "indexing a document.";
1483       }
1484     } else {
1485       ICING_LOG(ERROR)
1486           << "Failed to clear indices after a failure of indexing a document.";
1487     }
1488   }
1489 
1490   if (!index_status.ok()) {
1491     // If we encountered a failure or cannot resolve an internal error while
1492     // indexing this document, then mark it as deleted.
1493     int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1494     libtextclassifier3::Status delete_status =
1495         document_store_->Delete(document_id, current_time_ms);
1496     if (!delete_status.ok()) {
1497       // This is pretty dire (and, hopefully, unlikely). We can't roll back the
1498       // document that we just added. Wipeout the whole index.
1499       ICING_LOG(ERROR) << "Cannot delete the document that is failed to index. "
1500                           "Wiping out the whole Icing search engine.";
1501       ResetInternal();
1502     }
1503   }
1504 
1505   TransformStatus(index_status, result_status);
1506   return result_proto;
1507 }
1508 
Get(const std::string_view name_space,const std::string_view uri,const GetResultSpecProto & result_spec)1509 GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
1510                                       const std::string_view uri,
1511                                       const GetResultSpecProto& result_spec) {
1512   GetResultProto result_proto;
1513   StatusProto* result_status = result_proto.mutable_status();
1514 
1515   absl_ports::shared_lock l(&mutex_);
1516   if (!initialized_) {
1517     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1518     result_status->set_message("IcingSearchEngine has not been initialized!");
1519     return result_proto;
1520   }
1521 
1522   auto document_or = document_store_->Get(name_space, uri);
1523   if (!document_or.ok()) {
1524     TransformStatus(document_or.status(), result_status);
1525     return result_proto;
1526   }
1527 
1528   DocumentProto document = std::move(document_or).ValueOrDie();
1529   std::unique_ptr<ProjectionTree> type_projection_tree;
1530   std::unique_ptr<ProjectionTree> wildcard_projection_tree;
1531   for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
1532        schema_store_->ExpandTypePropertyMasks(
1533            result_spec.type_property_masks())) {
1534     if (type_field_mask.schema_type == document.schema()) {
1535       type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
1536     } else if (type_field_mask.schema_type ==
1537                SchemaStore::kSchemaTypeWildcard) {
1538       wildcard_projection_tree =
1539           std::make_unique<ProjectionTree>(type_field_mask);
1540     }
1541   }
1542 
1543   // Apply projection
1544   if (type_projection_tree != nullptr) {
1545     projector::Project(type_projection_tree->root().children, &document);
1546   } else if (wildcard_projection_tree != nullptr) {
1547     projector::Project(wildcard_projection_tree->root().children, &document);
1548   }
1549 
1550   result_status->set_code(StatusProto::OK);
1551   *result_proto.mutable_document() = std::move(document);
1552   return result_proto;
1553 }
1554 
ReportUsage(const UsageReport & usage_report)1555 ReportUsageResultProto IcingSearchEngine::ReportUsage(
1556     const UsageReport& usage_report) {
1557   ReportUsageResultProto result_proto;
1558   StatusProto* result_status = result_proto.mutable_status();
1559 
1560   absl_ports::unique_lock l(&mutex_);
1561   if (!initialized_) {
1562     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1563     result_status->set_message("IcingSearchEngine has not been initialized!");
1564     return result_proto;
1565   }
1566 
1567   libtextclassifier3::Status status =
1568       document_store_->ReportUsage(usage_report);
1569   TransformStatus(status, result_status);
1570   return result_proto;
1571 }
1572 
GetAllNamespaces()1573 GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
1574   GetAllNamespacesResultProto result_proto;
1575   StatusProto* result_status = result_proto.mutable_status();
1576 
1577   absl_ports::shared_lock l(&mutex_);
1578   if (!initialized_) {
1579     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1580     result_status->set_message("IcingSearchEngine has not been initialized!");
1581     return result_proto;
1582   }
1583 
1584   std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
1585 
1586   for (const std::string& namespace_ : namespaces) {
1587     result_proto.add_namespaces(namespace_);
1588   }
1589 
1590   result_status->set_code(StatusProto::OK);
1591   return result_proto;
1592 }
1593 
Delete(const std::string_view name_space,const std::string_view uri)1594 DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
1595                                             const std::string_view uri) {
1596   ICING_VLOG(1) << "Deleting document from doc store";
1597 
1598   DeleteResultProto result_proto;
1599   StatusProto* result_status = result_proto.mutable_status();
1600 
1601   absl_ports::unique_lock l(&mutex_);
1602   if (!initialized_) {
1603     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1604     result_status->set_message("IcingSearchEngine has not been initialized!");
1605     return result_proto;
1606   }
1607 
1608   DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
1609   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
1610 
1611   libtextclassifier3::Status status;
1612   libtextclassifier3::Status propagate_delete_status;
1613   int num_documents_deleted = 0;
1614   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1615 
1616   libtextclassifier3::StatusOr<DocumentId> document_id_or =
1617       document_store_->GetDocumentId(name_space, uri);
1618   if (!document_id_or.ok()) {
1619     status = std::move(document_id_or).status();
1620   } else {
1621     DocumentId document_id = document_id_or.ValueOrDie();
1622 
1623     // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1624     // that can support error logging.
1625     int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1626     status = document_store_->Delete(document_id, current_time_ms);
1627     if (status.ok()) {
1628       ++num_documents_deleted;
1629     }
1630 
1631     // It is possible that the document has expired and the delete operation
1632     // fails with NOT_FOUND_ERROR. In this case, we should still propagate the
1633     // delete operation, regardless of the outcome of the delete operation.
1634     libtextclassifier3::StatusOr<int> propagated_child_docs_deleted_or =
1635         PropagateDelete(/*deleted_document_ids=*/{document_id},
1636                         current_time_ms);
1637     if (propagated_child_docs_deleted_or.ok()) {
1638       num_documents_deleted += propagated_child_docs_deleted_or.ValueOrDie();
1639     } else {
1640       propagate_delete_status =
1641           std::move(propagated_child_docs_deleted_or).status();
1642     }
1643   }
1644   delete_stats->set_num_documents_deleted(num_documents_deleted);
1645   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1646 
1647   if (!status.ok()) {
1648     LogSeverity::Code severity = ERROR;
1649     if (absl_ports::IsNotFound(status)) {
1650       severity = DBG;
1651     }
1652     ICING_LOG(severity) << status.error_message()
1653                         << ". Failed to delete Document. namespace: "
1654                         << name_space << ", uri: " << uri;
1655     TransformStatus(status, result_status);
1656     return result_proto;
1657   }
1658 
1659   if (!propagate_delete_status.ok()) {
1660     ICING_LOG(ERROR) << propagate_delete_status.error_message()
1661                      << ". Failed to propagate delete for document. namespace: "
1662                      << name_space << ", uri: " << uri;
1663     TransformStatus(propagate_delete_status, result_status);
1664     return result_proto;
1665   }
1666 
1667   result_status->set_code(StatusProto::OK);
1668   return result_proto;
1669 }
1670 
DeleteByNamespace(const std::string_view name_space)1671 DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
1672     const std::string_view name_space) {
1673   ICING_VLOG(1) << "Deleting namespace from doc store";
1674 
1675   DeleteByNamespaceResultProto delete_result;
1676   StatusProto* result_status = delete_result.mutable_status();
1677   absl_ports::unique_lock l(&mutex_);
1678   if (!initialized_) {
1679     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1680     result_status->set_message("IcingSearchEngine has not been initialized!");
1681     return delete_result;
1682   }
1683 
1684   DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1685   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
1686 
1687   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1688   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1689   // that can support error logging.
1690   DocumentStore::DeleteByGroupResult doc_store_result =
1691       document_store_->DeleteByNamespace(name_space);
1692   if (!doc_store_result.status.ok()) {
1693     ICING_LOG(ERROR) << doc_store_result.status.error_message()
1694                      << "Failed to delete Namespace: " << name_space;
1695     TransformStatus(doc_store_result.status, result_status);
1696     return delete_result;
1697   }
1698 
1699   result_status->set_code(StatusProto::OK);
1700   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1701   delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1702   return delete_result;
1703 }
1704 
DeleteBySchemaType(const std::string_view schema_type)1705 DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
1706     const std::string_view schema_type) {
1707   ICING_VLOG(1) << "Deleting type from doc store";
1708 
1709   DeleteBySchemaTypeResultProto delete_result;
1710   StatusProto* result_status = delete_result.mutable_status();
1711   absl_ports::unique_lock l(&mutex_);
1712   if (!initialized_) {
1713     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1714     result_status->set_message("IcingSearchEngine has not been initialized!");
1715     return delete_result;
1716   }
1717 
1718   DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1719   delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
1720 
1721   std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1722   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1723   // that can support error logging.
1724   DocumentStore::DeleteByGroupResult doc_store_result =
1725       document_store_->DeleteBySchemaType(schema_type);
1726   if (!doc_store_result.status.ok()) {
1727     ICING_LOG(ERROR) << doc_store_result.status.error_message()
1728                      << "Failed to delete SchemaType: " << schema_type;
1729     TransformStatus(doc_store_result.status, result_status);
1730     return delete_result;
1731   }
1732 
1733   result_status->set_code(StatusProto::OK);
1734   delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1735   delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1736   return delete_result;
1737 }
1738 
DeleteByQuery(const SearchSpecProto & search_spec,bool return_deleted_document_info)1739 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
1740     const SearchSpecProto& search_spec, bool return_deleted_document_info) {
1741   ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
1742                 << " from doc store";
1743 
1744   DeleteByQueryResultProto result_proto;
1745   StatusProto* result_status = result_proto.mutable_status();
1746 
1747   absl_ports::unique_lock l(&mutex_);
1748   if (!initialized_) {
1749     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1750     result_status->set_message("IcingSearchEngine has not been initialized!");
1751     return result_proto;
1752   }
1753 
1754   DeleteByQueryStatsProto* delete_stats =
1755       result_proto.mutable_delete_by_query_stats();
1756   delete_stats->set_query_length(search_spec.query().length());
1757   delete_stats->set_num_namespaces_filtered(
1758       search_spec.namespace_filters_size());
1759   delete_stats->set_num_schema_types_filtered(
1760       search_spec.schema_type_filters_size());
1761 
1762   ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
1763     delete_stats->set_latency_ms(t);
1764   });
1765   libtextclassifier3::Status status =
1766       ValidateSearchSpec(search_spec, performance_configuration_);
1767   if (!status.ok()) {
1768     TransformStatus(status, result_status);
1769     return result_proto;
1770   }
1771 
1772   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
1773   // Gets unordered results from query processor
1774   auto query_processor_or = QueryProcessor::Create(
1775       index_.get(), integer_index_.get(), embedding_index_.get(),
1776       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
1777       schema_store_.get(), /*join_children_fetcher=*/nullptr, clock_.get(),
1778       &feature_flags_);
1779   if (!query_processor_or.ok()) {
1780     TransformStatus(query_processor_or.status(), result_status);
1781     delete_stats->set_parse_query_latency_ms(
1782         component_timer->GetElapsedMilliseconds());
1783     return result_proto;
1784   }
1785   std::unique_ptr<QueryProcessor> query_processor =
1786       std::move(query_processor_or).ValueOrDie();
1787 
1788   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1789   auto query_results_or = query_processor->ParseSearch(
1790       search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
1791   if (!query_results_or.ok()) {
1792     TransformStatus(query_results_or.status(), result_status);
1793     delete_stats->set_parse_query_latency_ms(
1794         component_timer->GetElapsedMilliseconds());
1795     return result_proto;
1796   }
1797   QueryResults query_results = std::move(query_results_or).ValueOrDie();
1798   delete_stats->set_parse_query_latency_ms(
1799       component_timer->GetElapsedMilliseconds());
1800 
1801   ICING_VLOG(2) << "Deleting the docs that matched the query.";
1802   int num_deleted = 0;
1803   // A map used to group deleted documents.
1804   // From the (namespace, type) pair to a list of uris.
1805   std::unordered_map<NamespaceTypePair,
1806                      DeleteByQueryResultProto::DocumentGroupInfo*,
1807                      NamespaceTypePairHasher>
1808       deleted_info_map;
1809 
1810   component_timer = clock_->GetNewTimer();
1811   while (query_results.root_iterator->Advance().ok()) {
1812     ICING_VLOG(3) << "Deleting doc "
1813                   << query_results.root_iterator->doc_hit_info().document_id();
1814     ++num_deleted;
1815     if (return_deleted_document_info) {
1816       status = RetrieveAndAddDocumentInfo(
1817           document_store_.get(), result_proto, deleted_info_map,
1818           query_results.root_iterator->doc_hit_info().document_id());
1819       if (!status.ok()) {
1820         TransformStatus(status, result_status);
1821         delete_stats->set_document_removal_latency_ms(
1822             component_timer->GetElapsedMilliseconds());
1823         return result_proto;
1824       }
1825     }
1826     status = document_store_->Delete(
1827         query_results.root_iterator->doc_hit_info().document_id(),
1828         current_time_ms);
1829     if (!status.ok()) {
1830       TransformStatus(status, result_status);
1831       delete_stats->set_document_removal_latency_ms(
1832           component_timer->GetElapsedMilliseconds());
1833       return result_proto;
1834     }
1835   }
1836   delete_stats->set_document_removal_latency_ms(
1837       component_timer->GetElapsedMilliseconds());
1838   int term_count = 0;
1839   for (const auto& section_and_terms : query_results.query_terms) {
1840     term_count += section_and_terms.second.size();
1841   }
1842   delete_stats->set_num_terms(term_count);
1843 
1844   if (num_deleted > 0) {
1845     result_proto.mutable_status()->set_code(StatusProto::OK);
1846   } else {
1847     result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
1848     result_proto.mutable_status()->set_message(
1849         "No documents matched the query to delete by!");
1850   }
1851   delete_stats->set_num_documents_deleted(num_deleted);
1852   return result_proto;
1853 }
1854 
PropagateDelete(const std::unordered_set<DocumentId> & deleted_document_ids,int64_t current_time_ms)1855 libtextclassifier3::StatusOr<int> IcingSearchEngine::PropagateDelete(
1856     const std::unordered_set<DocumentId>& deleted_document_ids,
1857     int64_t current_time_ms) {
1858   int propagated_child_docs_deleted = 0;
1859 
1860   if (!options_.enable_qualified_id_join_index_v3_and_delete_propagate_from() ||
1861       qualified_id_join_index_->version() !=
1862           QualifiedIdJoinIndex::Version::kV3) {
1863     // No-op if delete propagation is disabled or the join index is not v3.
1864     return propagated_child_docs_deleted;
1865   }
1866 
1867   // Create join processor to get propagated child documents to delete.
1868   JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
1869                                qualified_id_join_index_.get(), current_time_ms);
1870   ICING_ASSIGN_OR_RETURN(
1871       std::unordered_set<DocumentId> child_docs_to_delete,
1872       join_processor.GetPropagatedChildDocumentsToDelete(deleted_document_ids));
1873 
1874   // Delete all propagated child documents.
1875   for (DocumentId child_doc_id : child_docs_to_delete) {
1876     auto status = document_store_->Delete(child_doc_id, current_time_ms);
1877     if (!status.ok()) {
1878       if (absl_ports::IsNotFound(status)) {
1879         // The child document has already been deleted or expired, so skip the
1880         // error.
1881         continue;
1882       }
1883 
1884       // Real error.
1885       return status;
1886     }
1887     ++propagated_child_docs_deleted;
1888   }
1889 
1890   return propagated_child_docs_deleted;
1891 }
1892 
PersistToDisk(PersistType::Code persist_type)1893 PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
1894     PersistType::Code persist_type) {
1895   ICING_VLOG(1) << "Persisting data to disk";
1896 
1897   PersistToDiskResultProto result_proto;
1898   StatusProto* result_status = result_proto.mutable_status();
1899 
1900   absl_ports::unique_lock l(&mutex_);
1901   if (!initialized_) {
1902     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1903     result_status->set_message("IcingSearchEngine has not been initialized!");
1904     return result_proto;
1905   }
1906 
1907   auto status = InternalPersistToDisk(persist_type);
1908   TransformStatus(status, result_status);
1909   return result_proto;
1910 }
1911 
1912 // Optimizes Icing's storage
1913 //
1914 // Steps:
1915 // 1. Flush data to disk.
1916 // 2. Copy data needed to a tmp directory.
1917 // 3. Swap current directory and tmp directory.
Optimize()1918 OptimizeResultProto IcingSearchEngine::Optimize() {
1919   ICING_VLOG(1) << "Optimizing icing storage";
1920 
1921   OptimizeResultProto result_proto;
1922   StatusProto* result_status = result_proto.mutable_status();
1923 
1924   absl_ports::unique_lock l(&mutex_);
1925   if (!initialized_) {
1926     result_status->set_code(StatusProto::FAILED_PRECONDITION);
1927     result_status->set_message("IcingSearchEngine has not been initialized!");
1928     return result_proto;
1929   }
1930 
1931   OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
1932   ScopedTimer optimize_timer(
1933       clock_->GetNewTimer(),
1934       [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
1935 
1936   // Flushes data to disk before doing optimization
1937   auto status = InternalPersistToDisk(PersistType::FULL);
1938   if (!status.ok()) {
1939     TransformStatus(status, result_status);
1940     return result_proto;
1941   }
1942 
1943   int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1944   optimize_stats->set_storage_size_before(
1945       Filesystem::SanitizeFileSize(before_size));
1946 
1947   // Get all expired blob handles
1948   std::unordered_set<std::string> potentially_optimizable_blob_handles;
1949   if (blob_store_ != nullptr) {
1950     potentially_optimizable_blob_handles =
1951         blob_store_->GetPotentiallyOptimizableBlobHandles();
1952   }
1953 
1954   // TODO(b/143646633): figure out if we need to optimize index and doc store
1955   // at the same time.
1956   std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
1957   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
1958       optimize_result_or = OptimizeDocumentStore(
1959           std::move(potentially_optimizable_blob_handles), optimize_stats);
1960   optimize_stats->set_document_store_optimize_latency_ms(
1961       optimize_doc_store_timer->GetElapsedMilliseconds());
1962 
1963   if (!optimize_result_or.ok() &&
1964       !absl_ports::IsDataLoss(optimize_result_or.status())) {
1965     // The status now is either ABORTED_ERROR or INTERNAL_ERROR.
1966     // If ABORTED_ERROR, Icing should still be working.
1967     // If INTERNAL_ERROR, we're having IO errors or other errors that we can't
1968     // recover from.
1969     TransformStatus(optimize_result_or.status(), result_status);
1970     return result_proto;
1971   }
1972 
1973   libtextclassifier3::Status doc_store_optimize_result_status =
1974       optimize_result_or.status();
1975   if (blob_store_ != nullptr && doc_store_optimize_result_status.ok()) {
1976     // optimize blob store
1977     libtextclassifier3::Status blob_store_optimize_status =
1978         blob_store_->Optimize(
1979             optimize_result_or.ValueOrDie().dead_blob_handles);
1980     if (!blob_store_optimize_status.ok()) {
1981       TransformStatus(status, result_status);
1982       return result_proto;
1983     }
1984   }
1985 
1986   // The status is either OK or DATA_LOSS. The optimized document store is
1987   // guaranteed to work, so we update index according to the new document store.
1988   std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
1989   bool should_rebuild_index =
1990       !optimize_result_or.ok() ||
1991       optimize_result_or.ValueOrDie().should_rebuild_index ||
1992       ShouldRebuildIndex(*optimize_stats,
1993                          options_.optimize_rebuild_index_threshold());
1994   if (!should_rebuild_index) {
1995     // At this point should_rebuild_index is false, so it means
1996     // optimize_result_or.ok() is true and therefore it is safe to call
1997     // ValueOrDie.
1998     DocumentStore::OptimizeResult optimize_result =
1999         std::move(optimize_result_or).ValueOrDie();
2000 
2001     optimize_stats->set_index_restoration_mode(
2002         OptimizeStatsProto::INDEX_TRANSLATION);
2003     libtextclassifier3::Status index_optimize_status =
2004         index_->Optimize(optimize_result.document_id_old_to_new,
2005                          document_store_->last_added_document_id());
2006     if (!index_optimize_status.ok()) {
2007       ICING_LOG(WARNING) << "Failed to optimize index. Error: "
2008                          << index_optimize_status.error_message();
2009       should_rebuild_index = true;
2010     }
2011 
2012     libtextclassifier3::Status integer_index_optimize_status =
2013         integer_index_->Optimize(optimize_result.document_id_old_to_new,
2014                                  document_store_->last_added_document_id());
2015     if (!integer_index_optimize_status.ok()) {
2016       ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
2017                          << integer_index_optimize_status.error_message();
2018       should_rebuild_index = true;
2019     }
2020 
2021     libtextclassifier3::Status qualified_id_join_index_optimize_status =
2022         qualified_id_join_index_->Optimize(
2023             optimize_result.document_id_old_to_new,
2024             optimize_result.namespace_id_old_to_new,
2025             document_store_->last_added_document_id());
2026     if (!qualified_id_join_index_optimize_status.ok()) {
2027       ICING_LOG(WARNING)
2028           << "Failed to optimize qualified id join index. Error: "
2029           << qualified_id_join_index_optimize_status.error_message();
2030       should_rebuild_index = true;
2031     }
2032 
2033     libtextclassifier3::Status embedding_index_optimize_status =
2034         embedding_index_->Optimize(document_store_.get(), schema_store_.get(),
2035                                    optimize_result.document_id_old_to_new,
2036                                    document_store_->last_added_document_id());
2037     if (!embedding_index_optimize_status.ok()) {
2038       ICING_LOG(WARNING) << "Failed to optimize embedding index. Error: "
2039                          << embedding_index_optimize_status.error_message();
2040       should_rebuild_index = true;
2041     }
2042   }
2043   // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
2044   // valid document store, but it might be the old one or the new one. So throw
2045   // out the index data and rebuild from scratch.
2046   // Also rebuild index if DocumentStore::OptimizeInto hints to do so.
2047   // Likewise, if Index::Optimize failed, then attempt to recover the index by
2048   // rebuilding from scratch.
2049   // If ShouldRebuildIndex() returns true, we will also rebuild the index for
2050   // better performance.
2051   if (should_rebuild_index) {
2052     optimize_stats->set_index_restoration_mode(
2053         OptimizeStatsProto::FULL_INDEX_REBUILD);
2054     ICING_LOG(WARNING) << "Clearing the entire index!";
2055 
2056     libtextclassifier3::Status index_clear_status = ClearAllIndices();
2057     if (!index_clear_status.ok()) {
2058       status = absl_ports::Annotate(
2059           absl_ports::InternalError("Failed to clear index."),
2060           index_clear_status.error_message());
2061       TransformStatus(status, result_status);
2062       optimize_stats->set_index_restoration_latency_ms(
2063           optimize_index_timer->GetElapsedMilliseconds());
2064       return result_proto;
2065     }
2066 
2067     IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
2068     // DATA_LOSS means that we have successfully re-added content to the index.
2069     // Some indexed content was lost, but otherwise the index is in a valid
2070     // state and can be queried.
2071     if (!index_restoration_status.status.ok() &&
2072         !absl_ports::IsDataLoss(index_restoration_status.status)) {
2073       status = absl_ports::Annotate(
2074           absl_ports::InternalError(
2075               "Failed to reindex documents after optimization."),
2076           index_restoration_status.status.error_message());
2077 
2078       TransformStatus(status, result_status);
2079       optimize_stats->set_index_restoration_latency_ms(
2080           optimize_index_timer->GetElapsedMilliseconds());
2081       return result_proto;
2082     }
2083   }
2084   optimize_stats->set_index_restoration_latency_ms(
2085       optimize_index_timer->GetElapsedMilliseconds());
2086 
2087   // Read the optimize status to get the time that we last ran.
2088   std::string optimize_status_filename =
2089       absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
2090   FileBackedProto<OptimizeStatusProto> optimize_status_file(
2091       *filesystem_, optimize_status_filename);
2092   auto optimize_status_or = optimize_status_file.Read();
2093   int64_t current_time = clock_->GetSystemTimeMilliseconds();
2094   if (optimize_status_or.ok()) {
2095     // If we have trouble reading the status or this is the first time that
2096     // we've ever run, don't set this field.
2097     optimize_stats->set_time_since_last_optimize_ms(
2098         current_time - optimize_status_or.ValueOrDie()
2099                            ->last_successful_optimize_run_time_ms());
2100   }
2101 
2102   // Update the status for this run and write it.
2103   auto optimize_status = std::make_unique<OptimizeStatusProto>();
2104   optimize_status->set_last_successful_optimize_run_time_ms(current_time);
2105   auto write_status = optimize_status_file.Write(std::move(optimize_status));
2106   if (!write_status.ok()) {
2107     ICING_LOG(ERROR) << "Failed to write optimize status:\n"
2108                      << write_status.error_message();
2109   }
2110 
2111   // Flushes data to disk after doing optimization
2112   status = InternalPersistToDisk(PersistType::FULL);
2113   if (!status.ok()) {
2114     TransformStatus(status, result_status);
2115     return result_proto;
2116   }
2117 
2118   int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
2119   optimize_stats->set_storage_size_after(
2120       Filesystem::SanitizeFileSize(after_size));
2121 
2122   TransformStatus(doc_store_optimize_result_status, result_status);
2123   return result_proto;
2124 }
2125 
GetOptimizeInfo()2126 GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
2127   ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
2128 
2129   GetOptimizeInfoResultProto result_proto;
2130   StatusProto* result_status = result_proto.mutable_status();
2131 
2132   absl_ports::shared_lock l(&mutex_);
2133   if (!initialized_) {
2134     result_status->set_code(StatusProto::FAILED_PRECONDITION);
2135     result_status->set_message("IcingSearchEngine has not been initialized!");
2136     return result_proto;
2137   }
2138 
2139   // Read the optimize status to get the time that we last ran.
2140   std::string optimize_status_filename =
2141       absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
2142   FileBackedProto<OptimizeStatusProto> optimize_status_file(
2143       *filesystem_, optimize_status_filename);
2144   auto optimize_status_or = optimize_status_file.Read();
2145   int64_t current_time = clock_->GetSystemTimeMilliseconds();
2146 
2147   if (optimize_status_or.ok()) {
2148     // If we have trouble reading the status or this is the first time that
2149     // we've ever run, don't set this field.
2150     result_proto.set_time_since_last_optimize_ms(
2151         current_time - optimize_status_or.ValueOrDie()
2152                            ->last_successful_optimize_run_time_ms());
2153   }
2154 
2155   // Get stats from DocumentStore
2156   auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
2157   if (!doc_store_optimize_info_or.ok()) {
2158     TransformStatus(doc_store_optimize_info_or.status(), result_status);
2159     return result_proto;
2160   }
2161   DocumentStore::OptimizeInfo doc_store_optimize_info =
2162       doc_store_optimize_info_or.ValueOrDie();
2163   result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
2164 
2165   if (doc_store_optimize_info.optimizable_docs == 0) {
2166     // Can return early since there's nothing to calculate on the index side
2167     result_proto.set_estimated_optimizable_bytes(0);
2168     result_status->set_code(StatusProto::OK);
2169     return result_proto;
2170   }
2171 
2172   // Get stats from Index.
2173   auto index_elements_size_or = index_->GetElementsSize();
2174   if (!index_elements_size_or.ok()) {
2175     TransformStatus(index_elements_size_or.status(), result_status);
2176     return result_proto;
2177   }
2178   int64_t index_elements_size = index_elements_size_or.ValueOrDie();
2179   // TODO(b/273591938): add stats for blob store
2180   // TODO(b/259744228): add stats for integer index
2181 
2182   // Sum up the optimizable sizes from DocumentStore and Index
2183   result_proto.set_estimated_optimizable_bytes(
2184       index_elements_size * doc_store_optimize_info.optimizable_docs /
2185           doc_store_optimize_info.total_docs +
2186       doc_store_optimize_info.estimated_optimizable_bytes);
2187 
2188   result_status->set_code(StatusProto::OK);
2189   return result_proto;
2190 }
2191 
GetStorageInfo()2192 StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
2193   StorageInfoResultProto result;
2194   absl_ports::shared_lock l(&mutex_);
2195   if (!initialized_) {
2196     result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
2197     result.mutable_status()->set_message(
2198         "IcingSearchEngine has not been initialized!");
2199     return result;
2200   }
2201 
2202   int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
2203   result.mutable_storage_info()->set_total_storage_size(
2204       Filesystem::SanitizeFileSize(index_size));
2205   *result.mutable_storage_info()->mutable_document_storage_info() =
2206       document_store_->GetStorageInfo();
2207   *result.mutable_storage_info()->mutable_schema_store_storage_info() =
2208       schema_store_->GetStorageInfo();
2209   *result.mutable_storage_info()->mutable_index_storage_info() =
2210       index_->GetStorageInfo();
2211   if (blob_store_ != nullptr) {
2212     auto namespace_blob_storage_infos_or = blob_store_->GetStorageInfo();
2213     if (!namespace_blob_storage_infos_or.ok()) {
2214       result.mutable_status()->set_code(StatusProto::INTERNAL);
2215       result.mutable_status()->set_message(
2216           namespace_blob_storage_infos_or.status().error_message());
2217       return result;
2218     }
2219     std::vector<NamespaceBlobStorageInfoProto> namespace_blob_storage_infos =
2220         std::move(namespace_blob_storage_infos_or).ValueOrDie();
2221 
2222     for (NamespaceBlobStorageInfoProto& namespace_blob_storage_info :
2223          namespace_blob_storage_infos) {
2224       *result.mutable_storage_info()
2225            ->mutable_namespace_blob_storage_info()
2226            ->Add() = std::move(namespace_blob_storage_info);
2227     }
2228   }
2229   // TODO(b/259744228): add stats for integer index
2230   result.mutable_status()->set_code(StatusProto::OK);
2231   return result;
2232 }
2233 
GetDebugInfo(DebugInfoVerbosity::Code verbosity)2234 DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
2235     DebugInfoVerbosity::Code verbosity) {
2236   DebugInfoResultProto debug_info;
2237   StatusProto* result_status = debug_info.mutable_status();
2238   absl_ports::shared_lock l(&mutex_);
2239   if (!initialized_) {
2240     debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
2241     debug_info.mutable_status()->set_message(
2242         "IcingSearchEngine has not been initialized!");
2243     return debug_info;
2244   }
2245 
2246   // Index
2247   *debug_info.mutable_debug_info()->mutable_index_info() =
2248       index_->GetDebugInfo(verbosity);
2249 
2250   // TODO(b/259744228): add debug info for integer index
2251 
2252   // Document Store
2253   libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
2254       document_store_->GetDebugInfo(verbosity);
2255   if (!document_debug_info.ok()) {
2256     TransformStatus(document_debug_info.status(), result_status);
2257     return debug_info;
2258   }
2259   *debug_info.mutable_debug_info()->mutable_document_info() =
2260       std::move(document_debug_info).ValueOrDie();
2261 
2262   // Schema Store
2263   libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
2264       schema_store_->GetDebugInfo();
2265   if (!schema_debug_info.ok()) {
2266     TransformStatus(schema_debug_info.status(), result_status);
2267     return debug_info;
2268   }
2269   *debug_info.mutable_debug_info()->mutable_schema_info() =
2270       std::move(schema_debug_info).ValueOrDie();
2271 
2272   result_status->set_code(StatusProto::OK);
2273   return debug_info;
2274 }
2275 
InternalPersistToDisk(PersistType::Code persist_type)2276 libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
2277     PersistType::Code persist_type) {
2278   if (blob_store_ != nullptr) {
2279     // For all valid PersistTypes, we persist the ground truth. The ground truth
2280     // in the blob_store is a proto log file, which is need to be called when
2281     // persist_type is LITE.
2282     ICING_RETURN_IF_ERROR(blob_store_->PersistToDisk());
2283   }
2284   ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(persist_type));
2285   if (persist_type == PersistType::RECOVERY_PROOF) {
2286     // Persist RECOVERY_PROOF will persist the ground truth and then update all
2287     // checksums. There is no need to call document_store_->UpdateChecksum()
2288     // because PersistToDisk(RECOVERY_PROOF) will update the checksum anyways.
2289     ICING_RETURN_IF_ERROR(schema_store_->UpdateChecksum());
2290     index_->UpdateChecksum();
2291     ICING_RETURN_IF_ERROR(integer_index_->UpdateChecksums());
2292     ICING_RETURN_IF_ERROR(qualified_id_join_index_->UpdateChecksums());
2293     ICING_RETURN_IF_ERROR(embedding_index_->UpdateChecksums());
2294   } else if (persist_type == PersistType::FULL) {
2295     ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
2296     ICING_RETURN_IF_ERROR(index_->PersistToDisk());
2297     ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
2298     ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
2299     ICING_RETURN_IF_ERROR(embedding_index_->PersistToDisk());
2300   }
2301 
2302   return libtextclassifier3::Status::OK;
2303 }
2304 
Search(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2305 SearchResultProto IcingSearchEngine::Search(
2306     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2307     const ResultSpecProto& result_spec) {
2308   if (search_spec.use_read_only_search()) {
2309     return SearchLockedShared(search_spec, scoring_spec, result_spec);
2310   } else {
2311     return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
2312   }
2313 }
2314 
SearchLockedShared(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2315 SearchResultProto IcingSearchEngine::SearchLockedShared(
2316     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2317     const ResultSpecProto& result_spec) {
2318   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2319 
2320   // Only acquire an overall read-lock for this implementation. Finer-grained
2321   // locks are implemented around code paths that write changes to Icing's data
2322   // members.
2323   absl_ports::shared_lock l(&mutex_);
2324   int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2325 
2326   SearchResultProto result_proto =
2327       InternalSearch(search_spec, scoring_spec, result_spec);
2328 
2329   result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2330       lock_acquisition_latency);
2331   result_proto.mutable_query_stats()->set_latency_ms(
2332       overall_timer->GetElapsedMilliseconds());
2333   return result_proto;
2334 }
2335 
SearchLockedExclusive(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2336 SearchResultProto IcingSearchEngine::SearchLockedExclusive(
2337     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2338     const ResultSpecProto& result_spec) {
2339   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2340 
2341   // Acquire the overall write-lock for this locked implementation.
2342   absl_ports::unique_lock l(&mutex_);
2343   int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2344 
2345   SearchResultProto result_proto =
2346       InternalSearch(search_spec, scoring_spec, result_spec);
2347 
2348   result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2349       lock_acquisition_latency);
2350   result_proto.mutable_query_stats()->set_latency_ms(
2351       overall_timer->GetElapsedMilliseconds());
2352   return result_proto;
2353 }
2354 
InternalSearch(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2355 SearchResultProto IcingSearchEngine::InternalSearch(
2356     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2357     const ResultSpecProto& result_spec) {
2358   SearchResultProto result_proto;
2359   StatusProto* result_status = result_proto.mutable_status();
2360 
2361   QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2362   query_stats->set_is_first_page(true);
2363   query_stats->set_requested_page_size(result_spec.num_per_page());
2364 
2365   // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2366   query_stats->set_num_namespaces_filtered(
2367       search_spec.namespace_filters_size());
2368   query_stats->set_num_schema_types_filtered(
2369       search_spec.schema_type_filters_size());
2370   query_stats->set_query_length(search_spec.query().length());
2371   query_stats->set_ranking_strategy(scoring_spec.rank_by());
2372 
2373   if (!initialized_) {
2374     result_status->set_code(StatusProto::FAILED_PRECONDITION);
2375     result_status->set_message("IcingSearchEngine has not been initialized!");
2376     return result_proto;
2377   }
2378   index_->PublishQueryStats(query_stats);
2379 
2380   libtextclassifier3::Status status =
2381       ValidateResultSpec(document_store_.get(), result_spec);
2382   if (!status.ok()) {
2383     TransformStatus(status, result_status);
2384     return result_proto;
2385   }
2386   status = ValidateSearchSpec(search_spec, performance_configuration_);
2387   if (!status.ok()) {
2388     TransformStatus(status, result_status);
2389     return result_proto;
2390   }
2391   status = ValidateScoringSpec(scoring_spec);
2392   if (!status.ok()) {
2393     TransformStatus(status, result_status);
2394     return result_proto;
2395   }
2396 
2397   const JoinSpecProto& join_spec = search_spec.join_spec();
2398   std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
2399   std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
2400   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2401   if (!join_spec.parent_property_expression().empty() &&
2402       !join_spec.child_property_expression().empty()) {
2403     query_stats->set_is_join_query(true);
2404     QueryStatsProto::SearchStats* child_search_stats =
2405         query_stats->mutable_child_search_stats();
2406 
2407     // Build a child scoring spec by copying the parent schema type alias map.
2408     // Note that this function will transfer the ownership of data from
2409     // search_spec.join_spec().nested_spec().scoring_spec() to the
2410     // child_scoring_spec.
2411     // Hence, that following functions should NOT access
2412     // search_spec.join_spec().nested_spec().scoring_spec() after this function
2413     // call, but using child_scoring_spec instead.
2414     //
2415     // TODO(b/379288742): Avoid making the copy of the parent schema type alias
2416     // map.
2417     ScoringSpecProto child_scoring_spec = CopyParentSchemaTypeAliasMapToChild(
2418         scoring_spec, search_spec.join_spec().nested_spec().scoring_spec());
2419 
2420     // Process child query
2421     // TODO(b/372541905): Validate the child search spec.
2422     QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
2423         join_spec.nested_spec().search_spec(), child_scoring_spec,
2424         join_spec.nested_spec().result_spec(),
2425         /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats);
2426     if (!nested_query_scoring_results.status.ok()) {
2427       TransformStatus(nested_query_scoring_results.status, result_status);
2428       return result_proto;
2429     }
2430 
2431     JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2432                                  qualified_id_join_index_.get(),
2433                                  current_time_ms);
2434     // Building a JoinChildrenFetcher for looking up child documents by parent
2435     // document id.
2436     libtextclassifier3::StatusOr<std::unique_ptr<JoinChildrenFetcher>>
2437         join_children_fetcher_or = join_processor.GetChildrenFetcher(
2438             search_spec.join_spec(),
2439             std::move(nested_query_scoring_results.scored_document_hits));
2440     if (!join_children_fetcher_or.ok()) {
2441       TransformStatus(join_children_fetcher_or.status(), result_status);
2442       return result_proto;
2443     }
2444     join_children_fetcher = std::move(join_children_fetcher_or).ValueOrDie();
2445 
2446     // Assign child's ResultAdjustmentInfo.
2447     child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2448         join_spec.nested_spec().search_spec(), child_scoring_spec,
2449         join_spec.nested_spec().result_spec(), schema_store_.get(),
2450         std::move(nested_query_scoring_results.query_terms));
2451   }
2452 
2453   // Process parent query
2454   QueryStatsProto::SearchStats* parent_search_stats =
2455       query_stats->mutable_parent_search_stats();
2456   QueryScoringResults query_scoring_results = ProcessQueryAndScore(
2457       search_spec, scoring_spec, result_spec, join_children_fetcher.get(),
2458       current_time_ms, parent_search_stats);
2459   // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2460   query_stats->set_num_terms(parent_search_stats->num_terms());
2461   query_stats->set_parse_query_latency_ms(
2462       parent_search_stats->parse_query_latency_ms());
2463   query_stats->set_scoring_latency_ms(
2464       parent_search_stats->scoring_latency_ms());
2465   query_stats->set_num_documents_scored(
2466       parent_search_stats->num_documents_scored());
2467   if (!query_scoring_results.status.ok()) {
2468     TransformStatus(query_scoring_results.status, result_status);
2469     return result_proto;
2470   }
2471 
2472   // Returns early for empty result
2473   if (query_scoring_results.scored_document_hits.empty()) {
2474     result_status->set_code(StatusProto::OK);
2475     return result_proto;
2476   }
2477 
2478   // Construct parent's result adjustment info.
2479   auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2480       search_spec, scoring_spec, result_spec, schema_store_.get(),
2481       std::move(query_scoring_results.query_terms));
2482 
2483   std::unique_ptr<ScoredDocumentHitsRanker> ranker;
2484   if (join_children_fetcher != nullptr) {
2485     std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
2486     // Join 2 scored document hits
2487     JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2488                                  qualified_id_join_index_.get(),
2489                                  current_time_ms);
2490     libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
2491         joined_result_document_hits_or = join_processor.Join(
2492             join_spec, std::move(query_scoring_results.scored_document_hits),
2493             *join_children_fetcher);
2494     if (!joined_result_document_hits_or.ok()) {
2495       TransformStatus(joined_result_document_hits_or.status(), result_status);
2496       return result_proto;
2497     }
2498     std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
2499         std::move(joined_result_document_hits_or).ValueOrDie();
2500 
2501     query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
2502 
2503     std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2504     // Ranks results
2505     ranker = std::make_unique<
2506         PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
2507         std::move(joined_result_document_hits),
2508         /*is_descending=*/scoring_spec.order_by() ==
2509             ScoringSpecProto::Order::DESC);
2510     query_stats->set_ranking_latency_ms(
2511         component_timer->GetElapsedMilliseconds());
2512   } else {
2513     // Non-join query
2514     std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2515     // Ranks results
2516     ranker = std::make_unique<
2517         PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
2518         std::move(query_scoring_results.scored_document_hits),
2519         /*is_descending=*/scoring_spec.order_by() ==
2520             ScoringSpecProto::Order::DESC);
2521     query_stats->set_ranking_latency_ms(
2522         component_timer->GetElapsedMilliseconds());
2523   }
2524 
2525   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2526   // CacheAndRetrieveFirstPage and retrieves the document protos and snippets if
2527   // requested
2528   auto result_retriever_or =
2529       ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2530                                 language_segmenter_.get(), normalizer_.get());
2531   if (!result_retriever_or.ok()) {
2532     TransformStatus(result_retriever_or.status(), result_status);
2533     query_stats->set_document_retrieval_latency_ms(
2534         component_timer->GetElapsedMilliseconds());
2535     return result_proto;
2536   }
2537   std::unique_ptr<ResultRetrieverV2> result_retriever =
2538       std::move(result_retriever_or).ValueOrDie();
2539 
2540   libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2541       page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
2542           std::move(ranker), std::move(parent_result_adjustment_info),
2543           std::move(child_result_adjustment_info), result_spec,
2544           *document_store_, *result_retriever, current_time_ms);
2545   if (!page_result_info_or.ok()) {
2546     TransformStatus(page_result_info_or.status(), result_status);
2547     query_stats->set_document_retrieval_latency_ms(
2548         component_timer->GetElapsedMilliseconds());
2549     return result_proto;
2550   }
2551   std::pair<uint64_t, PageResult> page_result_info =
2552       std::move(page_result_info_or).ValueOrDie();
2553 
2554   // Assembles the final search result proto
2555   result_proto.mutable_results()->Reserve(
2556       page_result_info.second.results.size());
2557 
2558   int32_t child_count = 0;
2559   for (SearchResultProto::ResultProto& result :
2560        page_result_info.second.results) {
2561     child_count += result.joined_results_size();
2562     result_proto.mutable_results()->Add(std::move(result));
2563   }
2564 
2565   result_status->set_code(StatusProto::OK);
2566   if (page_result_info.first != kInvalidNextPageToken) {
2567     result_proto.set_next_page_token(page_result_info.first);
2568   }
2569 
2570   query_stats->set_document_retrieval_latency_ms(
2571       component_timer->GetElapsedMilliseconds());
2572   query_stats->set_num_results_returned_current_page(
2573       result_proto.results_size());
2574 
2575   query_stats->set_num_joined_results_returned_current_page(child_count);
2576 
2577   query_stats->set_num_results_with_snippets(
2578       page_result_info.second.num_results_with_snippets);
2579   return result_proto;
2580 }
2581 
ProcessQueryAndScore(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec,const JoinChildrenFetcher * join_children_fetcher,int64_t current_time_ms,QueryStatsProto::SearchStats * search_stats)2582 IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
2583     const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2584     const ResultSpecProto& result_spec,
2585     const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
2586     QueryStatsProto::SearchStats* search_stats) {
2587   search_stats->set_num_namespaces_filtered(
2588       search_spec.namespace_filters_size());
2589   search_stats->set_num_schema_types_filtered(
2590       search_spec.schema_type_filters_size());
2591   search_stats->set_query_length(search_spec.query().length());
2592   search_stats->set_ranking_strategy(scoring_spec.rank_by());
2593 
2594   std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2595 
2596   // Gets unordered results from query processor
2597   auto query_processor_or = QueryProcessor::Create(
2598       index_.get(), integer_index_.get(), embedding_index_.get(),
2599       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
2600       schema_store_.get(), join_children_fetcher, clock_.get(),
2601       &feature_flags_);
2602   if (!query_processor_or.ok()) {
2603     search_stats->set_parse_query_latency_ms(
2604         component_timer->GetElapsedMilliseconds());
2605     return QueryScoringResults(std::move(query_processor_or).status(),
2606                                /*query_terms_in=*/{},
2607                                /*scored_document_hits_in=*/{});
2608   }
2609   std::unique_ptr<QueryProcessor> query_processor =
2610       std::move(query_processor_or).ValueOrDie();
2611 
2612   auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
2613   libtextclassifier3::StatusOr<QueryResults> query_results_or;
2614   if (ranking_strategy_or.ok()) {
2615     query_results_or = query_processor->ParseSearch(
2616         search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms,
2617         search_stats);
2618   } else {
2619     query_results_or = ranking_strategy_or.status();
2620   }
2621   search_stats->set_parse_query_latency_ms(
2622       component_timer->GetElapsedMilliseconds());
2623   if (!query_results_or.ok()) {
2624     return QueryScoringResults(std::move(query_results_or).status(),
2625                                /*query_terms_in=*/{},
2626                                /*scored_document_hits_in=*/{});
2627   }
2628   QueryResults query_results = std::move(query_results_or).ValueOrDie();
2629 
2630   // Set SearchStats related to QueryResults.
2631   int term_count = 0;
2632   for (const auto& section_and_terms : query_results.query_terms) {
2633     term_count += section_and_terms.second.size();
2634   }
2635   search_stats->set_num_terms(term_count);
2636 
2637   if (query_results.features_in_use.count(kNumericSearchFeature)) {
2638     search_stats->set_is_numeric_query(true);
2639   }
2640 
2641   component_timer = clock_->GetNewTimer();
2642   // Scores but does not rank the results.
2643   libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
2644       scoring_processor_or = ScoringProcessor::Create(
2645           scoring_spec, /*default_semantic_metric_type=*/
2646           search_spec.embedding_query_metric_type(), document_store_.get(),
2647           schema_store_.get(), current_time_ms, join_children_fetcher,
2648           &query_results.embedding_query_results, &feature_flags_);
2649   if (!scoring_processor_or.ok()) {
2650     return QueryScoringResults(std::move(scoring_processor_or).status(),
2651                                std::move(query_results.query_terms),
2652                                /*scored_document_hits_in=*/{});
2653   }
2654   std::unique_ptr<ScoringProcessor> scoring_processor =
2655       std::move(scoring_processor_or).ValueOrDie();
2656   std::vector<ScoredDocumentHit> scored_document_hits =
2657       scoring_processor->Score(
2658           std::move(query_results.root_iterator), result_spec.num_to_score(),
2659           &query_results.query_term_iterators, search_stats);
2660   search_stats->set_scoring_latency_ms(
2661       component_timer->GetElapsedMilliseconds());
2662 
2663   return QueryScoringResults(libtextclassifier3::Status::OK,
2664                              std::move(query_results.query_terms),
2665                              std::move(scored_document_hits));
2666 }
2667 
GetNextPage(uint64_t next_page_token)2668 SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
2669   SearchResultProto result_proto;
2670   StatusProto* result_status = result_proto.mutable_status();
2671 
2672   QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2673   query_stats->set_is_first_page(false);
2674   std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2675   // ResultStateManager has its own writer lock, so here we only need a reader
2676   // lock for other components.
2677   absl_ports::shared_lock l(&mutex_);
2678   query_stats->set_lock_acquisition_latency_ms(
2679       overall_timer->GetElapsedMilliseconds());
2680   if (!initialized_) {
2681     result_status->set_code(StatusProto::FAILED_PRECONDITION);
2682     result_status->set_message("IcingSearchEngine has not been initialized!");
2683     return result_proto;
2684   }
2685 
2686   auto result_retriever_or =
2687       ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2688                                 language_segmenter_.get(), normalizer_.get());
2689   if (!result_retriever_or.ok()) {
2690     TransformStatus(result_retriever_or.status(), result_status);
2691     return result_proto;
2692   }
2693   std::unique_ptr<ResultRetrieverV2> result_retriever =
2694       std::move(result_retriever_or).ValueOrDie();
2695 
2696   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2697   libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2698       page_result_info_or = result_state_manager_->GetNextPage(
2699           next_page_token, *result_retriever, current_time_ms);
2700   if (!page_result_info_or.ok()) {
2701     if (absl_ports::IsNotFound(page_result_info_or.status())) {
2702       // NOT_FOUND means an empty result.
2703       result_status->set_code(StatusProto::OK);
2704     } else {
2705       // Real error, pass up.
2706       TransformStatus(page_result_info_or.status(), result_status);
2707     }
2708     return result_proto;
2709   }
2710 
2711   std::pair<uint64_t, PageResult> page_result_info =
2712       std::move(page_result_info_or).ValueOrDie();
2713   query_stats->set_requested_page_size(
2714       page_result_info.second.requested_page_size);
2715 
2716   // Assembles the final search result proto
2717   result_proto.mutable_results()->Reserve(
2718       page_result_info.second.results.size());
2719 
2720   int32_t child_count = 0;
2721   for (SearchResultProto::ResultProto& result :
2722        page_result_info.second.results) {
2723     child_count += result.joined_results_size();
2724     result_proto.mutable_results()->Add(std::move(result));
2725   }
2726 
2727   result_status->set_code(StatusProto::OK);
2728   if (page_result_info.first != kInvalidNextPageToken) {
2729     result_proto.set_next_page_token(page_result_info.first);
2730   }
2731 
2732   // The only thing that we're doing is document retrieval. So document
2733   // retrieval latency and overall latency are the same and can use the same
2734   // timer.
2735   query_stats->set_document_retrieval_latency_ms(
2736       overall_timer->GetElapsedMilliseconds());
2737   query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
2738   query_stats->set_num_results_returned_current_page(
2739       result_proto.results_size());
2740   query_stats->set_num_results_with_snippets(
2741       page_result_info.second.num_results_with_snippets);
2742   query_stats->set_num_joined_results_returned_current_page(child_count);
2743 
2744   return result_proto;
2745 }
2746 
InvalidateNextPageToken(uint64_t next_page_token)2747 void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
2748   absl_ports::shared_lock l(&mutex_);
2749   if (!initialized_) {
2750     ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
2751     return;
2752   }
2753   result_state_manager_->InvalidateResultState(next_page_token);
2754 }
2755 
OpenWriteBlob(const PropertyProto::BlobHandleProto & blob_handle)2756 BlobProto IcingSearchEngine::OpenWriteBlob(
2757     const PropertyProto::BlobHandleProto& blob_handle) {
2758   BlobProto blob_proto;
2759   StatusProto* status = blob_proto.mutable_status();
2760 
2761   absl_ports::unique_lock l(&mutex_);
2762   if (blob_store_ == nullptr) {
2763     status->set_code(StatusProto::FAILED_PRECONDITION);
2764     status->set_message(
2765         "Open write blob is not supported in this Icing instance!");
2766     return blob_proto;
2767   }
2768 
2769   if (!initialized_) {
2770     status->set_code(StatusProto::FAILED_PRECONDITION);
2771     status->set_message("IcingSearchEngine has not been initialized!");
2772     return blob_proto;
2773   }
2774 
2775   libtextclassifier3::StatusOr<int> write_fd_or =
2776       blob_store_->OpenWrite(blob_handle);
2777   if (!write_fd_or.ok()) {
2778     TransformStatus(write_fd_or.status(), status);
2779     return blob_proto;
2780   }
2781   blob_proto.set_file_descriptor(write_fd_or.ValueOrDie());
2782   status->set_code(StatusProto::OK);
2783   return blob_proto;
2784 }
2785 
RemoveBlob(const PropertyProto::BlobHandleProto & blob_handle)2786 BlobProto IcingSearchEngine::RemoveBlob(
2787     const PropertyProto::BlobHandleProto& blob_handle) {
2788   BlobProto blob_proto;
2789   StatusProto* status = blob_proto.mutable_status();
2790 
2791   absl_ports::unique_lock l(&mutex_);
2792   if (blob_store_ == nullptr) {
2793     status->set_code(StatusProto::FAILED_PRECONDITION);
2794     status->set_message("Remove blob is not supported in this Icing instance!");
2795     return blob_proto;
2796   }
2797 
2798   if (!initialized_) {
2799     status->set_code(StatusProto::FAILED_PRECONDITION);
2800     status->set_message("IcingSearchEngine has not been initialized!");
2801     return blob_proto;
2802   }
2803 
2804   auto remove_result = blob_store_->RemoveBlob(blob_handle);
2805   if (!remove_result.ok()) {
2806     TransformStatus(remove_result, status);
2807     return blob_proto;
2808   }
2809   status->set_code(StatusProto::OK);
2810   return blob_proto;
2811 }
2812 
OpenReadBlob(const PropertyProto::BlobHandleProto & blob_handle)2813 BlobProto IcingSearchEngine::OpenReadBlob(
2814     const PropertyProto::BlobHandleProto& blob_handle) {
2815   BlobProto blob_proto;
2816   StatusProto* status = blob_proto.mutable_status();
2817   absl_ports::shared_lock l(&mutex_);
2818   if (blob_store_ == nullptr) {
2819     status->set_code(StatusProto::FAILED_PRECONDITION);
2820     status->set_message(
2821         "Open read blob is not supported in this Icing instance!");
2822     return blob_proto;
2823   }
2824 
2825   if (!initialized_) {
2826     status->set_code(StatusProto::FAILED_PRECONDITION);
2827     status->set_message("IcingSearchEngine has not been initialized!");
2828     ICING_LOG(ERROR) << status->message();
2829     return blob_proto;
2830   }
2831 
2832   auto read_fd_or = blob_store_->OpenRead(blob_handle);
2833   if (!read_fd_or.ok()) {
2834     TransformStatus(read_fd_or.status(), status);
2835     return blob_proto;
2836   }
2837   blob_proto.set_file_descriptor(read_fd_or.ValueOrDie());
2838   status->set_code(StatusProto::OK);
2839   return blob_proto;
2840 }
2841 
CommitBlob(const PropertyProto::BlobHandleProto & blob_handle)2842 BlobProto IcingSearchEngine::CommitBlob(
2843     const PropertyProto::BlobHandleProto& blob_handle) {
2844   BlobProto blob_proto;
2845   StatusProto* status = blob_proto.mutable_status();
2846   absl_ports::unique_lock l(&mutex_);
2847   if (blob_store_ == nullptr) {
2848     status->set_code(StatusProto::FAILED_PRECONDITION);
2849     status->set_message("Commit blob is not supported in this Icing instance!");
2850     return blob_proto;
2851   }
2852 
2853   if (!initialized_) {
2854     status->set_code(StatusProto::FAILED_PRECONDITION);
2855     status->set_message("IcingSearchEngine has not been initialized!");
2856     ICING_LOG(ERROR) << status->message();
2857     return blob_proto;
2858   }
2859 
2860   auto commit_result_or = blob_store_->CommitBlob(blob_handle);
2861   if (!commit_result_or.ok()) {
2862     TransformStatus(commit_result_or, status);
2863     return blob_proto;
2864   }
2865   status->set_code(StatusProto::OK);
2866   return blob_proto;
2867 }
2868 
2869 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeDocumentStore(std::unordered_set<std::string> && potentially_optimizable_blob_handles,OptimizeStatsProto * optimize_stats)2870 IcingSearchEngine::OptimizeDocumentStore(
2871     std::unordered_set<std::string>&& potentially_optimizable_blob_handles,
2872     OptimizeStatsProto* optimize_stats) {
2873   // Gets the current directory path and an empty tmp directory path for
2874   // document store optimization.
2875   const std::string current_document_dir =
2876       MakeDocumentDirectoryPath(options_.base_dir());
2877   const std::string temporary_document_dir =
2878       MakeDocumentTemporaryDirectoryPath(options_.base_dir());
2879   if (!filesystem_->DeleteDirectoryRecursively(
2880           temporary_document_dir.c_str()) ||
2881       !filesystem_->CreateDirectoryRecursively(
2882           temporary_document_dir.c_str())) {
2883     return absl_ports::AbortedError(absl_ports::StrCat(
2884         "Failed to create a tmp directory: ", temporary_document_dir));
2885   }
2886 
2887   // Copies valid document data to tmp directory
2888   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
2889       optimize_result_or = document_store_->OptimizeInto(
2890           temporary_document_dir, language_segmenter_.get(),
2891           std::move(potentially_optimizable_blob_handles), optimize_stats);
2892 
2893   // Handles error if any
2894   if (!optimize_result_or.ok()) {
2895     filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
2896     return absl_ports::Annotate(
2897         absl_ports::AbortedError("Failed to optimize document store"),
2898         optimize_result_or.status().error_message());
2899   }
2900 
2901   // result_state_manager_ depends on document_store_. So we need to reset it at
2902   // the same time that we reset the document_store_.
2903   result_state_manager_.reset();
2904   document_store_.reset();
2905 
2906   // When swapping files, always put the current working directory at the
2907   // second place because it is renamed at the latter position so we're less
2908   // vulnerable to errors.
2909   if (!filesystem_->SwapFiles(temporary_document_dir.c_str(),
2910                               current_document_dir.c_str())) {
2911     ICING_LOG(ERROR) << "Failed to swap files";
2912 
2913     // Ensures that current directory is still present.
2914     if (!filesystem_->CreateDirectoryRecursively(
2915             current_document_dir.c_str())) {
2916       // Can't even create the old directory. Mark as uninitialized and return
2917       // INTERNAL.
2918       initialized_ = false;
2919       return absl_ports::InternalError(
2920           "Failed to create file directory for document store");
2921     }
2922 
2923     // Tries to rebuild document store if swapping fails, to avoid leaving the
2924     // system in the broken state for future operations.
2925     auto create_result_or = DocumentStore::Create(
2926         filesystem_.get(), current_document_dir, clock_.get(),
2927         schema_store_.get(), &feature_flags_,
2928         /*force_recovery_and_revalidate_documents=*/false,
2929         /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2930         options_.compression_level(), /*initialize_stats=*/nullptr);
2931     // TODO(b/144458732): Implement a more robust version of
2932     // TC_ASSIGN_OR_RETURN that can support error logging.
2933     if (!create_result_or.ok()) {
2934       // Unable to create DocumentStore from the old file. Mark as uninitialized
2935       // and return INTERNAL.
2936       initialized_ = false;
2937       ICING_LOG(ERROR) << "Failed to create document store instance";
2938       return absl_ports::Annotate(
2939           absl_ports::InternalError("Failed to create document store instance"),
2940           create_result_or.status().error_message());
2941     }
2942     document_store_ = std::move(create_result_or.ValueOrDie().document_store);
2943     result_state_manager_ = std::make_unique<ResultStateManager>(
2944         performance_configuration_.max_num_total_hits, *document_store_);
2945 
2946     // Potential data loss
2947     // TODO(b/147373249): Find a way to detect true data loss error
2948     return absl_ports::DataLossError(
2949         "Failed to optimize document store, there might be data loss");
2950   }
2951 
2952   // Recreates the doc store instance
2953   auto create_result_or = DocumentStore::Create(
2954       filesystem_.get(), current_document_dir, clock_.get(),
2955       schema_store_.get(), &feature_flags_,
2956       /*force_recovery_and_revalidate_documents=*/false,
2957       /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2958       options_.compression_level(), /*initialize_stats=*/nullptr);
2959   if (!create_result_or.ok()) {
2960     // Unable to create DocumentStore from the new file. Mark as uninitialized
2961     // and return INTERNAL.
2962     initialized_ = false;
2963     return absl_ports::InternalError(
2964         "Document store has been optimized, but a valid document store "
2965         "instance can't be created");
2966   }
2967   DocumentStore::CreateResult create_result =
2968       std::move(create_result_or).ValueOrDie();
2969   document_store_ = std::move(create_result.document_store);
2970   result_state_manager_ = std::make_unique<ResultStateManager>(
2971       performance_configuration_.max_num_total_hits, *document_store_);
2972 
2973   // Deletes tmp directory
2974   if (!filesystem_->DeleteDirectoryRecursively(
2975           temporary_document_dir.c_str())) {
2976     ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
2977                         "delete temporary file directory";
2978   }
2979 
2980   // Since we created new (optimized) document store with correct PersistToDisk
2981   // call, we shouldn't have data loss or regenerate derived files. Therefore,
2982   // if we really encounter any of these situations, then return DataLossError
2983   // to let the caller rebuild index.
2984   if (create_result.data_loss != DataLoss::NONE ||
2985       create_result.derived_files_regenerated) {
2986     return absl_ports::DataLossError(
2987         "Unexpected data loss or derived files regenerated for new document "
2988         "store");
2989   }
2990 
2991   return optimize_result_or;
2992 }
2993 
2994 IcingSearchEngine::IndexRestorationResult
RestoreIndexIfNeeded()2995 IcingSearchEngine::RestoreIndexIfNeeded() {
2996   DocumentId last_stored_document_id =
2997       document_store_->last_added_document_id();
2998   if (last_stored_document_id == index_->last_added_document_id() &&
2999       last_stored_document_id == integer_index_->last_added_document_id() &&
3000       last_stored_document_id ==
3001           qualified_id_join_index_->last_added_document_id() &&
3002       last_stored_document_id == embedding_index_->last_added_document_id()) {
3003     // No need to recover.
3004     return {libtextclassifier3::Status::OK, false, false, false, false};
3005   }
3006 
3007   if (last_stored_document_id == kInvalidDocumentId) {
3008     // Document store is empty but index is not. Clear the index.
3009     return {ClearAllIndices(), false, false, false, false};
3010   }
3011 
3012   // Truncate indices first.
3013   auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
3014   if (!truncate_result_or.ok()) {
3015     return {std::move(truncate_result_or).status(), false, false, false, false};
3016   }
3017   TruncateIndexResult truncate_result =
3018       std::move(truncate_result_or).ValueOrDie();
3019 
3020   if (truncate_result.first_document_to_reindex > last_stored_document_id) {
3021     // Nothing to restore. Just return.
3022     return {libtextclassifier3::Status::OK, false, false, false, false};
3023   }
3024 
3025   auto data_indexing_handlers_or = CreateDataIndexingHandlers();
3026   if (!data_indexing_handlers_or.ok()) {
3027     return {data_indexing_handlers_or.status(),
3028             truncate_result.index_needed_restoration,
3029             truncate_result.integer_index_needed_restoration,
3030             truncate_result.qualified_id_join_index_needed_restoration,
3031             truncate_result.embedding_index_needed_restoration};
3032   }
3033   // By using recovery_mode for IndexProcessor, we're able to replay documents
3034   // from smaller document id and it will skip documents that are already been
3035   // indexed.
3036   IndexProcessor index_processor(
3037       std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
3038       /*recovery_mode=*/true);
3039 
3040   ICING_VLOG(1) << "Restoring index by replaying documents from document id "
3041                 << truncate_result.first_document_to_reindex
3042                 << " to document id " << last_stored_document_id;
3043   libtextclassifier3::Status overall_status;
3044   for (DocumentId document_id = truncate_result.first_document_to_reindex;
3045        document_id <= last_stored_document_id; ++document_id) {
3046     libtextclassifier3::StatusOr<DocumentProto> document_or =
3047         document_store_->Get(document_id);
3048 
3049     if (!document_or.ok()) {
3050       if (absl_ports::IsInvalidArgument(document_or.status()) ||
3051           absl_ports::IsNotFound(document_or.status())) {
3052         // Skips invalid and non-existing documents.
3053         continue;
3054       } else {
3055         // Returns other errors
3056         return {document_or.status(), truncate_result.index_needed_restoration,
3057                 truncate_result.integer_index_needed_restoration,
3058                 truncate_result.qualified_id_join_index_needed_restoration,
3059                 truncate_result.embedding_index_needed_restoration};
3060       }
3061     }
3062     DocumentProto document(std::move(document_or).ValueOrDie());
3063 
3064     libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
3065         TokenizedDocument::Create(schema_store_.get(),
3066                                   language_segmenter_.get(),
3067                                   std::move(document));
3068     if (!tokenized_document_or.ok()) {
3069       return {tokenized_document_or.status(),
3070               truncate_result.index_needed_restoration,
3071               truncate_result.integer_index_needed_restoration,
3072               truncate_result.qualified_id_join_index_needed_restoration,
3073               truncate_result.embedding_index_needed_restoration};
3074     }
3075     TokenizedDocument tokenized_document(
3076         std::move(tokenized_document_or).ValueOrDie());
3077 
3078     // No valid old_document_id should be used here since we're in recovery mode
3079     // and there is no "existing document replacement/update".
3080     libtextclassifier3::Status status =
3081         index_processor.IndexDocument(tokenized_document, document_id,
3082                                       /*old_document_id=*/kInvalidDocumentId);
3083     if (!status.ok()) {
3084       if (!absl_ports::IsDataLoss(status)) {
3085         // Real error. Stop recovering and pass it up.
3086         return {status, truncate_result.index_needed_restoration,
3087                 truncate_result.integer_index_needed_restoration,
3088                 truncate_result.qualified_id_join_index_needed_restoration,
3089                 truncate_result.embedding_index_needed_restoration};
3090       }
3091       // FIXME: why can we skip data loss error here?
3092       // Just a data loss. Keep trying to add the remaining docs, but report the
3093       // data loss when we're done.
3094       overall_status = status;
3095     }
3096   }
3097 
3098   return {overall_status, truncate_result.index_needed_restoration,
3099           truncate_result.integer_index_needed_restoration,
3100           truncate_result.qualified_id_join_index_needed_restoration,
3101           truncate_result.embedding_index_needed_restoration};
3102 }
3103 
LostPreviousSchema()3104 libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
3105   auto status_or = schema_store_->GetSchema();
3106   if (status_or.ok()) {
3107     // Found a schema.
3108     return false;
3109   }
3110 
3111   if (!absl_ports::IsNotFound(status_or.status())) {
3112     // Any other type of error
3113     return status_or.status();
3114   }
3115 
3116   // We know: We don't have a schema now.
3117   //
3118   // We know: If no documents have been added, then the last_added_document_id
3119   // will be invalid.
3120   //
3121   // So: If documents have been added before and we don't have a schema now,
3122   // then that means we must have had a schema at some point. Since we wouldn't
3123   // accept documents without a schema to validate them against.
3124   return document_store_->last_added_document_id() != kInvalidDocumentId;
3125 }
3126 
3127 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
CreateDataIndexingHandlers()3128 IcingSearchEngine::CreateDataIndexingHandlers() {
3129   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
3130 
3131   // Term index handler
3132   ICING_ASSIGN_OR_RETURN(
3133       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
3134       TermIndexingHandler::Create(
3135           clock_.get(), normalizer_.get(), index_.get(),
3136           options_.build_property_existence_metadata_hits()));
3137   handlers.push_back(std::move(term_indexing_handler));
3138 
3139   // Integer index handler
3140   ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
3141                              integer_section_indexing_handler,
3142                          IntegerSectionIndexingHandler::Create(
3143                              clock_.get(), integer_index_.get()));
3144   handlers.push_back(std::move(integer_section_indexing_handler));
3145 
3146   // Qualified id join index handler
3147   ICING_ASSIGN_OR_RETURN(
3148       std::unique_ptr<QualifiedIdJoinIndexingHandler>
3149           qualified_id_join_indexing_handler,
3150       QualifiedIdJoinIndexingHandler::Create(
3151           clock_.get(), document_store_.get(), qualified_id_join_index_.get()));
3152   handlers.push_back(std::move(qualified_id_join_indexing_handler));
3153 
3154   // Embedding index handler
3155   ICING_ASSIGN_OR_RETURN(
3156       std::unique_ptr<EmbeddingIndexingHandler> embedding_indexing_handler,
3157       EmbeddingIndexingHandler::Create(clock_.get(), embedding_index_.get(),
3158                                        options_.enable_embedding_index()));
3159   handlers.push_back(std::move(embedding_indexing_handler));
3160   return handlers;
3161 }
3162 
3163 libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
TruncateIndicesTo(DocumentId last_stored_document_id)3164 IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
3165   // Attempt to truncate term index.
3166   // TruncateTo ensures that the index does not hold any data that is not
3167   // present in the ground truth. If the document store lost some documents,
3168   // TruncateTo will ensure that the index does not contain any hits from those
3169   // lost documents. If the index does not contain any hits for documents with
3170   // document id greater than last_stored_document_id, then TruncateTo will have
3171   // no effect.
3172   ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
3173 
3174   // Get last indexed document id for term index after truncating.
3175   DocumentId term_index_last_added_document_id =
3176       index_->last_added_document_id();
3177   DocumentId first_document_to_reindex =
3178       (term_index_last_added_document_id != kInvalidDocumentId)
3179           ? term_index_last_added_document_id + 1
3180           : kMinDocumentId;
3181   bool index_needed_restoration =
3182       (last_stored_document_id != term_index_last_added_document_id);
3183 
3184   // Attempt to truncate integer index.
3185   bool integer_index_needed_restoration = false;
3186   DocumentId integer_index_last_added_document_id =
3187       integer_index_->last_added_document_id();
3188   if (integer_index_last_added_document_id == kInvalidDocumentId ||
3189       last_stored_document_id > integer_index_last_added_document_id) {
3190     // If last_stored_document_id is greater than
3191     // integer_index_last_added_document_id, then we only have to replay docs
3192     // starting from integer_index_last_added_document_id + 1. Also use std::min
3193     // since we might need to replay even smaller doc ids for term index.
3194     integer_index_needed_restoration = true;
3195     if (integer_index_last_added_document_id != kInvalidDocumentId) {
3196       first_document_to_reindex = std::min(
3197           first_document_to_reindex, integer_index_last_added_document_id + 1);
3198     } else {
3199       first_document_to_reindex = kMinDocumentId;
3200     }
3201   } else if (last_stored_document_id < integer_index_last_added_document_id) {
3202     // Clear the entire integer index if last_stored_document_id is smaller than
3203     // integer_index_last_added_document_id, because there is no way to remove
3204     // data with doc_id > last_stored_document_id from integer index and we have
3205     // to rebuild.
3206     ICING_RETURN_IF_ERROR(integer_index_->Clear());
3207 
3208     // Since the entire integer index is discarded, we start to rebuild it by
3209     // setting first_document_to_reindex to kMinDocumentId.
3210     integer_index_needed_restoration = true;
3211     first_document_to_reindex = kMinDocumentId;
3212   }
3213 
3214   // Attempt to truncate qualified id join index
3215   bool qualified_id_join_index_needed_restoration = false;
3216   DocumentId qualified_id_join_index_last_added_document_id =
3217       qualified_id_join_index_->last_added_document_id();
3218   if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
3219       last_stored_document_id >
3220           qualified_id_join_index_last_added_document_id) {
3221     // If last_stored_document_id is greater than
3222     // qualified_id_join_index_last_added_document_id, then we only have to
3223     // replay docs starting from (qualified_id_join_index_last_added_document_id
3224     // + 1). Also use std::min since we might need to replay even smaller doc
3225     // ids for other components.
3226     qualified_id_join_index_needed_restoration = true;
3227     if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
3228       first_document_to_reindex =
3229           std::min(first_document_to_reindex,
3230                    qualified_id_join_index_last_added_document_id + 1);
3231     } else {
3232       first_document_to_reindex = kMinDocumentId;
3233     }
3234   } else if (last_stored_document_id <
3235              qualified_id_join_index_last_added_document_id) {
3236     // Clear the entire qualified id join index if last_stored_document_id is
3237     // smaller than qualified_id_join_index_last_added_document_id, because
3238     // there is no way to remove data with doc_id > last_stored_document_id from
3239     // join index efficiently and we have to rebuild.
3240     ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
3241 
3242     // Since the entire qualified id join index is discarded, we start to
3243     // rebuild it by setting first_document_to_reindex to kMinDocumentId.
3244     qualified_id_join_index_needed_restoration = true;
3245     first_document_to_reindex = kMinDocumentId;
3246   }
3247 
3248   // Attempt to truncate embedding index
3249   bool embedding_index_needed_restoration = false;
3250   DocumentId embedding_index_last_added_document_id =
3251       embedding_index_->last_added_document_id();
3252   if (embedding_index_last_added_document_id == kInvalidDocumentId ||
3253       last_stored_document_id > embedding_index_last_added_document_id) {
3254     // If last_stored_document_id is greater than
3255     // embedding_index_last_added_document_id, then we only have to replay docs
3256     // starting from (embedding_index_last_added_document_id + 1). Also use
3257     // std::min since we might need to replay even smaller doc ids for other
3258     // components.
3259     embedding_index_needed_restoration = true;
3260     if (embedding_index_last_added_document_id != kInvalidDocumentId) {
3261       first_document_to_reindex =
3262           std::min(first_document_to_reindex,
3263                    embedding_index_last_added_document_id + 1);
3264     } else {
3265       first_document_to_reindex = kMinDocumentId;
3266     }
3267   } else if (last_stored_document_id < embedding_index_last_added_document_id) {
3268     // Clear the entire embedding index if last_stored_document_id is
3269     // smaller than embedding_index_last_added_document_id, because
3270     // there is no way to remove data with doc_id > last_stored_document_id from
3271     // embedding index efficiently and we have to rebuild.
3272     ICING_RETURN_IF_ERROR(embedding_index_->Clear());
3273 
3274     // Since the entire embedding index is discarded, we start to
3275     // rebuild it by setting first_document_to_reindex to kMinDocumentId.
3276     embedding_index_needed_restoration = true;
3277     first_document_to_reindex = kMinDocumentId;
3278   }
3279 
3280   return TruncateIndexResult(first_document_to_reindex,
3281                              index_needed_restoration,
3282                              integer_index_needed_restoration,
3283                              qualified_id_join_index_needed_restoration,
3284                              embedding_index_needed_restoration);
3285 }
3286 
DiscardDerivedFiles(const version_util::DerivedFilesRebuildResult & rebuild_result)3287 libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles(
3288     const version_util::DerivedFilesRebuildResult& rebuild_result) {
3289   if (!rebuild_result.IsRebuildNeeded()) {
3290     return libtextclassifier3::Status::OK;
3291   }
3292 
3293   if (schema_store_ != nullptr || document_store_ != nullptr ||
3294       index_ != nullptr || integer_index_ != nullptr ||
3295       qualified_id_join_index_ != nullptr || embedding_index_ != nullptr) {
3296     return absl_ports::FailedPreconditionError(
3297         "Cannot discard derived files while having valid instances");
3298   }
3299 
3300   // Schema store
3301   if (rebuild_result.needs_schema_store_derived_files_rebuild) {
3302     ICING_RETURN_IF_ERROR(SchemaStore::DiscardDerivedFiles(
3303         filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir())));
3304   }
3305 
3306   // Document store
3307   if (rebuild_result.needs_document_store_derived_files_rebuild) {
3308     ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
3309         filesystem_.get(), MakeDocumentDirectoryPath(options_.base_dir())));
3310   }
3311 
3312   // Term index
3313   if (rebuild_result.needs_term_index_rebuild) {
3314     if (!filesystem_->DeleteDirectoryRecursively(
3315             MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
3316       return absl_ports::InternalError("Failed to discard index");
3317     }
3318   }
3319 
3320   // Integer index
3321   if (rebuild_result.needs_integer_index_rebuild) {
3322     if (!filesystem_->DeleteDirectoryRecursively(
3323             MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
3324       return absl_ports::InternalError("Failed to discard integer index");
3325     }
3326   }
3327 
3328   // Qualified id join index
3329   if (rebuild_result.needs_qualified_id_join_index_rebuild) {
3330     if (!filesystem_->DeleteDirectoryRecursively(
3331             MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
3332       return absl_ports::InternalError(
3333           "Failed to discard qualified id join index");
3334     }
3335   }
3336 
3337   // Embedding index.
3338   if (rebuild_result.needs_embedding_index_rebuild) {
3339     ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(
3340         *filesystem_, MakeEmbeddingIndexWorkingPath(options_.base_dir())));
3341   }
3342 
3343   return libtextclassifier3::Status::OK;
3344 }
3345 
ClearSearchIndices()3346 libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
3347   ICING_RETURN_IF_ERROR(index_->Reset());
3348   ICING_RETURN_IF_ERROR(integer_index_->Clear());
3349   ICING_RETURN_IF_ERROR(embedding_index_->Clear());
3350   return libtextclassifier3::Status::OK;
3351 }
3352 
ClearJoinIndices()3353 libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
3354   return qualified_id_join_index_->Clear();
3355 }
3356 
ClearAllIndices()3357 libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
3358   ICING_RETURN_IF_ERROR(ClearSearchIndices());
3359   ICING_RETURN_IF_ERROR(ClearJoinIndices());
3360   return libtextclassifier3::Status::OK;
3361 }
3362 
Reset()3363 ResetResultProto IcingSearchEngine::Reset() {
3364   absl_ports::unique_lock l(&mutex_);
3365   return ResetInternal();
3366 }
3367 
ResetInternal()3368 ResetResultProto IcingSearchEngine::ResetInternal() {
3369   ICING_VLOG(1) << "Resetting IcingSearchEngine";
3370 
3371   ResetResultProto result_proto;
3372   StatusProto* result_status = result_proto.mutable_status();
3373 
3374   initialized_ = false;
3375   ResetMembers();
3376   if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
3377     result_status->set_code(StatusProto::INTERNAL);
3378     return result_proto;
3379   }
3380 
3381   if (InternalInitialize().status().code() != StatusProto::OK) {
3382     // We shouldn't hit the following Initialize errors:
3383     //   NOT_FOUND: all data was cleared, we aren't expecting anything
3384     //   DATA_LOSS: all data was cleared, we aren't expecting anything
3385     //   RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
3386     //
3387     // We can't tell if Initialize failed and left Icing in an inconsistent
3388     // state or if it was a temporary I/O error. Group everything under INTERNAL
3389     // to be safe.
3390     //
3391     // TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
3392     // status code, we can just propagate it up from here.
3393     result_status->set_code(StatusProto::INTERNAL);
3394     return result_proto;
3395   }
3396 
3397   result_status->set_code(StatusProto::OK);
3398   return result_proto;
3399 }
3400 
SearchSuggestions(const SuggestionSpecProto & suggestion_spec)3401 SuggestionResponse IcingSearchEngine::SearchSuggestions(
3402     const SuggestionSpecProto& suggestion_spec) {
3403   // TODO(b/146008613) Explore ideas to make this function read-only.
3404   absl_ports::unique_lock l(&mutex_);
3405   SuggestionResponse response;
3406   StatusProto* response_status = response.mutable_status();
3407   if (!initialized_) {
3408     response_status->set_code(StatusProto::FAILED_PRECONDITION);
3409     response_status->set_message("IcingSearchEngine has not been initialized!");
3410     return response;
3411   }
3412 
3413   libtextclassifier3::Status status =
3414       ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
3415   if (!status.ok()) {
3416     TransformStatus(status, response_status);
3417     return response;
3418   }
3419 
3420   // Create the suggestion processor.
3421   auto suggestion_processor_or = SuggestionProcessor::Create(
3422       index_.get(), integer_index_.get(), embedding_index_.get(),
3423       language_segmenter_.get(), normalizer_.get(), document_store_.get(),
3424       schema_store_.get(), clock_.get(), &feature_flags_);
3425   if (!suggestion_processor_or.ok()) {
3426     TransformStatus(suggestion_processor_or.status(), response_status);
3427     return response;
3428   }
3429   std::unique_ptr<SuggestionProcessor> suggestion_processor =
3430       std::move(suggestion_processor_or).ValueOrDie();
3431 
3432   // Run suggestion based on given SuggestionSpec.
3433   int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
3434   libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
3435       suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
3436   if (!terms_or.ok()) {
3437     TransformStatus(terms_or.status(), response_status);
3438     return response;
3439   }
3440 
3441   // Convert vector<TermMetaData> into final SuggestionResponse proto.
3442   for (TermMetadata& term : terms_or.ValueOrDie()) {
3443     SuggestionResponse::Suggestion suggestion;
3444     suggestion.set_query(std::move(term.content));
3445     response.mutable_suggestions()->Add(std::move(suggestion));
3446   }
3447   response_status->set_code(StatusProto::OK);
3448   return response;
3449 }
3450 
3451 }  // namespace lib
3452 }  // namespace icing
3453