xref: /aosp_15_r20/external/icing/icing/icing-search-engine.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_ICING_SEARCH_ENGINE_H_
16 #define ICING_ICING_SEARCH_ENGINE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/mutex.h"
29 #include "icing/absl_ports/thread_annotations.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/filesystem.h"
32 #include "icing/file/version-util.h"
33 #include "icing/index/data-indexing-handler.h"
34 #include "icing/index/embed/embedding-index.h"
35 #include "icing/index/index.h"
36 #include "icing/index/numeric/numeric-index.h"
37 #include "icing/jni/jni-cache.h"
38 #include "icing/join/join-children-fetcher.h"
39 #include "icing/join/qualified-id-join-index.h"
40 #include "icing/legacy/index/icing-filesystem.h"
41 #include "icing/performance-configuration.h"
42 #include "icing/proto/blob.pb.h"
43 #include "icing/proto/debug.pb.h"
44 #include "icing/proto/document.pb.h"
45 #include "icing/proto/initialize.pb.h"
46 #include "icing/proto/logging.pb.h"
47 #include "icing/proto/optimize.pb.h"
48 #include "icing/proto/persist.pb.h"
49 #include "icing/proto/reset.pb.h"
50 #include "icing/proto/schema.pb.h"
51 #include "icing/proto/scoring.pb.h"
52 #include "icing/proto/search.pb.h"
53 #include "icing/proto/storage.pb.h"
54 #include "icing/proto/usage.pb.h"
55 #include "icing/query/query-terms.h"
56 #include "icing/result/result-state-manager.h"
57 #include "icing/schema/schema-store.h"
58 #include "icing/scoring/scored-document-hit.h"
59 #include "icing/store/blob-store.h"
60 #include "icing/store/document-id.h"
61 #include "icing/store/document-store.h"
62 #include "icing/tokenization/language-segmenter.h"
63 #include "icing/transform/normalizer.h"
64 #include "icing/util/clock.h"
65 
66 namespace icing {
67 namespace lib {
68 
69 // TODO(cassiewang) Top-level comments and links to design-doc.
70 class IcingSearchEngine {
71  public:
72   // Note: It is only required to provide a pointer to a valid instance of
73   // JniCache if this instance needs to perform reverse-jni calls. Users on
74   // Linux and iOS should always provide a nullptr.
75   explicit IcingSearchEngine(
76       const IcingSearchEngineOptions& options,
77       std::unique_ptr<const JniCache> jni_cache = nullptr);
78 
79   // Calculates integrity checks and persists files to disk.
80   ~IcingSearchEngine();
81 
82   // Loads & verifies the contents previously indexed from disk and gets ready
83   // to handle read/write requests.
84   //
85   // WARNING: This is expected to be fast if Icing had a clean shutdown.
86   // Otherwise, it can take longer as it runs integrity checks and attempts
87   // to bring the index to a consistent state. If the data on disk is not
88   // consistent, it restores the state when PersistToDisk() was last called.
89   //
90   // TODO(cassiewang): We shouldn't return NOT_FOUND here, this is a symptom
91   // of some other error. We should return a broader error group, i.e. data
92   // inconsistency or something
93   //
94   // Returns:
95   //   OK on success
96   //   DATA_LOSS if encountered any inconsistencies in data and had to restore
97   //     its state back to the last time PersistToDisk was called. Or if any
98   //     persisted data was lost and could not be recovered.
99 //   INTERNAL if any internal state was left inconsistent. The instance
100   //     of IcingSearchEngine is unusable if this happens. It's recommended to
101   //     clear the underlying directory provided in
102   //     IcingSearchEngineOptions.base_dir and reinitialize.
103   //   RESOURCE_EXHAUSTED if not enough storage space
104   //   NOT_FOUND if missing some internal data
105   InitializeResultProto Initialize() ICING_LOCKS_EXCLUDED(mutex_);
106 
107   // Specifies the schema to be applied on all Documents that are already
108   // stored as well as future documents. A schema can be 'invalid' and/or
109   // 'incompatible'. These are two independent concepts.
110   //
111   // An 'invalid' schema is one that is not constructed properly. For example,
112   // a PropertyConfigProto is missing the property name field. A schema can be
113   // 'invalid' even if there is no previously existing schema.
114   //
115   // An 'incompatible' schema is one that is incompatible with a previously
116   // existing schema. If there is no previously existing schema, then a new
117   // schema cannot be incompatible. An incompatible schema is one that
118   // invalidates pre-existing data. For example, a previously OPTIONAL field is
119   // now REQUIRED in the new schema, and pre-existing data is considered invalid
120   // against the new schema now.
121   //
122   // Default behavior will not allow a new schema to be set if it is invalid or
123   // incompatible.
124   //
125   // The argument 'ignore_errors_and_delete_documents' can be set to true to
126   // force set an incompatible schema. In that case, documents that are
127   // invalidated by the new schema would be deleted from Icing. This cannot be
128   // used to force set an invalid schema.
129   //
130   // This schema is persisted to disk and used across multiple instances.
131   // So, callers should only have to call this if the schema changed.
132   // However, calling it multiple times with the same schema is a no-op.
133   //
134   // On some errors, Icing will keep using the older schema, but on
135   // INTERNAL_ERROR, it is undefined to continue using Icing.
136   //
137   // Returns:
138   //   OK on success
139   //   ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
140   //     type or contains a type that has multiple properties with the same
141   //     name.
142   //   INVALID_ARGUMENT if 'new_schema' is invalid
143   //   FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
144   //     has not been initialized yet.
145   //   INTERNAL_ERROR if Icing failed to store the new schema or upgrade
146   //     existing data based on the new schema. Using Icing beyond this error is
147   //     undefined and may cause crashes.
148   //   DATA_LOSS_ERROR if 'new_schema' requires the index to be rebuilt and an
149   //     IO error leads to some documents being excluded from the index. These
150   //     documents will still be retrievable via Get, but won't match queries.
151   //
152   // TODO(cassiewang) Figure out, document (and maybe even enforce) the best
153   // ordering of calls between Initialize() and SetSchema(), both when
154   // the caller is creating an instance of IcingSearchEngine for the first
155   // time and when the caller is reinitializing an existing index on disk.
156   SetSchemaResultProto SetSchema(
157       SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
158       ICING_LOCKS_EXCLUDED(mutex_);
159 
160   // This function makes a copy of the schema and calls SetSchema(SchemaProto&&
161   // new_schema, bool ignore_errors_and_delete_documents)
162   //
163   // NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
164   // ignore_errors_and_delete_documents) directly to avoid a copy if the caller
165   // can make an rvalue SchemaProto.
166   SetSchemaResultProto SetSchema(const SchemaProto& new_schema,
167                                  bool ignore_errors_and_delete_documents =
168                                      false) ICING_LOCKS_EXCLUDED(mutex_);
169 
170   // Get Icing's current copy of the schema.
171   //
172   // Returns:
173   //   SchemaProto on success
174   //   NOT_FOUND if a schema has not been set yet
175   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet.
176   //   INTERNAL_ERROR on IO error
177   GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);
178 
179   // Get Icing's current copy of the schema for the given database.
180   //
181   // NOTE: This is an expensive operation. It is recommended to call GetSchema()
182   // instead if you do not need to filter the schema by database, or if you're
183   // retrieving the only database in the schema.
184   //
185   // Returns:
186   //   SchemaProto on success
187   //   NOT_FOUND if a schema has not been set yet, or if the database is not
188   //     present in the schema
189   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet.
190   //   INTERNAL_ERROR on IO error
191   GetSchemaResultProto GetSchema(std::string_view database)
192       ICING_LOCKS_EXCLUDED(mutex_);
193 
194   // Get Icing's copy of the SchemaTypeConfigProto of name schema_type
195   //
196   // Returns:
197   //   SchemaTypeConfigProto on success
198   //   FAILED_PRECONDITION if a schema has not been set yet, or
199   //     IcingSearchEngine has not been initialized yet.
200   //   NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
201   //     SchemaProto
202   //   INTERNAL_ERROR on IO error
203   GetSchemaTypeResultProto GetSchemaType(std::string_view schema_type)
204       ICING_LOCKS_EXCLUDED(mutex_);
205 
206   // Puts the document into icing search engine so that it's stored and
207   // indexed. Documents are automatically written to disk, callers can also
208   // call PersistToDisk() to flush changes immediately.
209   //
210   // Returns:
211   //   OK on success
212   //   OUT_OF_SPACE if exceeds maximum number of allowed documents
213   //   FAILED_PRECONDITION if a schema has not been set yet, or
214   //     IcingSearchEngine has not been initialized yet.
215   //   NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
216   //     the document's schema
217   //   DATA_LOSS if an IO error occurs while merging document into the index and
218   //     the index is lost. These documents will still be retrievable via Get,
219   //     but won't match queries.
220   //   INTERNAL_ERROR on IO error
221   PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);
222 
223   // This function makes a copy of document and calls Put(DocumentProto&&
224   // document).
225   //
226   // NOTE: It's recommended to call Put(DocumentProto&& document) directly to
227   // avoid a copy if the caller can make an rvalue DocumentProto.
228   PutResultProto Put(const DocumentProto& document)
229       ICING_LOCKS_EXCLUDED(mutex_);
230 
231   // Finds and returns the document identified by the given key (namespace +
232   // uri)
233   //
234   // Returns:
235   //   The document found on success
236   //   NOT_FOUND if the key doesn't exist or doc has been deleted
237   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
238   //   INTERNAL_ERROR on IO error
239   GetResultProto Get(std::string_view name_space, std::string_view uri,
240                      const GetResultSpecProto& result_spec);
241 
242   // Reports usage. The corresponding usage scores of the specified document in
243   // the report will be updated.
244   //
245   // Returns:
246   //   OK on success
247   //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
248   //   INTERNAL_ERROR on I/O errors.
249   ReportUsageResultProto ReportUsage(const UsageReport& usage_report);
250 
251   // Returns all the namespaces that have at least one valid document in it.
252   //
253   // Returns:
254   //   All namespaces on success
255   GetAllNamespacesResultProto GetAllNamespaces();
256 
257   // Deletes the Document specified by the given namespace / uri pair from the
258   // search engine. Delete changes are automatically applied to disk, callers
259   // can also call PersistToDisk() to flush changes immediately.
260   //
261   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
262   // called.
263   //
264   // Returns:
265   //   OK on success
266   //   NOT_FOUND if no document exists with namespace, uri
267   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
268   //   INTERNAL_ERROR on IO error
269   DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
270       ICING_LOCKS_EXCLUDED(mutex_);
271 
272   // Deletes all Documents belonging to the specified namespace from the search
273   // engine. Delete changes are automatically applied to disk, callers can also
274   // call PersistToDisk() to flush changes immediately.
275   //
276   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
277   // called.
278   //
279   // Returns:
280   //   OK on success
281   //   NOT_FOUND if namespace doesn't exist
282   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
283   //   INTERNAL_ERROR on IO error
284   DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
285       ICING_LOCKS_EXCLUDED(mutex_);
286 
287   // Deletes all Documents belonging to the specified type from the search
288   // engine. Delete changes are automatically applied to disk, callers can also
289   // call PersistToDisk() to flush changes immediately.
290   //
291   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
292   // called.
293   //
294   // Returns:
295   //   OK on success
296   //   NOT_FOUND if schema type doesn't exist
297   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
298   //   INTERNAL_ERROR on IO error
299   DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
300       ICING_LOCKS_EXCLUDED(mutex_);
301 
302   // Deletes all Documents that match the query specified in search_spec. Delete
303   // changes are automatically applied to disk, callers can also call
304   // PersistToDisk() to flush changes immediately.
305   //
306   // NOTE: Space is not reclaimed for deleted documents until Optimize() is
307   // called.
308   //
309   // Returns:
310   //   OK on success
311   //   NOT_FOUND if the query doesn't match any documents
312   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
313   //   INTERNAL_ERROR on IO error
314   DeleteByQueryResultProto DeleteByQuery(
315       const SearchSpecProto& search_spec,
316       bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
317 
318   // Retrieves, scores, ranks, and returns the results according to the specs.
319   // Results can be empty. If there're multiple pages of results,
320   // SearchResultProto.next_page_token will be set to a non-zero token and can
321   // be used to fetch more pages via GetNextPage() method. Clients should call
322   // InvalidateNextPageToken() after they get the pages they need to release
323   // result cache in memory. Please refer to each proto file for spec
324   // definitions.
325   //
326   // Returns a SearchResultProto with status:
327   //   OK with results on success
328   //   INVALID_ARGUMENT if any of specs is invalid
329   //   ABORTED if failed to perform search but existing data is not affected
330   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
331   //   INTERNAL_ERROR on any other errors
332   SearchResultProto Search(const SearchSpecProto& search_spec,
333                            const ScoringSpecProto& scoring_spec,
334                            const ResultSpecProto& result_spec)
335       ICING_LOCKS_EXCLUDED(mutex_);
336 
337   // Retrieves, scores, ranks and returns the suggested query string according
338   // to the specs. Results can be empty.
339   //
340   // Returns a SuggestionResponse with status:
341   //   OK with results on success
342   //   INVALID_ARGUMENT if any of specs is invalid
343   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
344   //   INTERNAL_ERROR on any other errors
345   SuggestionResponse SearchSuggestions(
346       const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);
347 
348   // Fetches the next page of results of a previously executed query. Results
349   // can be empty if next-page token is invalid. Invalid next page tokens are
350   // tokens that are either zero or were previously passed to
351   // InvalidateNextPageToken. If there are pages of results remaining after the
352   // one retrieved by this call, SearchResultProto.next_page_token will be
353   // set to a non-zero token and can be used to fetch more pages via
354   // GetNextPage() method.
355   //
356   // Returns a SearchResultProto with status:
357   //   OK with results on success
358   //   ABORTED if failed to get results but existing data is not affected
359   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
360   //   INTERNAL_ERROR on any other errors
361   SearchResultProto GetNextPage(uint64_t next_page_token)
362       ICING_LOCKS_EXCLUDED(mutex_);
363 
364   // Invalidates the next-page token so that no more results of the related
365   // query can be returned.
366   void InvalidateNextPageToken(uint64_t next_page_token)
367       ICING_LOCKS_EXCLUDED(mutex_);
368 
369   // Gets or creates a file for write only purpose for the given blob handle.
370   // To mark the blob as completely written, CommitBlob() must be called. Once
371   // CommitBlob() is called, the blob is sealed and rewriting is not allowed.
372   //
373   // Returns:
374   //   File descriptor on success
375   //   InvalidArgumentError on invalid blob handle
376   //   FailedPreconditionError on blob is already opened for write
377   //   AlreadyExistsError on blob is committed
378   //   INTERNAL_ERROR on IO error
379   BlobProto OpenWriteBlob(const PropertyProto::BlobHandleProto& blob_handle);
380 
381   // Removes a blob file and blob handle from the blob store.
382   //
383   // This will remove the blob in any state, regardless of whether it is
384   // committed or whether it has reference document links.
385   //
386   // Returns:
387   //   InvalidArgumentError on invalid blob handle
388   //   NotFoundError on blob is not found
389   //   InternalError on IO error
390   BlobProto RemoveBlob(const PropertyProto::BlobHandleProto& blob_handle);
391 
392   // Gets or creates a file for read only purpose for the given blob handle.
393   // The blob must be committed by calling CommitBlob(); otherwise it is not
394   // accessible.
395   //
396   // Returns:
397   //   File descriptor on success
398   //   InvalidArgumentError on invalid blob handle
399   //   NotFoundError on blob is not found or is not committed
400   BlobProto OpenReadBlob(const PropertyProto::BlobHandleProto& blob_handle);
401 
402   // Commits the given blob, which is open for write via OpenWriteBlob().
403   // Before the blob is committed, it is not visible to any reader via
404   // OpenReadBlob(). After the blob is committed, rewriting or updating its
405   // content is not allowed.
406   //
407   // Returns:
408   //   True if the blob is successfully committed.
409   //   False if the blob is already committed.
410   //   InvalidArgumentError on invalid blob handle or digest mismatch with file
411   //     content. NotFoundError if the blob is not found.
412   BlobProto CommitBlob(const PropertyProto::BlobHandleProto& blob_handle);
413 
414   // Makes sure that every update/delete received till this point is flushed
415   // to disk. If the app crashes after a call to PersistToDisk(), Icing
416   // would be able to fully recover all data written up to this point.
417   //
418   // If persist_type is PersistType::LITE, then only the ground truth will be
419   // synced. This should be relatively lightweight to do (order of microseconds)
420   // and ensures that there will be no data loss. At worst, Icing may need to
421   // recover internal data structures by replaying the document log upon the
422   // next startup. Clients should call PersistToDisk(LITE) after each batch of
423   // mutations.
424   //
425   // If persist_type is PersistType::FULL, then all internal data structures in
426   // Icing will be synced. This is a heavier operation (order of milliseconds).
427   // It ensures that Icing will not need to recover internal data structures
428   // upon the next startup. Clients should call PersistToDisk(FULL) before their
429   // process dies.
430   //
431   // NOTE: It is not necessary to call PersistToDisk() to read back data
432   // that was recently written. All read APIs will include the most recent
433   // updates/deletes regardless of the data being flushed to disk.
434   //
435   // Returns:
436   //   OK on success
437   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
438   //   INTERNAL on I/O error
439   PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type)
440       ICING_LOCKS_EXCLUDED(mutex_);
441 
442   // Allows Icing to run tasks that are too expensive and/or unnecessary to be
443   // executed in real-time, but are useful to keep it fast and be
444   // resource-efficient. This method purely optimizes the internal files and
445   // has no functional impact on what gets accepted/returned.
446   //
447   // WARNING: This method is CPU and IO intensive and depending on the
448   // contents stored, it can take from a few seconds to a few minutes.
449   // This call also blocks all read/write operations on Icing.
450   //
451   // SUGGESTION: Assuming the client has no restrictions on their side, it's
452   // recommended to call this method about once every 24 hours when the
453   // device is idle and charging. It can also be called when the system needs
454   // to free up extra disk-space.
455   //
456   // Returns:
457   //   OK on success
458   //   ABORTED_ERROR if optimization is aborted due to non-fatal errors before
459   //                 actual modifications are made.
460   //   DATA_LOSS_ERROR on errors that could potentially cause data loss,
461   //                   IcingSearchEngine is still functioning.
462   //   INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
463   //                  use of Icing is undefined.
464   //                  Clients could clear and reinitialize IcingSearchEngine.
465   //   FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
466   OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);
467 
468   // Returns potential size and document savings if Optimize were called.
469   //
470   // Returns:
471   //   OK on success
472   //   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
473   //   INTERNAL_ERROR on IO error
474   GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);
475 
476   // Calculates the StorageInfo for Icing.
477   //
478   // If an IO error occurs while trying to calculate the value for a field, then
479   // that field will be set to -1.
480   StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_);
481 
482   // Get debug information for Icing.
483   DebugInfoResultProto GetDebugInfo(DebugInfoVerbosity::Code verbosity)
484       ICING_LOCKS_EXCLUDED(mutex_);
485 
486   // Clears all data from Icing and re-initializes. Clients DO NOT need to call
487   // Initialize again.
488   //
489   // Returns:
490   //   OK on success
491   //   ABORTED_ERROR if failed to delete underlying files
492   //   INTERNAL_ERROR if internal state is no longer consistent
493   ResetResultProto Reset() ICING_LOCKS_EXCLUDED(mutex_);
494 
495   // Disallow copy and move.
496   IcingSearchEngine(const IcingSearchEngine&) = delete;
497   IcingSearchEngine& operator=(const IcingSearchEngine&) = delete;
498 
499  protected:
500   IcingSearchEngine(IcingSearchEngineOptions options,
501                     std::unique_ptr<const Filesystem> filesystem,
502                     std::unique_ptr<const IcingFilesystem> icing_filesystem,
503                     std::unique_ptr<Clock> clock,
504                     std::unique_ptr<const JniCache> jni_cache = nullptr);
505 
506  private:
507   const IcingSearchEngineOptions options_;
508   const FeatureFlags feature_flags_;
509   const std::unique_ptr<const Filesystem> filesystem_;
510   const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
511   bool initialized_ ICING_GUARDED_BY(mutex_) = false;
512 
513   // Abstraction for accessing time values.
514   const std::unique_ptr<const Clock> clock_;
515 
516   // Provides key thresholds that affects the running time and memory of major
517   // components in Icing search engine.
518   const PerformanceConfiguration performance_configuration_;
519 
520   // Used to provide reader and writer locks
521   absl_ports::shared_mutex mutex_;
522 
523   // Stores and processes the schema
524   std::unique_ptr<SchemaStore> schema_store_ ICING_GUARDED_BY(mutex_);
525 
526   // Used to store all valid documents
527   //
528   // Dependencies: schema_store_
529   std::unique_ptr<DocumentStore> document_store_ ICING_GUARDED_BY(mutex_);
530 
531   // Used to manage pagination state of query results. Even though
532   // ResultStateManager has its own reader-writer lock, mutex_ must still be
533   // acquired first in order to adhere to the global lock ordering:
534   //   1. mutex_
535   //   2. result_state_manager_.lock_
536   //
537   // Dependencies: document_store_
538   std::unique_ptr<ResultStateManager> result_state_manager_
539       ICING_GUARDED_BY(mutex_);
540 
541   // Used to store all valid blob data
542   std::unique_ptr<BlobStore> blob_store_ ICING_GUARDED_BY(mutex_);
543 
544   std::unique_ptr<const LanguageSegmenter> language_segmenter_
545       ICING_GUARDED_BY(mutex_);
546 
547   std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);
548 
549   // Storage for all hits of string contents from the document store.
550   std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);
551 
552   // Storage for all hits of numeric contents from the document store.
553   std::unique_ptr<NumericIndex<int64_t>> integer_index_
554       ICING_GUARDED_BY(mutex_);
555 
556   // Storage for all join qualified ids from the document store.
557   std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_
558       ICING_GUARDED_BY(mutex_);
559 
560   // Storage for all hits of embedding contents from the document store.
561   std::unique_ptr<EmbeddingIndex> embedding_index_ ICING_GUARDED_BY(mutex_);
562 
563   // Pointer to JNI class references
564   const std::unique_ptr<const JniCache> jni_cache_;
565 
566   // Resets all members that are created during Initialize.
567   void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
568 
569   // Resets all members that are created during Initialize, deletes all
570   // underlying files and initializes a fresh index.
571   ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
572 
573   // Checks for the existence of the init marker file. If the failed init count
574   // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
575   // initialized from scratch. The updated count (original failed init count + 1
576   // ) is written to the marker file.
577   //
578   // RETURNS
579   //   OK on success
580   //   INTERNAL if an IO error occurs while trying to update the marker file.
581   libtextclassifier3::Status CheckInitMarkerFile(
582       InitializeStatsProto* initialize_stats)
583       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
584 
585   // Helper method to do the actual work to persist data to disk. We need this
586   // separate method so that other public methods don't need to call
587   // PersistToDisk(). Public methods calling each other may cause deadlock
588   // issues.
589   libtextclassifier3::Status InternalPersistToDisk(
590       PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
591 
592   // Helper method to do the actual work to Initialize. We need this separate
593   // method so that other public methods don't need to call Initialize(). Public
594   // methods calling each other may cause deadlock issues.
595   InitializeResultProto InternalInitialize()
596       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
597 
598   // Helper method to initialize member variables.
599   //
600   // Returns:
601   //   OK on success
602   //   FAILED_PRECONDITION if initialize_stats is null
603   //   RESOURCE_EXHAUSTED if the index runs out of storage
604   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
605   //   INTERNAL on any I/O errors
606   libtextclassifier3::Status InitializeMembers(
607       InitializeStatsProto* initialize_stats)
608       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
609 
610   // Do any initialization/recovery necessary to create a SchemaStore instance.
611   //
612   // Returns:
613   //   OK on success
614   //   FAILED_PRECONDITION if initialize_stats is null
615   //   INTERNAL on I/O error
616   libtextclassifier3::Status InitializeSchemaStore(
617       InitializeStatsProto* initialize_stats)
618       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
619 
620   // Do any initialization/recovery necessary to create a DocumentStore
621   // instance.
622   //
623   // See comments on DocumentStore::Create for explanation of
624   // force_recovery_and_revalidate_documents.
625   //
626   // Returns:
627   //   On success, a boolean flag indicating whether derived files of the
628   //     document store have been regenerated or not. If true, any other
629   //     components depending on them should also be rebuilt.
630   //   FAILED_PRECONDITION if initialize_stats is null
631   //   INTERNAL on I/O error
632   libtextclassifier3::StatusOr<bool> InitializeDocumentStore(
633       bool force_recovery_and_revalidate_documents,
634       InitializeStatsProto* initialize_stats)
635       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
636 
637   // Do any initialization necessary to create a BlobStore instance.
638   //
639   // Returns:
640   //   OK on success
641   //   FAILED_PRECONDITION if initialize_stats is null (TODO: no such param)
642   libtextclassifier3::Status InitializeBlobStore(
643       int32_t orphan_blob_time_to_live_ms, int32_t compression_level)
644       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
645 
646   // Do any initialization/recovery necessary to create term index, integer
647   // index, and qualified id join index instances.
648   //
649   // If document_store_derived_files_regenerated is true, then we have to
650   // rebuild qualified id join index since NamespaceIds were reassigned.
651   //
652   // Returns:
653   //   OK on success
654   //   FAILED_PRECONDITION if initialize_stats is null
655   //   RESOURCE_EXHAUSTED if the index runs out of storage
656   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
657   //   INTERNAL on I/O error
658   libtextclassifier3::Status InitializeIndex(
659       bool document_store_derived_files_regenerated,
660       InitializeStatsProto* initialize_stats)
661       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
662 
663   // Variant of IcingSearchEngine::Search that takes only the overall
664   // read-lock, so other non-exclusive operations may run in parallel.
665   // Used when search_spec.use_read_only_search is true.
      //
      // Annotated ICING_LOCKS_EXCLUDED(mutex_): the caller must not already hold
      // mutex_ when calling this.
666   SearchResultProto SearchLockedShared(const SearchSpecProto& search_spec,
667                                        const ScoringSpecProto& scoring_spec,
668                                        const ResultSpecProto& result_spec)
669       ICING_LOCKS_EXCLUDED(mutex_);
670 
671   // Variant of IcingSearchEngine::Search that requires the overall write
672   // lock. While this version is used, no other operations of any kind can be
673   // executed in parallel.
674   // Used when search_spec.use_read_only_search is false.
      //
      // Annotated ICING_LOCKS_EXCLUDED(mutex_): the caller must not already hold
      // mutex_ when calling this.
675   SearchResultProto SearchLockedExclusive(const SearchSpecProto& search_spec,
676                                           const ScoringSpecProto& scoring_spec,
677                                           const ResultSpecProto& result_spec)
678       ICING_LOCKS_EXCLUDED(mutex_);
679 
680   // Helper that performs the actual work of Search. It is a separate method
681   // so that locking for Search can be managed outside of it.
      //
      // Annotated ICING_SHARED_LOCKS_REQUIRED(mutex_): the caller must already
      // hold mutex_, at least in shared (reader) mode.
682   SearchResultProto InternalSearch(const SearchSpecProto& search_spec,
683                                    const ScoringSpecProto& scoring_spec,
684                                    const ResultSpecProto& result_spec)
685       ICING_SHARED_LOCKS_REQUIRED(mutex_);
686 
687   // Processes the query and scores the results according to the given specs.
688   // It is a helper (called by Search) used for both the normal query and the
689   // nested child query of a join search.
690   //
691   // Returns a QueryScoringResults whose status is:
692   //   OK on success; query_terms then holds the SectionRestrictQueryTermsMap
693   //      and scored_document_hits holds the vector of ScoredDocumentHits.
694   //   Any other errors when processing the query or scoring
      //
      // NOTE(review): an earlier version of this comment also promised "other
      // stats fields for logging" in the result; the struct has no such fields,
      // so stats are presumably reported through the search_stats out-parameter.
      // Confirm against the definition.
      //
      // The caller must hold mutex_ at least in shared mode, per the annotation
      // on ProcessQueryAndScore.
695   struct QueryScoringResults {
696     libtextclassifier3::Status status;
697     SectionRestrictQueryTermsMap query_terms;
698     std::vector<ScoredDocumentHit> scored_document_hits;
699 
        // The two containers are taken by rvalue reference and moved into the
        // members, so constructing a result does not copy them.
700     explicit QueryScoringResults(
701         libtextclassifier3::Status status_in,
702         SectionRestrictQueryTermsMap&& query_terms_in,
703         std::vector<ScoredDocumentHit>&& scored_document_hits_in)
704         : status(std::move(status_in)),
705           query_terms(std::move(query_terms_in)),
706           scored_document_hits(std::move(scored_document_hits_in)) {}
707   };
708   QueryScoringResults ProcessQueryAndScore(
709       const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
710       const ResultSpecProto& result_spec,
711       const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
712       QueryStatsProto::SearchStats* search_stats)
713       ICING_SHARED_LOCKS_REQUIRED(mutex_);
714 
715   // Deletes the documents to which deletion propagates from the given deleted
716   // document ids, via joinable properties with delete propagation enabled.
717   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
718   // Returns:
719   //   Number of propagated documents deleted on success
720   //   INTERNAL_ERROR on any I/O errors
721   libtextclassifier3::StatusOr<int> PropagateDelete(
722       const std::unordered_set<DocumentId>& deleted_document_ids,
723       int64_t current_time_ms) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
724 
725   // Discards the derived data that needs a rebuild according to rebuild_result.
726   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
727   // Returns:
728   //   OK on success
729   //   FAILED_PRECONDITION_ERROR if those instances are valid (non nullptr)
      //     NOTE(review): "those instances" is not defined in this comment;
      //     presumably the in-memory components backed by the derived files.
      //     Confirm against the implementation.
730   //   INTERNAL_ERROR on any I/O errors
731   libtextclassifier3::Status DiscardDerivedFiles(
732       const version_util::DerivedFilesRebuildResult& rebuild_result)
733       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
734 
735   // Repopulates derived data from our ground truths.
736   //
      // Both arguments may be omitted: initialize_stats defaults to nullptr and
      // log_document_store_stats defaults to false.
      // NOTE(review): presumably stats are only recorded when initialize_stats
      // is non-null; confirm against the implementation.
      //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
737   // Returns:
738   //   OK on success
739   //   INTERNAL_ERROR on any IO errors
740   libtextclassifier3::Status RegenerateDerivedFiles(
741       InitializeStatsProto* initialize_stats = nullptr,
742       bool log_document_store_stats = false)
743       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
744 
745   // Optimizes the DocumentStore by removing any unneeded documents (e.g.
746   // deleted, expired, etc.) from the filesystem storage.
747   //
748   // NOTE: This may leave the DocumentStore in an invalid/uncreated state. Users
749   // would then need to call Initialize() to reinitialize everything into a
      // valid state.
750   //
      // mature_blob_handles is taken by rvalue reference, so the caller hands the
      // set over to this function.
      // NOTE(review): how mature_blob_handles is used, and whether optimize_stats
      // may be null, is not visible from this declaration; confirm.
      //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
751   // Returns:
752   //   On success, OptimizeResult which contains a vector mapping from old
753   //   document id to new document id and another vector mapping from old
754   //   namespace id to new namespace id. A value of kInvalidDocumentId indicates
755   //   that the old document id has been deleted.
756   //   ABORTED_ERROR if any error happens before the actual optimization, the
757   //                 original document store should be still available
758   //   DATA_LOSS_ERROR on errors that could potentially cause data loss,
759   //                   document store is still available
760   //   INTERNAL_ERROR on any IO errors or other errors that we can't recover
761   //                  from
762   libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
763   OptimizeDocumentStore(std::unordered_set<std::string>&& mature_blob_handles,
764                         OptimizeStatsProto* optimize_stats)
765       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
766 
767   // Helper method to restore missing document data in index_, integer_index_,
768   // and qualified_id_join_index_. All documents will be reindexed. This does
769   // not clear the index, so it is recommended to call ClearAllIndices,
770   // ClearSearchIndices, or ClearJoinIndices first if needed.
      //
      // NOTE(review): the result struct also carries
      // embedding_index_needed_restoration, so the embedding index is presumably
      // restored here as well; confirm against the implementation.
      //
      // The caller must hold mutex_ exclusively, per the annotation below.
771   //
772   // Returns an IndexRestorationResult whose status is:
773   //   OK on success; the four needed-restoration flags then indicate, per
774   //     index, whether or not restoration was needed.
775   //   DATA_LOSS, if an error during index merging caused us to lose indexed
776   //     data in the main index. Despite the data loss, this is still considered
777   //     a successful run and the needed-restoration flag will be set to true.
778   //   RESOURCE_EXHAUSTED if the index fills up before finishing indexing
779   //   NOT_FOUND if some Document's schema type is not in the SchemaStore
780   //   INTERNAL_ERROR on any IO errors
781   struct IndexRestorationResult {
782     libtextclassifier3::Status status;
783     bool index_needed_restoration;
784     bool integer_index_needed_restoration;
785     bool qualified_id_join_index_needed_restoration;
786     bool embedding_index_needed_restoration;
787   };
788   IndexRestorationResult RestoreIndexIfNeeded()
789       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
790 
791   // Determines whether a schema existed and was unintentionally lost. After a
792   // previous failure, a lost schema can "look" the same as never having set
793   // one: in both cases there is no schema proto file. This helper therefore
794   // does some extra checks to tell the two situations apart; the answer may
795   // determine whether extra recovery steps are needed.
796   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
797   // Returns:
798   //   bool indicating if we had a schema and unintentionally lost it
799   //   INTERNAL_ERROR on I/O error
800   libtextclassifier3::StatusOr<bool> LostPreviousSchema()
801       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
802 
803   // Helper method that creates all types of data indexing handlers, which
804   // index terms, integers, and join qualified ids.
      //
      // On success the StatusOr holds the vector of owned DataIndexingHandler
      // instances; otherwise it holds the error status.
      // The caller must hold mutex_ exclusively, per the annotation below.
805   libtextclassifier3::StatusOr<
806       std::vector<std::unique_ptr<DataIndexingHandler>>>
807   CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
808 
809   // Helper method to discard parts of (term, integer, qualified id join)
810   // indices if they contain data for document ids greater than
811   // last_stored_document_id.
812   //
813   // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note:
814   //   if we want to truncate everything in the index, then please call
815   //   ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead.
816   //
      // The caller must hold mutex_ exclusively, per the annotation on
      // TruncateIndicesTo.
      //
817   // Returns:
818   //   On success, a TruncateIndexResult holding the DocumentId of the first
819   //     document to reindex and four bool flags indicating whether the term,
820   //     integer, qualified id join, or embedding index needs restoration.
821   //   INTERNAL on any I/O errors
822   struct TruncateIndexResult {
823     DocumentId first_document_to_reindex;
824     bool index_needed_restoration;
825     bool integer_index_needed_restoration;
826     bool qualified_id_join_index_needed_restoration;
827     bool embedding_index_needed_restoration;
828 
        // Initializes every member from the identically named *_in argument.
829     explicit TruncateIndexResult(
830         DocumentId first_document_to_reindex_in,
831         bool index_needed_restoration_in,
832         bool integer_index_needed_restoration_in,
833         bool qualified_id_join_index_needed_restoration_in,
834         bool embedding_index_needed_restoration_in)
835         : first_document_to_reindex(first_document_to_reindex_in),
836           index_needed_restoration(index_needed_restoration_in),
837           integer_index_needed_restoration(integer_index_needed_restoration_in),
838           qualified_id_join_index_needed_restoration(
839               qualified_id_join_index_needed_restoration_in),
840           embedding_index_needed_restoration(
841               embedding_index_needed_restoration_in) {}
842   };
843   libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo(
844       DocumentId last_stored_document_id)
845       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
846 
847   // Helper method that discards the search (term, integer) indices.
848   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
849   // Returns:
850   //   OK on success
851   //   INTERNAL_ERROR on any I/O errors
852   libtextclassifier3::Status ClearSearchIndices()
853       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
854 
855   // Helper method that discards the join (qualified id) indices.
856   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
857   // Returns:
858   //   OK on success
859   //   INTERNAL_ERROR on any I/O errors
860   libtextclassifier3::Status ClearJoinIndices()
861       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
862 
863   // Helper method that discards all search and join indices.
864   //
      // The caller must hold mutex_ exclusively, per the annotation below.
      //
865   // Returns:
866   //   OK on success
867   //   INTERNAL_ERROR on any I/O errors
868   libtextclassifier3::Status ClearAllIndices()
869       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
870 };
871 
872 }  // namespace lib
873 }  // namespace icing
874 
875 #endif  // ICING_ICING_SEARCH_ENGINE_H_
876