xref: /aosp_15_r20/external/icing/icing/store/document-store.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_DOCUMENT_STORE_H_
16 #define ICING_STORE_DOCUMENT_STORE_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/feature-flags.h"
29 #include "icing/file/file-backed-vector.h"
30 #include "icing/file/filesystem.h"
31 #include "icing/file/memory-mapped-file-backed-proto-log.h"
32 #include "icing/file/portable-file-backed-proto-log.h"
33 #include "icing/proto/debug.pb.h"
34 #include "icing/proto/document.pb.h"
35 #include "icing/proto/document_wrapper.pb.h"
36 #include "icing/proto/internal/scorable_property_set.pb.h"
37 #include "icing/proto/logging.pb.h"
38 #include "icing/proto/optimize.pb.h"
39 #include "icing/proto/persist.pb.h"
40 #include "icing/proto/search.pb.h"
41 #include "icing/proto/storage.pb.h"
42 #include "icing/proto/usage.pb.h"
43 #include "icing/schema/schema-store.h"
44 #include "icing/store/corpus-associated-scoring-data.h"
45 #include "icing/store/corpus-id.h"
46 #include "icing/store/document-associated-score-data.h"
47 #include "icing/store/document-filter-data.h"
48 #include "icing/store/document-id.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/store/namespace-id-fingerprint.h"
51 #include "icing/store/namespace-id.h"
52 #include "icing/store/usage-store.h"
53 #include "icing/tokenization/language-segmenter.h"
54 #include "icing/util/clock.h"
55 #include "icing/util/crc32.h"
56 #include "icing/util/data-loss.h"
57 #include "icing/util/document-validator.h"
58 #include "icing/util/fingerprint-util.h"
59 #include "icing/util/scorable_property_set.h"
60 
61 namespace icing {
62 namespace lib {
63 
64 // Provides storage interfaces for documents.
65 class DocumentStore {
66  public:
67   struct Header {
68     // Previously used magic numbers, please avoid reusing those:
69     // [0x1b99c8b0, 0x3e005b5e]
70     static constexpr int32_t kMagic = 0x8a32cd1f;
71 
72     // Holds the magic as a quick sanity check against file corruption.
73     int32_t magic;
74 
75     // Checksum of the DocumentStore's sub-component's checksums.
76     uint32_t checksum;
77   };
78 
79   struct OptimizeInfo {
80     // The estimated size in bytes of the optimizable docs. We don't track the
81     // size of each document, so we estimate by taking the size of the entire
82     // DocumentStore and dividing that by the total number of documents we have.
83     // So we end up with an average document size.
84     int64_t estimated_optimizable_bytes = 0;
85 
86     // Number of total documents the DocumentStore tracks.
87     int32_t total_docs = 0;
88 
89     // Number of optimizable (deleted + expired) docs the DocumentStore tracks.
90     int32_t optimizable_docs = 0;
91   };
92 
93   struct DeleteByGroupResult {
94     // Status representing whether or not the operation succeeded. See the
95     // comments above the function that returns this result to determine what
96     // possible statuses could be returned.
97     libtextclassifier3::Status status;
98 
99     int num_docs_deleted = 0;
100   };
101 
102   struct CreateResult {
103     // A successfully initialized document store.
104     std::unique_ptr<DocumentStore> document_store;
105 
106     // The data status after initializing from a previous state. Data loss can
107     // happen if the file is corrupted or some previously added data was
108     // unpersisted. This may be used to signal that any derived data off of the
109     // document store may need to be regenerated.
110     DataLoss data_loss;
111 
112     // A boolean flag indicating if derived files of the document store have
113     // been regenerated or not. This is usually a signal for callers to detect
114     // if any id assignment has changed (e.g. NamespaceId).
115     bool derived_files_regenerated;
116   };
117 
118   // Not copyable
119   DocumentStore(const DocumentStore&) = delete;
120   DocumentStore& operator=(const DocumentStore&) = delete;
121 
122   // Persists and updates checksum of subcomponents.
123   ~DocumentStore();
124 
125   // Factory method to create, initialize, and return a DocumentStore. The base
126   // directory is used to persist document store files. If document store was
127   // previously initialized with this directory, it will reload the files saved
128   // by the last instance.
129   //
130   // force_recovery_and_revalidate_documents=true will pre-emptively throw out
131   // the derived files and validate each document while recreating them. This
132   // can be used to indicate that the schema (and type ids) may have changed and
133   // those changes might not have been applied to the document store.
134   //
135   // If initialize_stats is present, the fields related to DocumentStore will be
136   // populated.
137   //
138   // Does not take any ownership, and all pointers except initialize_stats must
139   // refer to valid objects that outlive the one constructed.
140   //
141   // TODO(cassiewang): Consider returning a status indicating that derived files
142   // were regenerated. This may be helpful in logs.
143   //
144   // Returns:
145   //   A DocumentStore::CreateResult on success
146   //   FAILED_PRECONDITION on any null pointer input
147   //   INTERNAL_ERROR on IO error
148   static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
149       const Filesystem* filesystem, const std::string& base_dir,
150       const Clock* clock, const SchemaStore* schema_store,
151       const FeatureFlags* feature_flags,
152       bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv,
153       bool use_persistent_hash_map, int32_t compression_level,
154       InitializeStatsProto* initialize_stats);
155 
156   // Discards all derived data in the document store.
157   //
158   // Returns:
159   //   OK on success or nothing to discard
160   //   INTERNAL_ERROR on any I/O errors
161   static libtextclassifier3::Status DiscardDerivedFiles(
162       const Filesystem* filesystem, const std::string& base_dir);
163 
164   // Returns the maximum DocumentId that the DocumentStore has assigned. If
165   // there has not been any DocumentIds assigned, i.e. the DocumentStore is
166   // empty, then kInvalidDocumentId is returned. This does not filter out
167   // DocumentIds of deleted or expired documents.
last_added_document_id()168   DocumentId last_added_document_id() const {
169     if (document_id_mapper_->num_elements() == 0) {
170       return kInvalidDocumentId;
171     }
172     return document_id_mapper_->num_elements() - 1;
173   }
174 
175   // Returns the number of documents. The result does not filter out DocumentIds
176   // of deleted or expired documents.
num_documents()177   int num_documents() const { return document_id_mapper_->num_elements(); }
178 
179   // Puts the document into document store.
180   //
181   // If put_document_stats is present, the fields related to DocumentStore will
182   // be populated.
183   //
184   //  Returns:
185   //   - On success, a PutResult with the DocumentId of the newly added document
186   //     and the old DocumentId before replacement. If this is a new document,
187   //     then old DocumentId will be kInvalidDocumentId.
188   //   - RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents
189   //   - FAILED_PRECONDITION if schema hasn't been set yet
190   //   - NOT_FOUND if the schema_type or a property config of the document
191   //     doesn't exist in schema
192   //   - INTERNAL_ERROR on IO error
193   struct PutResult {
194     DocumentId old_document_id = kInvalidDocumentId;
195     DocumentId new_document_id = kInvalidDocumentId;
196 
was_replacementPutResult197     bool was_replacement() const {
198       return old_document_id != kInvalidDocumentId;
199     }
200   };
201   libtextclassifier3::StatusOr<PutResult> Put(
202       const DocumentProto& document, int32_t num_tokens = 0,
203       PutDocumentStatsProto* put_document_stats = nullptr);
204   libtextclassifier3::StatusOr<PutResult> Put(
205       DocumentProto&& document, int32_t num_tokens = 0,
206       PutDocumentStatsProto* put_document_stats = nullptr);
207 
208   // Finds and returns the document identified by the given key (namespace +
209   // uri). If 'clear_internal_fields' is true, document level data that's
210   // generated internally by DocumentStore is cleared.
211   //
212   // Returns:
213   //   The document found on success
214   //   NOT_FOUND if the key doesn't exist or document has been deleted
215   //   INTERNAL_ERROR on IO error
216   libtextclassifier3::StatusOr<DocumentProto> Get(
217       std::string_view name_space, std::string_view uri,
218       bool clear_internal_fields = true) const;
219 
220   // Finds and returns the document identified by the given document id. If
221   // 'clear_internal_fields' is true, document level data that's generated
222   // internally by DocumentStore is cleared.
223   //
224   // Returns:
225   //   The document found on success
226   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
227   //                    maximum value
228   //   NOT_FOUND if the document doesn't exist or has been deleted
229   //   INTERNAL_ERROR on IO error
230   libtextclassifier3::StatusOr<DocumentProto> Get(
231       DocumentId document_id, bool clear_internal_fields = true) const;
232 
233   // Returns the ScorablePropertySet of the document specified by the
234   // DocumentId.
235   //
236   // Returns:
237   //   - ScorablePropertySet on success
238   //   - nullptr when the ScorablePropertySet fails to be created, it could be
239   //     due to that:
240   //     - |document_id| is invalid, or
241   //     - no ScorablePropertySetProto is found for the document in the cache
242   //     - internal IO error
243   std::unique_ptr<ScorablePropertySet> GetScorablePropertySet(
244       DocumentId document_id, int64_t current_time_ms) const;
245 
246   // Returns all namespaces which have at least 1 active document (not deleted
247   // or expired). Order of namespaces is undefined.
248   std::vector<std::string> GetAllNamespaces() const;
249 
250   // Deletes the document identified by the given namespace and uri. The
251   // document proto will be erased immediately.
252   //
253   // NOTE:
254   //    Space is not reclaimed for deleted documents until Optimize() is
255   //    called.
256   //
257   // Returns:
258   //   OK on success
259   //   NOT_FOUND if no document exists with namespace, uri
260   //   INTERNAL_ERROR on IO error
261   libtextclassifier3::Status Delete(std::string_view name_space,
262                                     std::string_view uri,
263                                     int64_t current_time_ms);
264 
265   // Deletes the document identified by the given document_id. The document
266   // proto will be erased immediately.
267   //
268   // NOTE:
269   //    Space is not reclaimed for deleted documents until Optimize() is
270   //    called.
271   //
272   // Returns:
273   //   OK on success
274   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
275   //   INTERNAL_ERROR on IO error
276   //   INVALID_ARGUMENT if document_id is invalid.
277   libtextclassifier3::Status Delete(DocumentId document_id,
278                                     int64_t current_time_ms);
279 
280   // Returns the NamespaceId of the string namespace
281   //
282   // Returns:
283   //   NamespaceId on success
284   //   NOT_FOUND if the namespace doesn't exist
285   //   INTERNAL_ERROR on IO error
286   libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
287       std::string_view name_space) const;
288 
289   // Helper method to find a DocumentId that is associated with the given
290   // namespace and uri.
291   //
  // NOTE: The DocumentId may refer to an invalid document (deleted
293   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
294   // refers to a valid Document.
295   //
296   // Returns:
297   //   A DocumentId on success
298   //   NOT_FOUND if the key doesn't exist
299   //   INTERNAL_ERROR on IO error
300   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
301       std::string_view name_space, std::string_view uri) const;
302 
303   // Helper method to find a DocumentId that is associated with the given
304   // NamespaceIdFingerprint.
305   //
  // NOTE: The DocumentId may refer to an invalid document (deleted
307   // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
308   // refers to a valid Document.
309   //
310   // Returns:
311   //   A DocumentId on success
312   //   NOT_FOUND if the key doesn't exist
313   //   INTERNAL_ERROR on IO error
314   libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
315       const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const;
316 
317   // Returns the CorpusId associated with the given namespace and schema.
318   //
319   // Returns:
320   //   A CorpusId on success
321   //   NOT_FOUND if the key doesn't exist
322   //   INTERNAL_ERROR on IO error
323   libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
324       const std::string_view name_space, const std::string_view schema) const;
325 
326   // Returns the ResultGroupingEntryId associated with the given namespace
327   // and schema.
328   //
329   // NOTE: ResultGroupingEntryIds that are generated by calls with different
330   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
  // are only guaranteed to be unique within their own ResultGroupingType.
332   //
333   // Returns:
334   //   A ResultGroupingEntryId on success
335   //   NOT_FOUND if the key doesn't exist
336   //   INTERNAL_ERROR on IO error
337   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
338       ResultSpecProto::ResultGroupingType result_group_type,
339       const std::string_view name_space, const std::string_view schema) const;
340 
341   // Returns the ResultGrouping Entry Id associated with the given NamespaceId
342   // and SchemaTypeId
343   //
344   // NOTE: ResultGroupingEntryIds that are generated by calls with different
345   // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
  // are only guaranteed to be unique within their own ResultGroupingType.
347   //
348   // Returns:
349   //   A ResultGroupingEntryId on success
350   //   NOT_FOUND if the key doesn't exist
351   //   INTERNAL_ERROR on IO error
352   libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
353       ResultSpecProto::ResultGroupingType result_group_type,
354       const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const;
355 
356   // Returns the DocumentAssociatedScoreData of the document specified by the
357   // DocumentId.
358   //
359   // Returns:
360   //   DocumentAssociatedScoreData on success
361   //   NOT_FOUND if the document or the score data is not found
362   libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
363   GetDocumentAssociatedScoreData(DocumentId document_id) const;
364 
365   // Returns the CorpusAssociatedScoreData of the corpus specified by the
366   // corpus_id.
367   //
368   // NOTE: This does not check if the corpus exists and will return the
369   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
370   // that corpus have been deleted.
371   //
372   // Returns:
373   //   CorpusAssociatedScoreData on success
374   //   OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
375   //                CorpusIds
376   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
377   GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
378 
379   // Gets the document filter data if a document exists and is not expired.
380   // Otherwise, will get a false optional.
381   //
382   // Existence means it hasn't been deleted and it hasn't expired yet.
383   //
384   // Returns:
385   //   True:DocumentFilterData  if the given document exists.
386   //   False                    if the given document doesn't exist.
387   std::optional<DocumentFilterData> GetAliveDocumentFilterData(
388       DocumentId document_id, int64_t current_time_ms) const;
389 
390   // Gets the document filter data if a document has not been deleted. If the
391   // document is expired but not deleted, will still return a valid document
392   // filter data. Otherwise, will get a false optional.
393   //
394   // Returns:
395   //   True:DocumentFilterData  if the given document exists.
396   //   False                    if the given document has been deleted.
397   std::optional<DocumentFilterData> GetNonDeletedDocumentFilterData(
398       DocumentId document_id) const;
399 
400   // Gets the SchemaTypeId of a document.
401   //
402   // Returns:
403   //   SchemaTypeId on success
404   //   kInvalidSchemaTypeId if the document is deleted or expired.
GetSchemaTypeId(DocumentId document_id,int64_t current_time_ms)405   SchemaTypeId GetSchemaTypeId(DocumentId document_id,
406                                int64_t current_time_ms) const {
407     std::optional<DocumentFilterData> document_filter_data_optional =
408         GetAliveDocumentFilterData(document_id, current_time_ms);
409     if (document_filter_data_optional) {
410       return document_filter_data_optional.value().schema_type_id();
411     } else {
412       return kInvalidSchemaTypeId;
413     }
414   }
415 
416   // Gets the usage scores of a document.
417   //
418   // Returns:
419   //   UsageScores on success
420   //   nullopt if there are no usage scores stored for the requested docid.
421   std::optional<UsageStore::UsageScores> GetUsageScores(
422       DocumentId document_id, int64_t current_time_ms) const;
423 
424   // Reports usage. The corresponding usage scores of the specified document in
425   // the report will be updated.
426   //
427   // Returns:
428   //   OK on success
  //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
430   //   INTERNAL_ERROR on I/O errors.
431   libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
432 
433   // Deletes all documents belonging to the given namespace. The documents will
434   // be erased immediately.
435   //
436   // NOTE:
437   //    Space is not reclaimed for deleted documents until Optimize() is
438   //    called.
439   //
440   // Returns:
441   //   OK on success
442   //   NOT_FOUND if namespace doesn't exist
443   //   INTERNAL_ERROR on IO error
444   DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
445 
446   // Deletes all documents belonging to the given schema type. The documents
447   // will be erased immediately.
448   //
449   // NOTE:
450   //    Space is not reclaimed for deleted documents until Optimize() is
451   //    called.
452   //
453   // Returns:
454   //   OK on success
455   //   NOT_FOUND if schema_type doesn't exist
456   //   INTERNAL_ERROR on IO error
457   DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
458 
459   // Syncs all the data and metadata changes to disk.
460   //
461   // Returns:
462   //   OK on success
463   //   INTERNAL on I/O error
464   libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
465 
466   // Calculates the StorageInfo for the Document Store.
467   //
468   // If an IO error occurs while trying to calculate the value for a field, then
469   // that field will be set to -1.
470   DocumentStorageInfoProto GetStorageInfo() const;
471 
472   // Update any derived data off of the SchemaStore with the new SchemaStore.
473   // This may include pointers, SchemaTypeIds, etc.
474   //
475   // NOTE: This function may delete documents. A document may be invalidated by
476   // the new SchemaStore, such as failing validation or having its schema type
477   // deleted from the schema.
478   //
479   // This is best used if the caller is unsure about what's changed in the
480   // SchemaStore, and wants to update all information no matter what. If the
481   // caller does know what has changed, then it's recommended to call
482   // OptimizedUpdateSchemaStore.
483   //
  // Returns:
485   //   OK on success
486   //   INTERNAL_ERROR on IO error
487   libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store);
488 
  // Performs the same functionality as UpdateSchemaStore, but this can be more
490   // optimized in terms of less disk reads and less work if we know exactly
491   // what's changed between the old and new SchemaStore.
492   //
  // Returns:
494   //   OK on success
495   //   INTERNAL_ERROR on IO error
496   libtextclassifier3::Status OptimizedUpdateSchemaStore(
497       const SchemaStore* schema_store,
498       const SchemaStore::SetSchemaResult& set_schema_result);
499 
500   // Re-generates the scorable property cache for documents with the given
501   // schema types.
502   //
503   // Returns:
504   //   OK on success
505   //   INTERNAL_ERROR on IO error
506   libtextclassifier3::Status RegenerateScorablePropertyCache(
507       const std::unordered_set<SchemaTypeId>& schema_type_ids);
508 
509   // Reduces internal file sizes by reclaiming space of deleted documents and
510   // regenerating derived files.
511   //
512   // NOTE: The tasks in this method are too expensive to be executed in
513   // real-time. The caller should decide how frequently and when to call this
514   // method based on device usage.
515   //
516   // Returns:
517   //   OK on success
518   //   INTERNAL_ERROR on IO error
519   libtextclassifier3::Status Optimize();
520 
521   struct OptimizeResult {
522     // A vector that maps old document id to new document id.
523     std::vector<DocumentId> document_id_old_to_new;
524 
525     // A vector that maps old namespace id to new namespace id. Will be empty if
526     // should_rebuild_index is set to true.
527     std::vector<NamespaceId> namespace_id_old_to_new;
528 
529     // A boolean flag that hints the caller (usually IcingSearchEngine) if it
530     // should rebuild index instead of adopting the id changes via the 2 vectors
531     // above. It will be set to true if finding any id inconsistency.
532     bool should_rebuild_index = false;
533 
534     // A set of blob handles that are dead and need to be removed.
535     std::unordered_set<std::string> dead_blob_handles;
536   };
537   // Copy data from current base directory into a new directory. Any outdated or
538   // deleted data won't be copied. During the process, document/namespace ids
539   // will be reassigned so any files / classes that are based on old
540   // document/namespace ids may be outdated.
541   //
542   // stats will be set if non-null.
543   //
544   // NOTE: The tasks in this method are too expensive to be executed in
545   // real-time. The caller should decide how frequently and when to call this
546   // method based on device usage.
547   //
548   // Returns:
549   //   OptimizeResult which contains a vector mapping from old document id to
550   //   new document id and another vector mapping from old namespace id to new
551   //   namespace id, on success
552   //   INVALID_ARGUMENT if new_directory is same as current base directory
553   //   INTERNAL_ERROR on IO error
554   libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto(
555       const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
556       std::unordered_set<std::string>&& expired_blob_handles,
557       OptimizeStatsProto* stats = nullptr) const;
558 
559   // Calculates status for a potential Optimize call. Includes how many docs
560   // there are vs how many would be optimized away. And also includes an
561   // estimated size gains, in bytes, if Optimize were called.
562   //
563   // Returns:
564   //   OptimizeInfo on success
565   //   INTERNAL_ERROR on IO error
566   libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const;
567 
568   // Update, replace and persist the header file. Creates the header file if it
569   // doesn't exist.
570   //
571   // Returns:
  //   A Crc32 checksum on success
573   //   INTERNAL on I/O error
574   libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
575 
576   // Calculates and returns the checksum of the document store.
577   //
578   // Returns:
  //   The checksum (Crc32) of the document store on success
580   //   INTERNAL on I/O error
581   libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
582 
583   // Get debug information for the document store.
584   // verbosity <= 0, simplest debug information
585   // verbosity > 0, also return the total number of documents and tokens in each
586   // (namespace, schema type) pair.
587   //
588   // Returns:
589   //   DocumentDebugInfoProto on success
590   //   INTERNAL_ERROR on IO errors, crc compute error
591   libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
592       int verbosity) const;
593 
594  private:
595   // Use DocumentStore::Create() to instantiate.
596   explicit DocumentStore(const Filesystem* filesystem,
597                          std::string_view base_dir, const Clock* clock,
598                          const SchemaStore* schema_store,
599                          const FeatureFlags* feature_flags,
600                          bool pre_mapping_fbv, bool use_persistent_hash_map,
601                          int32_t compression_level);
602 
603   const Filesystem* const filesystem_;
604   const std::string base_dir_;
605   const Clock& clock_;
606   const FeatureFlags& feature_flags_;  // Does not own.
607 
608   // Handles the ground truth schema and all of the derived data off of the
609   // schema
610   const SchemaStore* schema_store_;
611 
612   // Used to validate incoming documents
613   DocumentValidator document_validator_;
614 
  // Flag indicating whether to memory-map the max possible file size for the
  // underlying FileBackedVector before growing the actual file size.
617   bool pre_mapping_fbv_;
618 
  // Flag indicating whether to use a persistent hash map as the key mapper (if
  // false, then fall back to the dynamic trie key mapper). Note: we only use
  // the persistent hash map for the uri mapper if it is true.
622   bool use_persistent_hash_map_;
623 
624   const int32_t compression_level_;
625 
626   // A log used to store all documents, it serves as a ground truth of doc
627   // store. key_mapper_ and document_id_mapper_ can be regenerated from it.
628   std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
629 
630   // Key (namespace + uri) to DocumentId mapping
631   std::unique_ptr<
632       KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>
633       document_key_mapper_;
634 
635   // DocumentId to file offset mapping
636   std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
637 
638   // A cache of document associated scores. The ground truth of the scores is
639   // DocumentProto stored in document_log_. This cache contains:
640   //   - CorpusId
641   //   - Document score
642   //   - Document creation timestamp in seconds
643   //   - Document length in number of tokens
644   //   - Index of the ScorablePropertySetProto at the scorable_property_cache_
645   std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
646 
647   // A cache of document scorable properties. The ground truth of the data is
648   // DocumentProto stored in document_log_.
649   std::unique_ptr<MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>>
650       scorable_property_cache_;
651 
652   // A cache of data, indexed by DocumentId, used to filter documents. Currently
653   // contains:
654   //   - NamespaceId
655   //   - SchemaTypeId
656   //   - Expiration timestamp in seconds
657   std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
658 
659   // A cache of corpus associated scores. The ground truth of the scores is
660   // DocumentProto stored in document_log_. This cache contains:
  //   - Number of documents belonging to the corpus
662   //   - The sum of the documents' lengths, in number of tokens.
663   std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
664       corpus_score_cache_;
665 
666   // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
667   // id when the first document belonging to that namespace is added to the
668   // DocumentStore. Namespaces may be removed from the mapper during compaction.
669   std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
670 
  // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
  // unique id. A corpus is assigned an id when the first document belonging to
  // that corpus is added to the DocumentStore. Corpus ids may be removed from
  // the mapper during compaction.
675   std::unique_ptr<
676       KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>>
677       corpus_mapper_;
678 
679   // A storage class that caches all usage scores. Usage scores are not
680   // considered as ground truth. Usage scores are associated with document ids
681   // so they need to be updated when document ids change.
682   std::unique_ptr<UsageStore> usage_store_;
683 
684   // Used internally to indicate whether the class has been initialized. This is
685   // to guard against cases where the object has been created, but Initialize
686   // fails in the constructor. If we have successfully exited the constructor,
687   // then this field can be ignored. Clients of DocumentStore should not need to
688   // worry about this field.
689   bool initialized_ = false;
690 
691   struct InitializeResult {
692     DataLoss data_loss;
693 
694     // A boolean flag indicating if derived files of the document store have
695     // been regenerated or not. This is usually a signal for callers to detect
696     // if any id assignment has changed (e.g. NamespaceId).
697     bool derived_files_regenerated;
698   };
699   libtextclassifier3::StatusOr<InitializeResult> Initialize(
700       bool force_recovery_and_revalidate_documents,
701       InitializeStatsProto* initialize_stats);
702 
703   // Creates sub-components and verifies the integrity of each sub-component.
  // This assumes that the underlying files already exist, and will return
705   // an error if it doesn't find what it's expecting.
706   //
707   // Returns an error if subcomponents failed to initialize successfully.
708   //   INTERNAL_ERROR on IO error
709   libtextclassifier3::Status InitializeExistingDerivedFiles();
710 
711   // Re-generates all files derived from the ground truth: the document log.
712   //
  // revalidate_documents=true will also cause each document to be revalidated
  // against the schema as it is read out of the document log.
715   //
716   // NOTE: if this function fails, the only thing we can do is to retry it until
717   // it succeeds or prevent the initialization of a DocumentStore. The
718   // DocumentStore object wouldn't work reliably if this fails.
719   //
720   // Steps:
721   //   1. Delete all derived files.
722   //   2. Iterate through document log, put data into new key mapper and
723   //   document_id
724   //      mapper.
725   //   3. Create header and store the updated combined checksum
726   libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
727 
728   // Resets the unique_ptr to the document_key_mapper, deletes the underlying
729   // file, and creates a new instance of the document_key_mapper.
730   //
731   // Returns OK on success, or any IO errors encountered.
732   libtextclassifier3::Status ResetDocumentKeyMapper();
733 
734   // Resets the unique_ptr to the document_id_mapper, deletes the underlying
735   // file, and creates a new instance of the document_id_mapper.
736   //
737   // Returns OK on success, or any IO errors encountered.
738   libtextclassifier3::Status ResetDocumentIdMapper();
739 
740   // Resets the unique_ptr to the score_cache, deletes the underlying file, and
741   // creates a new instance of the score_cache.
742   //
743   // Returns OK on success, or any IO errors encountered.
744   libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
745 
746   // Resets the unique_ptr to the |scorable_property_cache_|, deletes the
747   // underlying file, and creates a new instance of it.
748   //
749   // Returns OK on success, or any IO errors encountered.
750   libtextclassifier3::Status ResetScorablePropertyCache();
751 
752   // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
753   // file, and creates a new instance of the corpus_score_cache.
754   //
755   // Returns OK on success, or any IO errors encountered.
756   libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
757 
758   // Resets the unique_ptr to the filter_cache, deletes the underlying file, and
759   // creates a new instance of the filter_cache.
760   //
761   // Returns OK on success, or any IO errors encountered.
762   libtextclassifier3::Status ResetFilterCache();
763 
764   // Resets the unique_ptr to the namespace_mapper, deletes the underlying file,
765   // and creates a new instance of the namespace_mapper.
766   //
767   // Returns OK on success, or any IO errors encountered.
768   libtextclassifier3::Status ResetNamespaceMapper();
769 
770   // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
771   // and creates a new instance of the corpus_mapper.
772   //
773   // Returns OK on success, or any IO errors encountered.
774   libtextclassifier3::Status ResetCorpusMapper();
775 
776   // Returns true if the header file already exists. This never creates the
777   // header file when it doesn't exist.
778   bool HeaderExists();
779 
780   libtextclassifier3::StatusOr<PutResult> InternalPut(
781       DocumentProto&& document,  // Rvalue ref: pass a temporary or std::move.
782       PutDocumentStatsProto* put_document_stats = nullptr);  // Optional.
783 
784   // Helper function to do batch deletes. Documents with the given
785   // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
786   // to specify the namespace or schema type, pass in kInvalidNamespaceId or
787   // kInvalidSchemaTypeId respectively. The document protos with their derived
788   // data will be erased / cleared immediately.
789   //
790   // NOTE: Space is not reclaimed in the derived files until Optimize() is
791   // called.
792   //
793   // Returns:
794   //   Number of documents that were actually updated to be deleted
795   //   INTERNAL_ERROR on IO error
796   libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
797                                                 SchemaTypeId schema_type_id);
798 
799   // Returns the CorpusAssociatedScoreData of the corpus specified by the
800   // corpus_id.
801   //
802   // If the corpus_id has never been seen before, this returns a
803   // CorpusAssociatedScoreData with properties set to default values.
804   //
805   // NOTE: This does not check if the corpus exists and will return the
806   // CorpusAssociatedScoreData of the corpus even if all documents belonging to
807   // that corpus have been deleted.
808   //
809   // Returns:
810   //   CorpusAssociatedScoreData on success; NOTE(review): errors unlisted.
811   libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
812   GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
813 
814   // Checks if a document exists. Existence means it hasn't been deleted and it
815   // hasn't expired yet.
816   //
817   // Returns:
818   //   OK if the document exists
819   //   INVALID_ARGUMENT if document_id is less than 0 or greater than the
820   //                    maximum value
821   //   NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
822   //   INTERNAL_ERROR on IO error
823   libtextclassifier3::Status DoesDocumentExistWithStatus(
824       DocumentId document_id) const;
825 
826   // Checks if a document has been deleted.
827   //
828   // This is for internal use only because we assume that the document_id is
829   // already valid. If you're unsure if the document_id is valid, use
830   // DoesDocumentExist(document_id) instead, which will perform those additional
831   // checks.
832   bool IsDeleted(DocumentId document_id) const;
833 
834   // Checks if a document has expired.
835   //
836   // This is for internal use only because we assume that the document_id is
837   // already valid. If you're unsure if the document_id is valid, use
838   // DoesDocumentExist(document_id) instead, which will perform those additional
839   // checks.
840   //
841   // Returns:
842   //   DocumentFilterData  if the given document isn't expired.
843   //   std::nullopt        if the given document is expired.
844   std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData(
845       DocumentId document_id, int64_t current_time_ms) const;
846 
847   // Updates the entry in the score cache for document_id with score_data.
848   libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
849       DocumentId document_id, const DocumentAssociatedScoreData& score_data);
850 
851   // Updates the entry in the corpus score cache for corpus_id with score_data.
852   libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
853       CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
854 
855   // Updates the entry in the filter cache for document_id with filter_data.
856   libtextclassifier3::Status UpdateFilterCache(
857       DocumentId document_id, const DocumentFilterData& filter_data);
858 
859   // Helper method to clear the derived data of the given document_id.
860   libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
861 
862   // Sets the usage scores of the given document_id to usage_scores.
863   libtextclassifier3::Status SetUsageScores(
864       DocumentId document_id, const UsageStore::UsageScores& usage_scores);
865 
866   // Returns a DocumentStorageInfoProto with the fields relating to the size of
867   // Document Store member variables populated.
868   // NOTE(review): this returns a plain proto, not a StatusOr, so an INTERNAL
869   // status cannot be returned; confirm how file-size failures are reported.
870   DocumentStorageInfoProto GetMemberStorageInfo() const;
871 
872   // Returns the storage_info that was passed in, but with the number of alive,
873   // deleted and expired documents also set.
874   // NOTE(review): this returns a plain proto, not a StatusOr, so OUT_OF_RANGE
875   // cannot be returned; confirm what happens if the document_id_mapper somehow
876   // becomes larger than the filter cache.
877   DocumentStorageInfoProto CalculateDocumentStatusCounts(
878       DocumentStorageInfoProto storage_info) const;
879 
880   // Returns:
881   //   - on success, a RepeatedPtrField of the CorpusInfo collected.
882   //   - OUT_OF_RANGE, this should never happen.
883   libtextclassifier3::StatusOr<
884       google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
885   CollectCorpusInfo() const;
886 
887   // Extracts the ScorablePropertySetProto from the |document| and adds it to
888   // the |scorable_property_cache_|.
889   //
890   // Returns:
891   //     - Index of the newly inserted ScorablePropertySetProto in the
892   //       |scorable_property_cache_|.
893   //     - kInvalidScorablePropertyCacheIndex if the schema contains no
894   //       scorable properties.
895   //     - INVALID_ARGUMENT if |schema_type_id| is invalid, or the converted
896   //       ScorablePropertySetProto exceeds the size limit of 16MiB.
897   //     - INTERNAL_ERROR on IO error.
898   libtextclassifier3::StatusOr<int> UpdateScorablePropertyCache(
899       const DocumentProto& document, SchemaTypeId schema_type_id);
900 };
901 
902 }  // namespace lib
903 }  // namespace icing
904 
905 #endif  // ICING_STORE_DOCUMENT_STORE_H_
906