xref: /aosp_15_r20/external/icing/icing/store/document-store.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/store/document-store.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_map>
24 #include <unordered_set>
25 #include <utility>
26 #include <vector>
27 
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/absl_ports/annotate.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/feature-flags.h"
34 #include "icing/file/file-backed-proto-log.h"
35 #include "icing/file/file-backed-vector.h"
36 #include "icing/file/filesystem.h"
37 #include "icing/file/memory-mapped-file-backed-proto-log.h"
38 #include "icing/file/memory-mapped-file.h"
39 #include "icing/file/portable-file-backed-proto-log.h"
40 #include "icing/legacy/core/icing-string-util.h"
41 #include "icing/proto/debug.pb.h"
42 #include "icing/proto/document.pb.h"
43 #include "icing/proto/document_wrapper.pb.h"
44 #include "icing/proto/internal/scorable_property_set.pb.h"
45 #include "icing/proto/logging.pb.h"
46 #include "icing/proto/optimize.pb.h"
47 #include "icing/proto/persist.pb.h"
48 #include "icing/proto/schema.pb.h"
49 #include "icing/proto/storage.pb.h"
50 #include "icing/proto/usage.pb.h"
51 #include "icing/schema/property-util.h"
52 #include "icing/schema/schema-store.h"
53 #include "icing/schema/scorable_property_manager.h"
54 #include "icing/store/blob-store.h"
55 #include "icing/store/corpus-associated-scoring-data.h"
56 #include "icing/store/corpus-id.h"
57 #include "icing/store/document-associated-score-data.h"
58 #include "icing/store/document-filter-data.h"
59 #include "icing/store/document-id.h"
60 #include "icing/store/document-log-creator.h"
61 #include "icing/store/dynamic-trie-key-mapper.h"
62 #include "icing/store/key-mapper.h"
63 #include "icing/store/namespace-id-fingerprint.h"
64 #include "icing/store/namespace-id.h"
65 #include "icing/store/persistent-hash-map-key-mapper.h"
66 #include "icing/store/usage-store.h"
67 #include "icing/tokenization/language-segmenter.h"
68 #include "icing/util/clock.h"
69 #include "icing/util/crc32.h"
70 #include "icing/util/data-loss.h"
71 #include "icing/util/fingerprint-util.h"
72 #include "icing/util/logging.h"
73 #include "icing/util/scorable_property_set.h"
74 #include "icing/util/status-macros.h"
75 #include "icing/util/tokenized-document.h"
76 
77 namespace icing {
78 namespace lib {
79 
80 namespace {
81 
82 // Used in DocumentId mapper to mark a document as deleted
83 constexpr int64_t kDocDeletedFlag = -1;
84 constexpr int32_t kInvalidScorablePropertyCacheIndex = -1;
85 constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
86 constexpr char kUriHashMapperWorkingPath[] = "uri_mapper";
87 constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
88 constexpr char kScoreCacheFilename[] = "score_cache";
89 constexpr char kScorablePropertyCacheFilename[] = "scorable_property_cache";
90 constexpr char kCorpusScoreCache[] = "corpus_score_cache";
91 constexpr char kFilterCacheFilename[] = "filter_cache";
92 constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
93 constexpr char kUsageStoreDirectoryName[] = "usage_store";
94 constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";
95 
96 // Determined through manual testing to allow for 4 million uris. 4 million
97 // because we allow up to 4 million DocumentIds.
98 constexpr int32_t kUriDynamicTrieKeyMapperMaxSize =
99     144 * 1024 * 1024;  // 144 MiB
100 
101 constexpr int32_t kUriHashKeyMapperMaxNumEntries =
102     kMaxDocumentId + 1;  // 1 << 22, 4M
103 // - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1
104 //        byte)
105 // - Value: DocumentId (4 bytes)
106 constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId);
107 
108 // 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
109 // max of 128 KiB for storage.
110 constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
111 constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
112 
CreateDocumentWrapper(DocumentProto && document)113 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
114   DocumentWrapper document_wrapper;
115   *document_wrapper.mutable_document() = std::move(document);
116   return document_wrapper;
117 }
118 
MakeHeaderFilename(const std::string & base_dir)119 std::string MakeHeaderFilename(const std::string& base_dir) {
120   return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
121 }
122 
MakeUriHashMapperWorkingPath(const std::string & base_dir)123 std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) {
124   return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath);
125 }
126 
MakeDocumentIdMapperFilename(const std::string & base_dir)127 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
128   return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
129 }
130 
MakeScoreCacheFilename(const std::string & base_dir)131 std::string MakeScoreCacheFilename(const std::string& base_dir) {
132   return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
133 }
134 
MakeScorablePropertyCacheFilename(const std::string & base_dir)135 std::string MakeScorablePropertyCacheFilename(const std::string& base_dir) {
136   return absl_ports::StrCat(base_dir, "/", kScorablePropertyCacheFilename);
137 }
138 
MakeCorpusScoreCache(const std::string & base_dir)139 std::string MakeCorpusScoreCache(const std::string& base_dir) {
140   return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
141 }
142 
MakeFilterCacheFilename(const std::string & base_dir)143 std::string MakeFilterCacheFilename(const std::string& base_dir) {
144   return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
145 }
146 
MakeNamespaceMapperFilename(const std::string & base_dir)147 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
148   return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
149 }
150 
MakeUsageStoreDirectoryName(const std::string & base_dir)151 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
152   return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
153 }
154 
MakeCorpusMapperFilename(const std::string & base_dir)155 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
156   return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
157 }
158 
// Computes the timestamp (in milliseconds) at which a document expires.
//
// Returns:
//   - int64_t max if ttl_ms is 0; a TTL of 0 means the document never expires
//   - int64_t max if creation_timestamp_ms + ttl_ms does not fit in an int64_t
//   - creation_timestamp_ms + ttl_ms otherwise
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
                                       int64_t ttl_ms) {
  // Used as "a point in time that is never reached".
  constexpr int64_t kNeverExpires = std::numeric_limits<int64_t>::max();

  int64_t expiration_timestamp_ms = 0;
  const bool sum_out_of_range = __builtin_add_overflow(
      creation_timestamp_ms, ttl_ms, &expiration_timestamp_ms);

  return (ttl_ms == 0 || sum_out_of_range) ? kNeverExpires
                                           : expiration_timestamp_ms;
}
179 
GetRecoveryCause(const DocumentLogCreator::CreateResult & create_result,bool force_recovery_and_revalidate_documents)180 InitializeStatsProto::RecoveryCause GetRecoveryCause(
181     const DocumentLogCreator::CreateResult& create_result,
182     bool force_recovery_and_revalidate_documents) {
183   if (force_recovery_and_revalidate_documents) {
184     return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
185   } else if (create_result.log_create_result.has_data_loss()) {
186     return InitializeStatsProto::DATA_LOSS;
187   } else if (create_result.preexisting_file_version !=
188              DocumentLogCreator::kCurrentVersion) {
189     return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
190   }
191   return InitializeStatsProto::NONE;
192 }
193 
GetDataStatus(DataLoss data_loss)194 InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
195     DataLoss data_loss) {
196   switch (data_loss) {
197     case DataLoss::PARTIAL:
198       return InitializeStatsProto::PARTIAL_LOSS;
199     case DataLoss::COMPLETE:
200       return InitializeStatsProto::COMPLETE_LOSS;
201     case DataLoss::NONE:
202       return InitializeStatsProto::NO_DATA_LOSS;
203   }
204 }
205 
GetNamespaceIdsToNamespaces(const KeyMapper<NamespaceId> * key_mapper)206 std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
207     const KeyMapper<NamespaceId>* key_mapper) {
208   std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
209 
210   std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
211       key_mapper->GetIterator();
212   while (itr->Advance()) {
213     namespace_ids_to_namespaces.insert(
214         {itr->GetValue(), std::string(itr->GetKey())});
215   }
216   return namespace_ids_to_namespaces;
217 }
218 
219 libtextclassifier3::StatusOr<std::unique_ptr<
220     KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>>
CreateUriMapper(const Filesystem & filesystem,const std::string & base_dir,bool use_persistent_hash_map)221 CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir,
222                 bool use_persistent_hash_map) {
223   std::string uri_hash_mapper_working_path =
224       MakeUriHashMapperWorkingPath(base_dir);
225   // Due to historic issue, we use document store's base_dir directly as
226   // DynamicTrieKeyMapper's working directory for uri mapper.
227   // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the
228   // actual files will be put under "<base_dir>/key_mapper_dir/".
229   bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
230       absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str());
231   bool persistent_hash_map_dir_exists =
232       filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str());
233   if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
234       (!use_persistent_hash_map && persistent_hash_map_dir_exists)) {
235     // Return a failure here so that the caller can properly delete and rebuild
236     // this component.
237     return absl_ports::FailedPreconditionError("Key mapper type mismatch");
238   }
239 
240   if (use_persistent_hash_map) {
241     return PersistentHashMapKeyMapper<
242         DocumentId, fingerprint_util::FingerprintStringFormatter>::
243         Create(filesystem, std::move(uri_hash_mapper_working_path),
244                /*pre_mapping_fbv=*/false,
245                /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries,
246                /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize);
247   } else {
248     return DynamicTrieKeyMapper<DocumentId,
249                                 fingerprint_util::FingerprintStringFormatter>::
250         Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize);
251   }
252 }
253 
254 // Find the existing blob handles in the given document and remove them from the
255 // dead_blob_handles set. Those are the blob handles that are still in use.
256 //
257 // This method is flag-guarded by the flag enable_blob_store. If the flag is
258 // disabled, the dead_blob_handles must be empty and this method will be a
259 // no-op.
260 //
261 // The type_blob_map is a map from schema type to a set of blob property names.
RemoveAliveBlobHandles(const DocumentProto & document,const std::unordered_map<std::string,std::vector<std::string>> & type_blob_property_map,std::unordered_set<std::string> & dead_blob_handles)262 void RemoveAliveBlobHandles(
263     const DocumentProto& document,
264     const std::unordered_map<std::string, std::vector<std::string>>&
265         type_blob_property_map,
266     std::unordered_set<std::string>& dead_blob_handles) {
267   if (dead_blob_handles.empty() ||
268       type_blob_property_map.find(document.schema()) ==
269           type_blob_property_map.end()) {
270     // This document does not have any blob properties.
271     return;
272   }
273   const std::vector<std::string>& blob_property_paths =
274       type_blob_property_map.at(document.schema());
275 
276   for (const std::string& blob_property_path : blob_property_paths) {
277     auto content_or = property_util::ExtractPropertyValuesFromDocument<
278         PropertyProto::BlobHandleProto>(document, blob_property_path);
279     if (content_or.ok()) {
280       for (const PropertyProto::BlobHandleProto& blob_handle :
281            content_or.ValueOrDie()) {
282         dead_blob_handles.erase(BlobStore::BuildBlobHandleStr(blob_handle));
283       }
284     }
285   }
286 }
287 
288 }  // namespace
289 
DocumentStore(const Filesystem * filesystem,const std::string_view base_dir,const Clock * clock,const SchemaStore * schema_store,const FeatureFlags * feature_flags,bool pre_mapping_fbv,bool use_persistent_hash_map,int32_t compression_level)290 DocumentStore::DocumentStore(const Filesystem* filesystem,
291                              const std::string_view base_dir,
292                              const Clock* clock,
293                              const SchemaStore* schema_store,
294                              const FeatureFlags* feature_flags,
295                              bool pre_mapping_fbv, bool use_persistent_hash_map,
296                              int32_t compression_level)
297     : filesystem_(filesystem),
298       base_dir_(base_dir),
299       clock_(*clock),
300       feature_flags_(*feature_flags),
301       schema_store_(schema_store),
302       document_validator_(schema_store),
303       pre_mapping_fbv_(pre_mapping_fbv),
304       use_persistent_hash_map_(use_persistent_hash_map),
305       compression_level_(compression_level) {}
306 
Put(const DocumentProto & document,int32_t num_tokens,PutDocumentStatsProto * put_document_stats)307 libtextclassifier3::StatusOr<DocumentStore::PutResult> DocumentStore::Put(
308     const DocumentProto& document, int32_t num_tokens,
309     PutDocumentStatsProto* put_document_stats) {
310   return Put(DocumentProto(document), num_tokens, put_document_stats);
311 }
312 
Put(DocumentProto && document,int32_t num_tokens,PutDocumentStatsProto * put_document_stats)313 libtextclassifier3::StatusOr<DocumentStore::PutResult> DocumentStore::Put(
314     DocumentProto&& document, int32_t num_tokens,
315     PutDocumentStatsProto* put_document_stats) {
316   document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
317   return InternalPut(std::move(document), put_document_stats);
318 }
319 
~DocumentStore()320 DocumentStore::~DocumentStore() {
321   if (initialized_) {
322     if (!PersistToDisk(PersistType::FULL).ok()) {
323       ICING_LOG(ERROR)
324           << "Error persisting to disk in DocumentStore destructor";
325     }
326   }
327 }
328 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const SchemaStore * schema_store,const FeatureFlags * feature_flags,bool force_recovery_and_revalidate_documents,bool pre_mapping_fbv,bool use_persistent_hash_map,int32_t compression_level,InitializeStatsProto * initialize_stats)329 libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
330     const Filesystem* filesystem, const std::string& base_dir,
331     const Clock* clock, const SchemaStore* schema_store,
332     const FeatureFlags* feature_flags,
333     bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv,
334     bool use_persistent_hash_map, int32_t compression_level,
335     InitializeStatsProto* initialize_stats) {
336   ICING_RETURN_ERROR_IF_NULL(filesystem);
337   ICING_RETURN_ERROR_IF_NULL(clock);
338   ICING_RETURN_ERROR_IF_NULL(schema_store);
339   ICING_RETURN_ERROR_IF_NULL(feature_flags);
340 
341   auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
342       filesystem, base_dir, clock, schema_store, feature_flags, pre_mapping_fbv,
343       use_persistent_hash_map, compression_level));
344   ICING_ASSIGN_OR_RETURN(
345       InitializeResult initialize_result,
346       document_store->Initialize(force_recovery_and_revalidate_documents,
347                                  initialize_stats));
348 
349   CreateResult create_result;
350   create_result.document_store = std::move(document_store);
351   create_result.data_loss = initialize_result.data_loss;
352   create_result.derived_files_regenerated =
353       initialize_result.derived_files_regenerated;
354   return create_result;
355 }
356 
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)357 /* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
358     const Filesystem* filesystem, const std::string& base_dir) {
359   // Header
360   const std::string header_filename = MakeHeaderFilename(base_dir);
361   if (!filesystem->DeleteFile(MakeHeaderFilename(base_dir).c_str())) {
362     return absl_ports::InternalError("Couldn't delete header file");
363   }
364 
365   // Document key mapper. Doesn't hurt to delete both dynamic trie and
366   // persistent hash map without checking.
367   ICING_RETURN_IF_ERROR(
368       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
369   ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
370       *filesystem, MakeUriHashMapperWorkingPath(base_dir)));
371 
372   // Document id mapper
373   ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
374       *filesystem, MakeDocumentIdMapperFilename(base_dir)));
375 
376   // Document associated score cache
377   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
378       *filesystem, MakeScoreCacheFilename(base_dir)));
379 
380   // Filter cache
381   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
382       *filesystem, MakeFilterCacheFilename(base_dir)));
383 
384   // Namespace mapper
385   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
386       *filesystem, MakeNamespaceMapperFilename(base_dir)));
387 
388   // Corpus mapper
389   ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
390       *filesystem, MakeCorpusMapperFilename(base_dir)));
391 
392   // Corpus associated score cache
393   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
394       *filesystem, MakeCorpusScoreCache(base_dir)));
395 
396   // Scorable Property Cache
397   ICING_RETURN_IF_ERROR(
398       MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Delete(
399           *filesystem, MakeScorablePropertyCacheFilename(base_dir)));
400 
401   return libtextclassifier3::Status::OK;
402 }
403 
404 libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
Initialize(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)405 DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
406                           InitializeStatsProto* initialize_stats) {
407   auto create_result_or =
408       DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
409 
410   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
411   // that can support error logging.
412   if (!create_result_or.ok()) {
413     ICING_LOG(ERROR) << create_result_or.status().error_message()
414                      << "\nFailed to initialize DocumentLog.";
415     return create_result_or.status();
416   }
417   DocumentLogCreator::CreateResult create_result =
418       std::move(create_result_or).ValueOrDie();
419 
420   document_log_ = std::move(create_result.log_create_result.proto_log);
421   InitializeStatsProto::RecoveryCause recovery_cause =
422       GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
423 
424   bool derived_files_regenerated = false;
425   if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
426     ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
427                     << recovery_cause << ", and create result { new_file="
428                     << create_result.new_file << ", preeisting_file_version="
429                     << create_result.preexisting_file_version << ", data_loss="
430                     << create_result.log_create_result.data_loss
431                     << "} and kCurrentVersion="
432                     << DocumentLogCreator::kCurrentVersion;
433     // We can't rely on any existing derived files. Recreate them from scratch.
434     // Currently happens if:
435     //   1) This is a new log and we don't have derived files yet
436     //   2) Client wanted us to force a regeneration.
437     //   3) Log has some data loss, can't rely on existing derived data.
438     std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
439     libtextclassifier3::Status status =
440         RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
441     if (recovery_cause != InitializeStatsProto::NONE) {
442       // Only consider it a recovery if the client forced a recovery or there
443       // was data loss. Otherwise, this could just be the first time we're
444       // initializing and generating derived files.
445       derived_files_regenerated = true;
446       if (initialize_stats != nullptr) {
447         initialize_stats->set_document_store_recovery_latency_ms(
448             document_recovery_timer->GetElapsedMilliseconds());
449         initialize_stats->set_document_store_recovery_cause(recovery_cause);
450         initialize_stats->set_document_store_data_status(
451             GetDataStatus(create_result.log_create_result.data_loss));
452       }
453     }
454     if (!status.ok()) {
455       ICING_LOG(ERROR)
456           << "Failed to regenerate derived files for DocumentStore";
457       return status;
458     }
459   } else {
460     if (!InitializeExistingDerivedFiles().ok()) {
461       ICING_LOG(WARNING)
462           << "Couldn't find derived files or failed to initialize them, "
463              "regenerating derived files for DocumentStore.";
464       std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
465       derived_files_regenerated = true;
466       libtextclassifier3::Status status = RegenerateDerivedFiles(
467           /*force_recovery_and_revalidate_documents=*/false);
468       if (initialize_stats != nullptr) {
469         initialize_stats->set_document_store_recovery_cause(
470             InitializeStatsProto::IO_ERROR);
471         initialize_stats->set_document_store_recovery_latency_ms(
472             document_recovery_timer->GetElapsedMilliseconds());
473       }
474       if (!status.ok()) {
475         ICING_LOG(ERROR)
476             << "Failed to regenerate derived files for DocumentStore";
477         return status;
478       }
479     }
480   }
481 
482   initialized_ = true;
483   if (initialize_stats != nullptr) {
484     initialize_stats->set_num_documents(document_id_mapper_->num_elements());
485   }
486 
487   InitializeResult initialize_result = {
488       .data_loss = create_result.log_create_result.data_loss,
489       .derived_files_regenerated = derived_files_regenerated};
490   return initialize_result;
491 }
492 
InitializeExistingDerivedFiles()493 libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
494   if (!HeaderExists()) {
495     // Without a header, we don't know if things are consistent between each
496     // other so the caller should just regenerate everything from ground
497     // truth.
498     return absl_ports::InternalError("DocumentStore header doesn't exist");
499   }
500 
501   DocumentStore::Header header;
502   if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
503                          sizeof(header))) {
504     return absl_ports::InternalError(
505         absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
506   }
507 
508   if (header.magic != DocumentStore::Header::kMagic) {
509     return absl_ports::InternalError(absl_ports::StrCat(
510         "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
511   }
512 
513   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
514   // that can support error logging.
515   auto document_key_mapper_or =
516       CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
517   if (!document_key_mapper_or.ok()) {
518     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
519                      << "Failed to initialize KeyMapper";
520     return document_key_mapper_or.status();
521   }
522   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
523 
524   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
525   // that can support error logging.
526   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
527       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
528       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
529   if (!document_id_mapper_or.ok()) {
530     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
531                      << "Failed to initialize DocumentIdMapper";
532     return document_id_mapper_or.status();
533   }
534   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
535 
536   ICING_ASSIGN_OR_RETURN(score_cache_,
537                          FileBackedVector<DocumentAssociatedScoreData>::Create(
538                              *filesystem_, MakeScoreCacheFilename(base_dir_),
539                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
540 
541   ICING_ASSIGN_OR_RETURN(
542       scorable_property_cache_,
543       MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Create(
544           *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));
545 
546   ICING_ASSIGN_OR_RETURN(filter_cache_,
547                          FileBackedVector<DocumentFilterData>::Create(
548                              *filesystem_, MakeFilterCacheFilename(base_dir_),
549                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
550 
551   ICING_ASSIGN_OR_RETURN(
552       namespace_mapper_,
553       DynamicTrieKeyMapper<NamespaceId>::Create(
554           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
555           kNamespaceMapperMaxSize));
556 
557   ICING_ASSIGN_OR_RETURN(
558       usage_store_,
559       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
560 
561   auto corpus_mapper_or =
562       DynamicTrieKeyMapper<CorpusId,
563                            fingerprint_util::FingerprintStringFormatter>::
564           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
565                  kCorpusMapperMaxSize);
566   if (!corpus_mapper_or.ok()) {
567     return std::move(corpus_mapper_or).status();
568   }
569   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
570 
571   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
572                          FileBackedVector<CorpusAssociatedScoreData>::Create(
573                              *filesystem_, MakeCorpusScoreCache(base_dir_),
574                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
575 
576   // Ensure the usage store is the correct size.
577   ICING_RETURN_IF_ERROR(
578       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
579 
580   Crc32 expected_checksum(header.checksum);
581   ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
582   if (checksum != expected_checksum) {
583     return absl_ports::InternalError(
584         "Combined checksum of DocStore was inconsistent");
585   }
586 
587   return libtextclassifier3::Status::OK;
588 }
589 
RegenerateDerivedFiles(bool revalidate_documents)590 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
591     bool revalidate_documents) {
592   ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
593   ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
594   ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
595   ICING_RETURN_IF_ERROR(ResetScorablePropertyCache());
596   ICING_RETURN_IF_ERROR(ResetFilterCache());
597   ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
598   ICING_RETURN_IF_ERROR(ResetCorpusMapper());
599   ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
600 
601   // Creates a new UsageStore instance. Note that we don't reset the data in
602   // usage store here because we're not able to regenerate the usage scores.
603   ICING_ASSIGN_OR_RETURN(
604       usage_store_,
605       UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
606 
607   // Iterates through document log
608   auto iterator = document_log_->GetIterator();
609   auto iterator_status = iterator.Advance();
610   libtextclassifier3::StatusOr<int64_t> element_size =
611       document_log_->GetElementsFileSize();
612   libtextclassifier3::StatusOr<int64_t> disk_usage =
613       document_log_->GetDiskUsage();
614   if (element_size.ok() && disk_usage.ok()) {
615     ICING_VLOG(1) << "Starting recovery of document store. Document store "
616                      "elements file size:"
617                   << element_size.ValueOrDie()
618                   << ", disk usage=" << disk_usage.ValueOrDie();
619   }
620   while (iterator_status.ok()) {
621     ICING_VLOG(2) << "Attempting to read document at offset="
622                   << iterator.GetOffset();
623     libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
624         document_log_->ReadProto(iterator.GetOffset());
625 
626     if (absl_ports::IsNotFound(document_wrapper_or.status())) {
627       // The erased document still occupies 1 document id.
628       DocumentId new_document_id = document_id_mapper_->num_elements();
629       ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
630       iterator_status = iterator.Advance();
631       continue;
632     } else if (!document_wrapper_or.ok()) {
633       return document_wrapper_or.status();
634     }
635 
636     DocumentWrapper document_wrapper =
637         std::move(document_wrapper_or).ValueOrDie();
638     // Revalidate that this document is still compatible if requested.
639     if (revalidate_documents) {
640       if (!document_validator_.Validate(document_wrapper.document()).ok()) {
641         // Document is no longer valid with the current schema. Mark as
642         // deleted
643         DocumentId new_document_id = document_id_mapper_->num_elements();
644         ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
645         ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
646         continue;
647       }
648     }
649 
650     ICING_ASSIGN_OR_RETURN(
651         NamespaceId namespace_id,
652         namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
653                                     namespace_mapper_->num_keys()));
654 
655     // Updates key mapper and document_id mapper with the new document
656     DocumentId new_document_id = document_id_mapper_->num_elements();
657     NamespaceIdFingerprint new_doc_nsid_uri_fingerprint(
658         namespace_id, document_wrapper.document().uri());
659     ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
660         new_doc_nsid_uri_fingerprint.EncodeToCString(), new_document_id));
661     ICING_RETURN_IF_ERROR(
662         document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
663 
664     SchemaTypeId schema_type_id;
665     auto schema_type_id_or =
666         schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
667     if (absl_ports::IsNotFound(schema_type_id_or.status())) {
668       // Didn't find a SchemaTypeId. This means that the DocumentStore and
669       // the SchemaStore are out of sync. But DocumentStore can't do
670       // anything about it so just ignore this for now. This should be
671       // detected/handled by the owner of DocumentStore. Set it to some
672       // arbitrary invalid value for now, it'll get updated to the correct
673       // ID later.
674       schema_type_id = -1;
675     } else if (!schema_type_id_or.ok()) {
676       // Real error. Pass it up
677       return schema_type_id_or.status();
678     } else {
679       // We're guaranteed that SchemaTypeId is valid now
680       schema_type_id = schema_type_id_or.ValueOrDie();
681     }
682 
683     // Update corpus maps
684     NamespaceIdFingerprint corpus_nsid_schema_fingerprint(
685         namespace_id, document_wrapper.document().schema());
686     ICING_ASSIGN_OR_RETURN(CorpusId corpus_id,
687                            corpus_mapper_->GetOrPut(
688                                corpus_nsid_schema_fingerprint.EncodeToCString(),
689                                corpus_mapper_->num_keys()));
690 
691     ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
692                            GetCorpusAssociatedScoreDataToUpdate(corpus_id));
693     scoring_data.AddDocument(
694         document_wrapper.document().internal_fields().length_in_tokens());
695 
696     ICING_RETURN_IF_ERROR(
697         UpdateCorpusAssociatedScoreCache(corpus_id, scoring_data));
698 
699     int32_t scorable_property_cache_index = kInvalidScorablePropertyCacheIndex;
700     // Swallow the error when schema_type_id is not found, and skip updating the
701     // scorable property cache.
702     if (schema_type_id != -1) {
703       ICING_ASSIGN_OR_RETURN(scorable_property_cache_index,
704                              UpdateScorablePropertyCache(
705                                  document_wrapper.document(), schema_type_id));
706     }
707 
708     ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
709         new_document_id,
710         DocumentAssociatedScoreData(
711             corpus_id, document_wrapper.document().score(),
712             document_wrapper.document().creation_timestamp_ms(),
713             scorable_property_cache_index,
714             document_wrapper.document().internal_fields().length_in_tokens())));
715 
716     int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
717         document_wrapper.document().creation_timestamp_ms(),
718         document_wrapper.document().ttl_ms());
719 
720     ICING_RETURN_IF_ERROR(UpdateFilterCache(
721         new_document_id,
722         DocumentFilterData(namespace_id,
723                            new_doc_nsid_uri_fingerprint.fingerprint(),
724                            schema_type_id, expiration_timestamp_ms)));
725     iterator_status = iterator.Advance();
726   }
727 
728   if (!absl_ports::IsOutOfRange(iterator_status)) {
729     ICING_LOG(WARNING)
730         << "Failed to iterate through proto log while regenerating "
731            "derived files";
732     return absl_ports::Annotate(iterator_status,
733                                 "Failed to iterate through proto log.");
734   }
735 
736   // Shrink usage_store_ to the correct size.
737   ICING_RETURN_IF_ERROR(
738       usage_store_->TruncateTo(document_id_mapper_->num_elements()));
739 
740   // Write the header
741   ICING_RETURN_IF_ERROR(UpdateChecksum());
742   return libtextclassifier3::Status::OK;
743 }
744 
ResetDocumentKeyMapper()745 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
746   // Only one type of KeyMapper (either DynamicTrieKeyMapper or
747   // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok
748   // to call Delete() for both since Delete() returns OK if any of them doesn't
749   // exist.
750   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
751   document_key_mapper_.reset();
752   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
753   // that can support error logging.
754   libtextclassifier3::Status status =
755       DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
756   if (!status.ok()) {
757     ICING_LOG(ERROR) << status.error_message()
758                      << "Failed to delete old dynamic trie key mapper";
759     return status;
760   }
761   status = PersistentHashMapKeyMapper<DocumentId>::Delete(
762       *filesystem_, MakeUriHashMapperWorkingPath(base_dir_));
763   if (!status.ok()) {
764     ICING_LOG(ERROR) << status.error_message()
765                      << "Failed to delete old persistent hash map key mapper";
766     return status;
767   }
768 
769   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
770   // that can support error logging.
771   auto document_key_mapper_or =
772       CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
773   if (!document_key_mapper_or.ok()) {
774     ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
775                      << "Failed to re-init key mapper";
776     return document_key_mapper_or.status();
777   }
778   document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
779   return libtextclassifier3::Status::OK;
780 }
781 
ResetDocumentIdMapper()782 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
783   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
784   document_id_mapper_.reset();
785   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
786   // that can support error logging.
787   libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
788       *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
789   if (!status.ok()) {
790     ICING_LOG(ERROR) << status.error_message()
791                      << "Failed to delete old document_id mapper";
792     return status;
793   }
794   // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
795   // that can support error logging.
796   auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
797       *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
798       MemoryMappedFile::READ_WRITE_AUTO_SYNC);
799   if (!document_id_mapper_or.ok()) {
800     ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
801                      << "Failed to re-init document_id mapper";
802     return document_id_mapper_or.status();
803   }
804   document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
805   return libtextclassifier3::Status::OK;
806 }
807 
ResetDocumentAssociatedScoreCache()808 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
809   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
810   score_cache_.reset();
811   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
812       *filesystem_, MakeScoreCacheFilename(base_dir_)));
813   ICING_ASSIGN_OR_RETURN(score_cache_,
814                          FileBackedVector<DocumentAssociatedScoreData>::Create(
815                              *filesystem_, MakeScoreCacheFilename(base_dir_),
816                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
817   return libtextclassifier3::Status::OK;
818 }
819 
ResetScorablePropertyCache()820 libtextclassifier3::Status DocumentStore::ResetScorablePropertyCache() {
821   scorable_property_cache_.reset();
822   ICING_RETURN_IF_ERROR(
823       MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Delete(
824           *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));
825   ICING_ASSIGN_OR_RETURN(
826       scorable_property_cache_,
827       MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Create(
828           *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));
829   return libtextclassifier3::Status::OK;
830 }
831 
ResetCorpusAssociatedScoreCache()832 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
833   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
834   corpus_score_cache_.reset();
835   ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
836       *filesystem_, MakeCorpusScoreCache(base_dir_)));
837   ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
838                          FileBackedVector<CorpusAssociatedScoreData>::Create(
839                              *filesystem_, MakeCorpusScoreCache(base_dir_),
840                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
841   return libtextclassifier3::Status::OK;
842 }
843 
ResetFilterCache()844 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
845   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
846   filter_cache_.reset();
847   ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
848       *filesystem_, MakeFilterCacheFilename(base_dir_)));
849   ICING_ASSIGN_OR_RETURN(filter_cache_,
850                          FileBackedVector<DocumentFilterData>::Create(
851                              *filesystem_, MakeFilterCacheFilename(base_dir_),
852                              MemoryMappedFile::READ_WRITE_AUTO_SYNC));
853   return libtextclassifier3::Status::OK;
854 }
855 
ResetNamespaceMapper()856 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
857   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
858   namespace_mapper_.reset();
859   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
860   // that can support error logging.
861   libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
862       *filesystem_, MakeNamespaceMapperFilename(base_dir_));
863   if (!status.ok()) {
864     ICING_LOG(ERROR) << status.error_message()
865                      << "Failed to delete old namespace_id mapper";
866     return status;
867   }
868   ICING_ASSIGN_OR_RETURN(
869       namespace_mapper_,
870       DynamicTrieKeyMapper<NamespaceId>::Create(
871           *filesystem_, MakeNamespaceMapperFilename(base_dir_),
872           kNamespaceMapperMaxSize));
873   return libtextclassifier3::Status::OK;
874 }
875 
ResetCorpusMapper()876 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
877   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
878   corpus_mapper_.reset();
879   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
880   // that can support error logging.
881   libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
882       *filesystem_, MakeCorpusMapperFilename(base_dir_));
883   if (!status.ok()) {
884     ICING_LOG(ERROR) << status.error_message()
885                      << "Failed to delete old corpus_id mapper";
886     return status;
887   }
888   auto corpus_mapper_or =
889       DynamicTrieKeyMapper<CorpusId,
890                            fingerprint_util::FingerprintStringFormatter>::
891           Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
892                  kCorpusMapperMaxSize);
893   if (!corpus_mapper_or.ok()) {
894     return std::move(corpus_mapper_or).status();
895   }
896   corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
897   return libtextclassifier3::Status::OK;
898 }
899 
GetChecksum() const900 libtextclassifier3::StatusOr<Crc32> DocumentStore::GetChecksum() const {
901   Crc32 total_checksum;
902 
903   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
904   // that can support error logging.
905   auto checksum_or = document_log_->GetChecksum();
906   if (!checksum_or.ok()) {
907     ICING_LOG(ERROR) << checksum_or.status().error_message()
908                      << "Failed to compute checksum of DocumentLog";
909     return checksum_or.status();
910   }
911   Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
912 
913   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
914   // that can support error logging.
915   checksum_or = document_key_mapper_->GetChecksum();
916   if (!checksum_or.ok()) {
917     ICING_LOG(ERROR) << checksum_or.status().error_message()
918                      << "Failed to compute checksum of DocumentKeyMapper";
919     return checksum_or.status();
920   }
921   Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
922 
923   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
924   // that can support error logging.
925   checksum_or = document_id_mapper_->GetChecksum();
926   if (!checksum_or.ok()) {
927     ICING_LOG(ERROR) << checksum_or.status().error_message()
928                      << "Failed to compute checksum of DocumentIdMapper";
929     return checksum_or.status();
930   }
931   Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
932 
933   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
934   // that can support error logging.
935   checksum_or = score_cache_->GetChecksum();
936   if (!checksum_or.ok()) {
937     ICING_LOG(ERROR) << checksum_or.status().error_message()
938                      << "Failed to compute checksum of score cache";
939     return checksum_or.status();
940   }
941   Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
942 
943   checksum_or = scorable_property_cache_->GetChecksum();
944   if (!checksum_or.ok()) {
945     ICING_LOG(ERROR) << checksum_or.status().error_message()
946                      << "Failed to compute checksum of scorable property cache";
947     return checksum_or.status();
948   }
949   Crc32 scorable_property_cache_checksum = std::move(checksum_or).ValueOrDie();
950 
951   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
952   // that can support error logging.
953   checksum_or = filter_cache_->GetChecksum();
954   if (!checksum_or.ok()) {
955     ICING_LOG(ERROR) << checksum_or.status().error_message()
956                      << "Failed to compute checksum of filter cache";
957     return checksum_or.status();
958   }
959   Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
960 
961   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
962   // that can support error logging.
963   checksum_or = namespace_mapper_->GetChecksum();
964   if (!checksum_or.ok()) {
965     ICING_LOG(ERROR) << checksum_or.status().error_message()
966                      << "Failed to compute checksum of namespace mapper";
967     return checksum_or.status();
968   }
969   Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
970 
971   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
972   // that can support error logging.
973   checksum_or = corpus_mapper_->GetChecksum();
974   if (!checksum_or.ok()) {
975     ICING_LOG(ERROR) << checksum_or.status().error_message()
976                      << "Failed to compute checksum of corpus mapper";
977     return checksum_or.status();
978   }
979   Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
980 
981   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
982   // that can support error logging.
983   checksum_or = corpus_score_cache_->GetChecksum();
984   if (!checksum_or.ok()) {
985     ICING_LOG(WARNING) << checksum_or.status().error_message()
986                        << "Failed to compute checksum of score cache";
987     return checksum_or.status();
988   }
989   Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
990 
991   // NOTE: We purposely don't include usage_store checksum here because we can't
992   // regenerate it from ground truth documents. If it gets corrupted, we'll just
993   // clear all usage reports, but we shouldn't throw everything else in the
994   // document store out.
995 
996   total_checksum.Append(std::to_string(document_log_checksum.Get()));
997   total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
998   total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
999   total_checksum.Append(std::to_string(score_cache_checksum.Get()));
1000   total_checksum.Append(std::to_string(scorable_property_cache_checksum.Get()));
1001   total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
1002   total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
1003   total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
1004   total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
1005   return total_checksum;
1006 }
1007 
UpdateChecksum()1008 libtextclassifier3::StatusOr<Crc32> DocumentStore::UpdateChecksum() {
1009   Crc32 total_checksum;
1010 
1011   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1012   // that can support error logging.
1013   auto checksum_or = document_log_->UpdateChecksum();
1014   if (!checksum_or.ok()) {
1015     ICING_LOG(ERROR) << checksum_or.status().error_message()
1016                      << "Failed to compute checksum of DocumentLog";
1017     return checksum_or.status();
1018   }
1019   Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
1020 
1021   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1022   // that can support error logging.
1023   checksum_or = document_key_mapper_->UpdateChecksum();
1024   if (!checksum_or.ok()) {
1025     ICING_LOG(ERROR) << checksum_or.status().error_message()
1026                      << "Failed to compute checksum of DocumentKeyMapper";
1027     return checksum_or.status();
1028   }
1029   Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
1030 
1031   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1032   // that can support error logging.
1033   checksum_or = document_id_mapper_->UpdateChecksum();
1034   if (!checksum_or.ok()) {
1035     ICING_LOG(ERROR) << checksum_or.status().error_message()
1036                      << "Failed to compute checksum of DocumentIdMapper";
1037     return checksum_or.status();
1038   }
1039   Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
1040 
1041   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1042   // that can support error logging.
1043   checksum_or = score_cache_->UpdateChecksum();
1044   if (!checksum_or.ok()) {
1045     ICING_LOG(ERROR) << checksum_or.status().error_message()
1046                      << "Failed to compute checksum of score cache";
1047     return checksum_or.status();
1048   }
1049   Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
1050 
1051   checksum_or = scorable_property_cache_->UpdateChecksum();
1052   if (!checksum_or.ok()) {
1053     ICING_LOG(ERROR) << checksum_or.status().error_message()
1054                      << "Failed to compute checksum of scorable property cache";
1055     return checksum_or.status();
1056   }
1057   Crc32 scorable_property_cache_checksum = std::move(checksum_or).ValueOrDie();
1058 
1059   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1060   // that can support error logging.
1061   checksum_or = filter_cache_->UpdateChecksum();
1062   if (!checksum_or.ok()) {
1063     ICING_LOG(ERROR) << checksum_or.status().error_message()
1064                      << "Failed to compute checksum of filter cache";
1065     return checksum_or.status();
1066   }
1067   Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
1068 
1069   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1070   // that can support error logging.
1071   checksum_or = namespace_mapper_->UpdateChecksum();
1072   if (!checksum_or.ok()) {
1073     ICING_LOG(ERROR) << checksum_or.status().error_message()
1074                      << "Failed to compute checksum of namespace mapper";
1075     return checksum_or.status();
1076   }
1077   Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
1078 
1079   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1080   // that can support error logging.
1081   checksum_or = corpus_mapper_->UpdateChecksum();
1082   if (!checksum_or.ok()) {
1083     ICING_LOG(ERROR) << checksum_or.status().error_message()
1084                      << "Failed to compute checksum of corpus mapper";
1085     return checksum_or.status();
1086   }
1087   Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
1088 
1089   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1090   // that can support error logging.
1091   checksum_or = corpus_score_cache_->UpdateChecksum();
1092   if (!checksum_or.ok()) {
1093     ICING_LOG(WARNING) << checksum_or.status().error_message()
1094                        << "Failed to compute checksum of score cache";
1095     return checksum_or.status();
1096   }
1097   Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
1098 
1099   // NOTE: We purposely don't include usage_store checksum here because we can't
1100   // regenerate it from ground truth documents. If it gets corrupted, we'll just
1101   // clear all usage reports, but we shouldn't throw everything else in the
1102   // document store out.
1103 
1104   total_checksum.Append(std::to_string(document_log_checksum.Get()));
1105   total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
1106   total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
1107   total_checksum.Append(std::to_string(score_cache_checksum.Get()));
1108   total_checksum.Append(std::to_string(scorable_property_cache_checksum.Get()));
1109   total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
1110   total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
1111   total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
1112   total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
1113 
1114   // Write the header
1115   DocumentStore::Header header;
1116   header.magic = DocumentStore::Header::kMagic;
1117   header.checksum = total_checksum.Get();
1118 
1119   // This should overwrite the header.
1120   ScopedFd sfd(
1121       filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
1122   if (!sfd.is_valid() ||
1123       !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
1124       !filesystem_->DataSync(sfd.get())) {
1125     return absl_ports::InternalError(absl_ports::StrCat(
1126         "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
1127   }
1128   return total_checksum;
1129 }
1130 
HeaderExists()1131 bool DocumentStore::HeaderExists() {
1132   if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
1133     return false;
1134   }
1135 
1136   int64_t file_size =
1137       filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
1138 
1139   // If it's been truncated to size 0 before, we consider it to be a new file
1140   return file_size != 0 && file_size != Filesystem::kBadFileSize;
1141 }
1142 
1143 libtextclassifier3::StatusOr<DocumentStore::PutResult>
InternalPut(DocumentProto && document,PutDocumentStatsProto * put_document_stats)1144 DocumentStore::InternalPut(DocumentProto&& document,
1145                            PutDocumentStatsProto* put_document_stats) {
1146   std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
1147   ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
1148 
1149   if (put_document_stats != nullptr) {
1150     put_document_stats->set_document_size(document.ByteSizeLong());
1151   }
1152 
1153   // Copy fields needed before they are moved
1154   std::string name_space = document.namespace_();
1155   std::string uri = document.uri();
1156   std::string schema = document.schema();
1157   int document_score = document.score();
1158   int32_t length_in_tokens = document.internal_fields().length_in_tokens();
1159   int64_t creation_timestamp_ms = document.creation_timestamp_ms();
1160 
1161   // Sets the creation timestamp if caller hasn't specified.
1162   if (document.creation_timestamp_ms() == 0) {
1163     creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
1164     document.set_creation_timestamp_ms(creation_timestamp_ms);
1165   }
1166 
1167   int64_t expiration_timestamp_ms =
1168       CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());
1169 
1170   // Update ground truth first
1171   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1172   // that can support error logging.
1173   DocumentWrapper document_wrapper = CreateDocumentWrapper(std::move(document));
1174   auto offset_or = document_log_->WriteProto(document_wrapper);
1175   if (!offset_or.ok()) {
1176     ICING_LOG(ERROR) << offset_or.status().error_message()
1177                      << "Failed to write document";
1178     return offset_or.status();
1179   }
1180   int64_t file_offset = std::move(offset_or).ValueOrDie();
1181 
1182   // Get existing document id
1183   auto old_document_id_or = GetDocumentId(name_space, uri);
1184   if (!old_document_id_or.ok() &&
1185       !absl_ports::IsNotFound(old_document_id_or.status())) {
1186     return absl_ports::InternalError("Failed to read from key mapper");
1187   }
1188 
1189   // Creates a new document id, updates key mapper and document_id mapper
1190   DocumentId new_document_id = document_id_mapper_->num_elements();
1191   if (!IsDocumentIdValid(new_document_id)) {
1192     return absl_ports::ResourceExhaustedError(
1193         "Exceeded maximum number of documents. Try calling Optimize to reclaim "
1194         "some space.");
1195   }
1196   PutResult put_result;
1197   put_result.new_document_id = new_document_id;
1198 
1199   // Update namespace maps
1200   ICING_ASSIGN_OR_RETURN(
1201       NamespaceId namespace_id,
1202       namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
1203 
1204   NamespaceIdFingerprint new_doc_nsid_uri_fingerprint(namespace_id, uri);
1205 
1206   // Updates key mapper and document_id mapper
1207   ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
1208       new_doc_nsid_uri_fingerprint.EncodeToCString(), new_document_id));
1209   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
1210 
1211   // Update corpus maps
1212   NamespaceIdFingerprint corpus_nsid_schema_fingerprint(namespace_id, schema);
1213   ICING_ASSIGN_OR_RETURN(
1214       CorpusId corpus_id,
1215       corpus_mapper_->GetOrPut(corpus_nsid_schema_fingerprint.EncodeToCString(),
1216                                corpus_mapper_->num_keys()));
1217 
1218   ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
1219                          GetCorpusAssociatedScoreDataToUpdate(corpus_id));
1220   scoring_data.AddDocument(length_in_tokens);
1221 
1222   ICING_RETURN_IF_ERROR(
1223       UpdateCorpusAssociatedScoreCache(corpus_id, scoring_data));
1224 
1225   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1226                          schema_store_->GetSchemaTypeId(schema));
1227   ICING_ASSIGN_OR_RETURN(
1228       int scorable_property_cache_index,
1229       UpdateScorablePropertyCache(document_wrapper.document(), schema_type_id));
1230 
1231   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
1232       new_document_id, DocumentAssociatedScoreData(
1233                            corpus_id, document_score, creation_timestamp_ms,
1234                            scorable_property_cache_index, length_in_tokens)));
1235 
1236   ICING_RETURN_IF_ERROR(UpdateFilterCache(
1237       new_document_id,
1238       DocumentFilterData(namespace_id,
1239                          new_doc_nsid_uri_fingerprint.fingerprint(),
1240                          schema_type_id, expiration_timestamp_ms)));
1241 
1242   if (old_document_id_or.ok()) {
1243     // The old document exists, copy over the usage scores and delete the old
1244     // document.
1245     DocumentId old_document_id = old_document_id_or.ValueOrDie();
1246     put_result.old_document_id = old_document_id;
1247 
1248     ICING_RETURN_IF_ERROR(
1249         usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
1250                                        /*to_document_id=*/new_document_id));
1251 
1252     // Delete the old document. It's fine if it's not found since it might have
1253     // been deleted previously.
1254     auto delete_status =
1255         Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
1256     if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
1257       // Real error, pass it up.
1258       return delete_status;
1259     }
1260   }
1261 
1262   if (put_document_stats != nullptr) {
1263     put_document_stats->set_document_store_latency_ms(
1264         put_timer->GetElapsedMilliseconds());
1265   }
1266 
1267   return put_result;
1268 }
1269 
Get(const std::string_view name_space,const std::string_view uri,bool clear_internal_fields) const1270 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1271     const std::string_view name_space, const std::string_view uri,
1272     bool clear_internal_fields) const {
1273   // TODO(b/147231617): Make a better way to replace the error message in an
1274   // existing Status.
1275   auto document_id_or = GetDocumentId(name_space, uri);
1276   if (!document_id_or.ok()) {
1277     if (absl_ports::IsNotFound(document_id_or.status())) {
1278       ICING_VLOG(1) << document_id_or.status().error_message();
1279       return absl_ports::NotFoundError(absl_ports::StrCat(
1280           "Document (", name_space, ", ", uri, ") not found."));
1281     }
1282 
1283     // Real error. Log it in error level and pass it up.
1284     ICING_LOG(ERROR) << document_id_or.status().error_message();
1285     return std::move(document_id_or).status();
1286   }
1287   DocumentId document_id = document_id_or.ValueOrDie();
1288 
1289   // TODO(b/147231617): Make a better way to replace the error message in an
1290   // existing Status.
1291   auto status_or = Get(document_id, clear_internal_fields);
1292   if (!status_or.ok()) {
1293     if (absl_ports::IsNotFound(status_or.status())) {
1294       ICING_VLOG(1) << status_or.status().error_message();
1295       return absl_ports::NotFoundError(absl_ports::StrCat(
1296           "Document (", name_space, ", ", uri, ") not found."));
1297     }
1298 
1299     // Real error. Log it in error level.
1300     ICING_LOG(ERROR) << status_or.status().error_message();
1301   }
1302   return status_or;
1303 }
1304 
Get(DocumentId document_id,bool clear_internal_fields) const1305 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1306     DocumentId document_id, bool clear_internal_fields) const {
1307   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1308   auto document_filter_data_optional =
1309       GetAliveDocumentFilterData(document_id, current_time_ms);
1310   if (!document_filter_data_optional) {
1311     // The document doesn't exist. Let's check if the document id is invalid, we
1312     // will return InvalidArgumentError. Otherwise we should return NOT_FOUND
1313     // error.
1314     if (!IsDocumentIdValid(document_id)) {
1315       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1316           "Document id '%d' invalid.", document_id));
1317     }
1318     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1319         "Document id '%d' doesn't exist", document_id));
1320   }
1321 
1322   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1323   if (!document_log_offset_or.ok()) {
1324     // Since we've just checked that our document_id is valid a few lines
1325     // above, there's no reason this should fail and an error should never
1326     // happen.
1327     return absl_ports::InternalError("Failed to find document offset.");
1328   }
1329   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1330 
1331   // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1332   // that can support error logging.
1333   auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
1334   if (!document_wrapper_or.ok()) {
1335     ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
1336                      << "Failed to read from document log";
1337     return document_wrapper_or.status();
1338   }
1339   DocumentWrapper document_wrapper =
1340       std::move(document_wrapper_or).ValueOrDie();
1341   if (clear_internal_fields) {
1342     document_wrapper.mutable_document()->clear_internal_fields();
1343   }
1344 
1345   return std::move(*document_wrapper.mutable_document());
1346 }
1347 
GetScorablePropertySet(DocumentId document_id,int64_t current_time_ms) const1348 std::unique_ptr<ScorablePropertySet> DocumentStore::GetScorablePropertySet(
1349     DocumentId document_id, int64_t current_time_ms) const {
1350   if (!feature_flags_.enable_scorable_properties()) {
1351     return nullptr;
1352   }
1353 
1354   // Get scorable property cache index from the score_cache_
1355   libtextclassifier3::StatusOr<const DocumentAssociatedScoreData*>
1356       score_data_or = score_cache_->Get(document_id);
1357   if (!score_data_or.ok()) {
1358     return nullptr;
1359   }
1360   if (score_data_or.ValueOrDie()->scorable_property_cache_index() ==
1361       kInvalidScorablePropertyCacheIndex) {
1362     return nullptr;
1363   }
1364 
1365   // Get ScorablePropertySetProto.
1366   libtextclassifier3::StatusOr<ScorablePropertySetProto>
1367       scorable_property_set_proto_or = scorable_property_cache_->Read(
1368           score_data_or.ValueOrDie()->scorable_property_cache_index());
1369   if (!scorable_property_set_proto_or.ok()) {
1370     return nullptr;
1371   }
1372 
1373   // Get schema type id.
1374   auto document_filter_data_optional =
1375       GetAliveDocumentFilterData(document_id, current_time_ms);
1376   if (!document_filter_data_optional) {
1377     return nullptr;
1378   }
1379 
1380   libtextclassifier3::StatusOr<std::unique_ptr<ScorablePropertySet>>
1381       scorable_property_set_or = ScorablePropertySet::Create(
1382           std::move(scorable_property_set_proto_or.ValueOrDie()),
1383           document_filter_data_optional.value().schema_type_id(),
1384           schema_store_);
1385   if (!scorable_property_set_or.ok()) {
1386     return nullptr;
1387   }
1388   return std::move(scorable_property_set_or.ValueOrDie());
1389 }
1390 
GetDocumentId(const std::string_view name_space,const std::string_view uri) const1391 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1392     const std::string_view name_space, const std::string_view uri) const {
1393   auto namespace_id_or = namespace_mapper_->Get(name_space);
1394   libtextclassifier3::Status status = namespace_id_or.status();
1395   if (status.ok()) {
1396     NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1397     NamespaceIdFingerprint doc_nsid_uri_fingerprint(namespace_id, uri);
1398     auto document_id_or =
1399         document_key_mapper_->Get(doc_nsid_uri_fingerprint.EncodeToCString());
1400     status = document_id_or.status();
1401     if (status.ok()) {
1402       // Guaranteed to have a DocumentId now
1403       return document_id_or.ValueOrDie();
1404     }
1405   }
1406   return absl_ports::Annotate(
1407       status, absl_ports::StrCat(
1408                   "Failed to find DocumentId by key: ", name_space, ", ", uri));
1409 }
1410 
GetDocumentId(const NamespaceIdFingerprint & doc_namespace_id_uri_fingerprint) const1411 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1412     const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const {
1413   auto document_id_or = document_key_mapper_->Get(
1414       doc_namespace_id_uri_fingerprint.EncodeToCString());
1415   if (document_id_or.ok()) {
1416     return document_id_or.ValueOrDie();
1417   }
1418   return absl_ports::Annotate(
1419       std::move(document_id_or).status(),
1420       "Failed to find DocumentId by namespace id + fingerprint");
1421 }
1422 
GetAllNamespaces() const1423 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
1424   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1425       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1426 
1427   std::unordered_set<NamespaceId> existing_namespace_ids;
1428   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1429   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1430        ++document_id) {
1431     // filter_cache_->Get can only fail if document_id is < 0
1432     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1433     auto status_or_data = filter_cache_->Get(document_id);
1434     if (!status_or_data.ok()) {
1435       ICING_LOG(ERROR)
1436           << "Error while iterating over filter cache in GetAllNamespaces";
1437       return std::vector<std::string>();
1438     }
1439     const DocumentFilterData* data = status_or_data.ValueOrDie();
1440 
1441     if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
1442       existing_namespace_ids.insert(data->namespace_id());
1443     }
1444   }
1445 
1446   std::vector<std::string> existing_namespaces;
1447   for (auto itr = existing_namespace_ids.begin();
1448        itr != existing_namespace_ids.end(); ++itr) {
1449     existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
1450   }
1451   return existing_namespaces;
1452 }
1453 
GetAliveDocumentFilterData(DocumentId document_id,int64_t current_time_ms) const1454 std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
1455     DocumentId document_id, int64_t current_time_ms) const {
1456   if (IsDeleted(document_id)) {
1457     return std::nullopt;
1458   }
1459   return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
1460 }
1461 
1462 std::optional<DocumentFilterData>
GetNonDeletedDocumentFilterData(DocumentId document_id) const1463 DocumentStore::GetNonDeletedDocumentFilterData(DocumentId document_id) const {
1464   if (IsDeleted(document_id)) {
1465     return std::nullopt;
1466   }
1467 
1468   auto filter_data_or = filter_cache_->GetCopy(document_id);
1469   if (!filter_data_or.ok()) {
1470     // This would only happen if document_id is out of range of the
1471     // filter_cache, meaning we got some invalid document_id. Callers should
1472     // already have checked that their document_id is valid or used
1473     // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since the
1474     // document doesn't exist.
1475     return std::nullopt;
1476   }
1477 
1478   // At this point, it's guaranteed that the document has not been deleted. It
1479   // could still be expired, but the filter data is guaranteed to be valid here.
1480   return std::move(filter_data_or).ValueOrDie();
1481 }
1482 
IsDeleted(DocumentId document_id) const1483 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1484   auto file_offset_or = document_id_mapper_->Get(document_id);
1485   if (!file_offset_or.ok()) {
1486     // This would only happen if document_id is out of range of the
1487     // document_id_mapper, meaning we got some invalid document_id. Callers
1488     // should already have checked that their document_id is valid or used
1489     // DoesDocumentExist(WithStatus). Regardless, return true since the
1490     // document doesn't exist.
1491     return true;
1492   }
1493   int64_t file_offset = *file_offset_or.ValueOrDie();
1494   return file_offset == kDocDeletedFlag;
1495 }
1496 
1497 // Returns DocumentFilterData if the document is not expired. Otherwise,
1498 // std::nullopt.
1499 std::optional<DocumentFilterData>
GetNonExpiredDocumentFilterData(DocumentId document_id,int64_t current_time_ms) const1500 DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
1501                                                int64_t current_time_ms) const {
1502   auto filter_data_or = filter_cache_->GetCopy(document_id);
1503   if (!filter_data_or.ok()) {
1504     // This would only happen if document_id is out of range of the
1505     // filter_cache, meaning we got some invalid document_id. Callers should
1506     // already have checked that their document_id is valid or used
1507     // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since the
1508     // document doesn't exist.
1509     return std::nullopt;
1510   }
1511   DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
1512 
1513   // Check if it's past the expiration time
1514   if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
1515     return std::nullopt;
1516   }
1517   return document_filter_data;
1518 }
1519 
Delete(const std::string_view name_space,const std::string_view uri,int64_t current_time_ms)1520 libtextclassifier3::Status DocumentStore::Delete(
1521     const std::string_view name_space, const std::string_view uri,
1522     int64_t current_time_ms) {
1523   // Try to get the DocumentId first
1524   auto document_id_or = GetDocumentId(name_space, uri);
1525   if (!document_id_or.ok()) {
1526     return absl_ports::Annotate(
1527         document_id_or.status(),
1528         absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1529                            ", uri: ", uri));
1530   }
1531   return Delete(document_id_or.ValueOrDie(), current_time_ms);
1532 }
1533 
Delete(DocumentId document_id,int64_t current_time_ms)1534 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1535                                                  int64_t current_time_ms) {
1536   auto document_filter_data_optional =
1537       GetAliveDocumentFilterData(document_id, current_time_ms);
1538   if (!document_filter_data_optional) {
1539     // The document doesn't exist. We should return InvalidArgumentError if the
1540     // document id is invalid. Otherwise we should return NOT_FOUND error.
1541     if (!IsDocumentIdValid(document_id)) {
1542       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1543           "Document id '%d' invalid.", document_id));
1544     }
1545     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1546         "Document id '%d' doesn't exist", document_id));
1547   }
1548 
1549   auto document_log_offset_or = document_id_mapper_->Get(document_id);
1550   if (!document_log_offset_or.ok()) {
1551     return absl_ports::InternalError("Failed to find document offset.");
1552   }
1553   int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1554 
1555   // Erases document proto.
1556   ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1557   return ClearDerivedData(document_id);
1558 }
1559 
GetNamespaceId(std::string_view name_space) const1560 libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
1561     std::string_view name_space) const {
1562   return namespace_mapper_->Get(name_space);
1563 }
1564 
GetCorpusId(const std::string_view name_space,const std::string_view schema) const1565 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1566     const std::string_view name_space, const std::string_view schema) const {
1567   ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
1568                          namespace_mapper_->Get(name_space));
1569   NamespaceIdFingerprint corpus_nsid_schema_fp(namespace_id, schema);
1570   return corpus_mapper_->Get(corpus_nsid_schema_fp.EncodeToCString());
1571 }
1572 
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const std::string_view name_space,const std::string_view schema) const1573 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1574     ResultSpecProto::ResultGroupingType result_group_type,
1575     const std::string_view name_space, const std::string_view schema) const {
1576   auto namespace_id = GetNamespaceId(name_space);
1577   auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
1578   switch (result_group_type) {
1579     case ResultSpecProto::NONE:
1580       return absl_ports::InvalidArgumentError(
1581           "Cannot group by ResultSpecProto::NONE");
1582     case ResultSpecProto::SCHEMA_TYPE:
1583       if (schema_type_id.ok()) {
1584         return schema_type_id.ValueOrDie();
1585       }
1586       break;
1587     case ResultSpecProto::NAMESPACE:
1588       if (namespace_id.ok()) {
1589         return namespace_id.ValueOrDie();
1590       }
1591       break;
1592     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1593       if (namespace_id.ok() && schema_type_id.ok()) {
1594         // TODO(b/258715421): Temporary workaround to get a
1595         //                    ResultGroupingEntryId given the Namespace string
1596         //                    and Schema string.
1597         return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
1598       }
1599       break;
1600   }
1601   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1602 }
1603 
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const NamespaceId namespace_id,const SchemaTypeId schema_type_id) const1604 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1605     ResultSpecProto::ResultGroupingType result_group_type,
1606     const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
1607   switch (result_group_type) {
1608     case ResultSpecProto::NONE:
1609       return absl_ports::InvalidArgumentError(
1610           "Cannot group by ResultSpecProto::NONE");
1611     case ResultSpecProto::SCHEMA_TYPE:
1612       return schema_type_id;
1613     case ResultSpecProto::NAMESPACE:
1614       return namespace_id;
1615     case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1616       // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
1617       //                    given the Namespace Id and SchemaType Id.
1618       return namespace_id << 16 | schema_type_id;
1619   }
1620   return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1621 }
1622 
1623 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const1624 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1625   auto score_data_or = score_cache_->GetCopy(document_id);
1626   if (!score_data_or.ok()) {
1627     ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1628                      << " from score_cache_";
1629     return absl_ports::NotFoundError(
1630         std::move(score_data_or).status().error_message());
1631   }
1632 
1633   DocumentAssociatedScoreData document_associated_score_data =
1634       std::move(score_data_or).ValueOrDie();
1635   return document_associated_score_data;
1636 }
1637 
1638 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreData(CorpusId corpus_id) const1639 DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
1640   return corpus_score_cache_->GetCopy(corpus_id);
1641 }
1642 
1643 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const1644 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1645   auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1646   if (!corpus_scoring_data_or.ok() &&
1647       absl_ports::IsOutOfRange(corpus_scoring_data_or.status())) {
1648     // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1649     // corpus_score_cache_ for the first time. Return a default
1650     // CorpusAssociatedScoreData object in this case.
1651     return CorpusAssociatedScoreData();
1652   }
1653 
1654   return corpus_scoring_data_or;
1655 }
1656 
1657 // TODO(b/273826815): Decide on and adopt a consistent pattern for handling
1658 // NOT_FOUND 'errors' returned by our internal classes.
GetUsageScores(DocumentId document_id,int64_t current_time_ms) const1659 std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
1660     DocumentId document_id, int64_t current_time_ms) const {
1661   std::optional<DocumentFilterData> opt =
1662       GetAliveDocumentFilterData(document_id, current_time_ms);
1663   if (!opt) {
1664     return std::nullopt;
1665   }
1666   if (document_id >= usage_store_->num_elements()) {
1667     return std::nullopt;
1668   }
1669   auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1670   if (!usage_scores_or.ok()) {
1671     ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
1672                      << usage_scores_or.status().error_message();
1673     return std::nullopt;
1674   }
1675   return std::move(usage_scores_or).ValueOrDie();
1676 }
1677 
ReportUsage(const UsageReport & usage_report)1678 libtextclassifier3::Status DocumentStore::ReportUsage(
1679     const UsageReport& usage_report) {
1680   ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1681                          GetDocumentId(usage_report.document_namespace(),
1682                                        usage_report.document_uri()));
1683   // We can use the internal version here because we got our document_id from
1684   // our internal data structures. We would have thrown some error if the
1685   // namespace and/or uri were incorrect.
1686   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1687   if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1688     // Document was probably deleted or expired.
1689     return absl_ports::NotFoundError(absl_ports::StrCat(
1690         "Couldn't report usage on a nonexistent document: (namespace: '",
1691         usage_report.document_namespace(), "', uri: '",
1692         usage_report.document_uri(), "')"));
1693   }
1694 
1695   return usage_store_->AddUsageReport(usage_report, document_id);
1696 }
1697 
DeleteByNamespace(std::string_view name_space)1698 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1699     std::string_view name_space) {
1700   DeleteByGroupResult result;
1701   auto namespace_id_or = namespace_mapper_->Get(name_space);
1702   if (!namespace_id_or.ok()) {
1703     result.status = absl_ports::Annotate(
1704         namespace_id_or.status(),
1705         absl_ports::StrCat("Failed to find namespace: ", name_space));
1706     return result;
1707   }
1708   NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1709   auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1710   if (!num_deleted_or.ok()) {
1711     result.status = std::move(num_deleted_or).status();
1712     return result;
1713   }
1714 
1715   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1716   if (result.num_docs_deleted <= 0) {
1717     // Treat the fact that no existing documents had this namespace to be the
1718     // same as this namespace not existing at all.
1719     result.status = absl_ports::NotFoundError(
1720         absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1721     return result;
1722   }
1723 
1724   return result;
1725 }
1726 
DeleteBySchemaType(std::string_view schema_type)1727 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1728     std::string_view schema_type) {
1729   DeleteByGroupResult result;
1730   auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1731   if (!schema_type_id_or.ok()) {
1732     result.status = absl_ports::Annotate(
1733         schema_type_id_or.status(),
1734         absl_ports::StrCat("Failed to find schema type. schema_type: ",
1735                            schema_type));
1736     return result;
1737   }
1738   SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1739   auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1740   if (!num_deleted_or.ok()) {
1741     result.status = std::move(num_deleted_or).status();
1742     return result;
1743   }
1744 
1745   result.num_docs_deleted = num_deleted_or.ValueOrDie();
1746   if (result.num_docs_deleted <= 0) {
1747     result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1748         "No documents found with schema type '", schema_type, "'"));
1749     return result;
1750   }
1751 
1752   return result;
1753 }
1754 
BatchDelete(NamespaceId namespace_id,SchemaTypeId schema_type_id)1755 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1756     NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1757   // Tracks if there were any existing documents with this namespace that we
1758   // will mark as deleted.
1759   int num_updated_documents = 0;
1760 
1761   // Traverse FilterCache and delete all docs that match namespace_id and
1762   // schema_type_id.
1763   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1764   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1765        ++document_id) {
1766     // filter_cache_->Get can only fail if document_id is < 0
1767     // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1768     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1769                            filter_cache_->Get(document_id));
1770 
1771     // Check namespace only when the input namespace id is valid.
1772     if (namespace_id != kInvalidNamespaceId &&
1773         (data->namespace_id() == kInvalidNamespaceId ||
1774          data->namespace_id() != namespace_id)) {
1775       // The document has already been hard-deleted or isn't from the desired
1776       // namespace.
1777       continue;
1778     }
1779 
1780     // Check schema type only when the input schema type id is valid.
1781     if (schema_type_id != kInvalidSchemaTypeId &&
1782         (data->schema_type_id() == kInvalidSchemaTypeId ||
1783          data->schema_type_id() != schema_type_id)) {
1784       // The document has already been hard-deleted or doesn't have the
1785       // desired schema type.
1786       continue;
1787     }
1788 
1789     // The document has the desired namespace and schema type, it either
1790     // exists or has expired.
1791     libtextclassifier3::Status delete_status =
1792         Delete(document_id, current_time_ms);
1793     if (absl_ports::IsNotFound(delete_status)) {
1794       continue;
1795     } else if (!delete_status.ok()) {
1796       // Real error, pass up.
1797       return delete_status;
1798     }
1799     ++num_updated_documents;
1800   }
1801 
1802   return num_updated_documents;
1803 }
1804 
PersistToDisk(PersistType::Code persist_type)1805 libtextclassifier3::Status DocumentStore::PersistToDisk(
1806     PersistType::Code persist_type) {
1807   ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
1808   if (persist_type == PersistType::LITE) {
1809     // only persist the document log.
1810     return libtextclassifier3::Status::OK;
1811   }
1812   if (persist_type == PersistType::RECOVERY_PROOF) {
1813     return UpdateChecksum().status();
1814   }
1815   ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
1816   ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
1817   ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
1818   ICING_RETURN_IF_ERROR(scorable_property_cache_->PersistToDisk());
1819   ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
1820   ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
1821   ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
1822   ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
1823   ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
1824 
1825   // Update the combined checksum and write to header file.
1826   ICING_RETURN_IF_ERROR(UpdateChecksum());
1827   return libtextclassifier3::Status::OK;
1828 }
1829 
GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t> & value_or,int64_t default_value)1830 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1831                           int64_t default_value) {
1832   return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1833 }
1834 
GetMemberStorageInfo() const1835 DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
1836   DocumentStorageInfoProto storage_info;
1837   storage_info.set_document_log_size(
1838       GetValueOrDefault(document_log_->GetDiskUsage(), -1));
1839   storage_info.set_key_mapper_size(
1840       GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
1841   storage_info.set_document_id_mapper_size(
1842       GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
1843   storage_info.set_score_cache_size(
1844       GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
1845   storage_info.set_scorable_property_cache_size(
1846       GetValueOrDefault(scorable_property_cache_->GetDiskUsage(), -1));
1847   storage_info.set_filter_cache_size(
1848       GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
1849   storage_info.set_namespace_id_mapper_size(
1850       GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
1851   storage_info.set_corpus_mapper_size(
1852       GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
1853   storage_info.set_corpus_score_cache_size(
1854       GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
1855   return storage_info;
1856 }
1857 
CalculateDocumentStatusCounts(DocumentStorageInfoProto storage_info) const1858 DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
1859     DocumentStorageInfoProto storage_info) const {
1860   int total_num_alive = 0;
1861   int total_num_expired = 0;
1862   int total_num_deleted = 0;
1863   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1864       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1865   std::unordered_map<std::string, NamespaceStorageInfoProto>
1866       namespace_to_storage_info;
1867 
1868   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1869   for (DocumentId document_id = 0;
1870        document_id < document_id_mapper_->num_elements(); ++document_id) {
1871     // Check if it's deleted first.
1872     if (IsDeleted(document_id)) {
1873       // We don't have the namespace id of hard deleted documents anymore, so
1874       // we can't add to our namespace storage info.
1875       ++total_num_deleted;
1876       continue;
1877     }
1878 
1879     // At this point, the document is either alive or expired, we can get
1880     // namespace info for it.
1881     auto filter_data_or = filter_cache_->Get(document_id);
1882     if (!filter_data_or.ok()) {
1883       ICING_VLOG(1) << "Error trying to get filter data for document store "
1884                        "storage info counts.";
1885       continue;
1886     }
1887     const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
1888     auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
1889     if (itr == namespace_id_to_namespace.end()) {
1890       ICING_VLOG(1) << "Error trying to find namespace for document store "
1891                        "storage info counts.";
1892       continue;
1893     }
1894     const std::string& name_space = itr->second;
1895 
1896     // Always set the namespace, if the NamespaceStorageInfoProto didn't exist
1897     // before, we'll get back a default instance of it.
1898     NamespaceStorageInfoProto& namespace_storage_info =
1899         namespace_to_storage_info[name_space];
1900     namespace_storage_info.set_namespace_(name_space);
1901 
1902     // Get usage scores
1903     auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1904     if (!usage_scores_or.ok()) {
1905       ICING_VLOG(1) << "Error trying to get usage scores for document store "
1906                        "storage info counts.";
1907       continue;
1908     }
1909     UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
1910 
1911     // Update our stats
1912     if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
1913       ++total_num_expired;
1914       namespace_storage_info.set_num_expired_documents(
1915           namespace_storage_info.num_expired_documents() + 1);
1916       if (usage_scores.usage_type1_count > 0) {
1917         namespace_storage_info.set_num_expired_documents_usage_type1(
1918             namespace_storage_info.num_expired_documents_usage_type1() + 1);
1919       }
1920       if (usage_scores.usage_type2_count > 0) {
1921         namespace_storage_info.set_num_expired_documents_usage_type2(
1922             namespace_storage_info.num_expired_documents_usage_type2() + 1);
1923       }
1924       if (usage_scores.usage_type3_count > 0) {
1925         namespace_storage_info.set_num_expired_documents_usage_type3(
1926             namespace_storage_info.num_expired_documents_usage_type3() + 1);
1927       }
1928     } else {
1929       ++total_num_alive;
1930       namespace_storage_info.set_num_alive_documents(
1931           namespace_storage_info.num_alive_documents() + 1);
1932       if (usage_scores.usage_type1_count > 0) {
1933         namespace_storage_info.set_num_alive_documents_usage_type1(
1934             namespace_storage_info.num_alive_documents_usage_type1() + 1);
1935       }
1936       if (usage_scores.usage_type2_count > 0) {
1937         namespace_storage_info.set_num_alive_documents_usage_type2(
1938             namespace_storage_info.num_alive_documents_usage_type2() + 1);
1939       }
1940       if (usage_scores.usage_type3_count > 0) {
1941         namespace_storage_info.set_num_alive_documents_usage_type3(
1942             namespace_storage_info.num_alive_documents_usage_type3() + 1);
1943       }
1944     }
1945   }
1946 
1947   for (auto& itr : namespace_to_storage_info) {
1948     storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
1949   }
1950   storage_info.set_num_alive_documents(total_num_alive);
1951   storage_info.set_num_deleted_documents(total_num_deleted);
1952   storage_info.set_num_expired_documents(total_num_expired);
1953   return storage_info;
1954 }
1955 
GetStorageInfo() const1956 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1957   DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1958   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1959   if (directory_size != Filesystem::kBadFileSize) {
1960     storage_info.set_document_store_size(directory_size);
1961   } else {
1962     storage_info.set_document_store_size(-1);
1963   }
1964   storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1965   return CalculateDocumentStatusCounts(std::move(storage_info));
1966 }
1967 
UpdateSchemaStore(const SchemaStore * schema_store)1968 libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
1969     const SchemaStore* schema_store) {
1970   // Update all references to the SchemaStore
1971   schema_store_ = schema_store;
1972   document_validator_.UpdateSchemaStore(schema_store);
1973 
1974   int size = document_id_mapper_->num_elements();
1975   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1976   for (DocumentId document_id = 0; document_id < size; document_id++) {
1977     auto document_or = Get(document_id);
1978     if (absl_ports::IsNotFound(document_or.status())) {
1979       // Skip nonexistent documents
1980       continue;
1981     } else if (!document_or.ok()) {
1982       // Real error, pass up
1983       return absl_ports::Annotate(
1984           document_or.status(),
1985           IcingStringUtil::StringPrintf(
1986               "Failed to retrieve Document for DocumentId %d", document_id));
1987     }
1988 
1989     // Guaranteed to have a document now.
1990     DocumentProto document = document_or.ValueOrDie();
1991 
1992     // Revalidate that this document is still compatible
1993     if (document_validator_.Validate(document).ok()) {
1994       // Update the SchemaTypeId for this entry
1995       ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1996                              schema_store_->GetSchemaTypeId(document.schema()));
1997       ICING_ASSIGN_OR_RETURN(
1998           typename FileBackedVector<DocumentFilterData>::MutableView
1999               doc_filter_data_view,
2000           filter_cache_->GetMutable(document_id));
2001       doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
2002     } else {
2003       // Document is no longer valid with the new SchemaStore. Mark as
2004       // deleted
2005       auto delete_status =
2006           Delete(document.namespace_(), document.uri(), current_time_ms);
2007       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
2008         // Real error, pass up
2009         return delete_status;
2010       }
2011     }
2012   }
2013 
2014   return libtextclassifier3::Status::OK;
2015 }
2016 
OptimizedUpdateSchemaStore(const SchemaStore * schema_store,const SchemaStore::SetSchemaResult & set_schema_result)2017 libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
2018     const SchemaStore* schema_store,
2019     const SchemaStore::SetSchemaResult& set_schema_result) {
2020   if (!set_schema_result.success) {
2021     // No new schema was set, no work to be done
2022     return libtextclassifier3::Status::OK;
2023   }
2024 
2025   // Update all references to the SchemaStore
2026   schema_store_ = schema_store;
2027   document_validator_.UpdateSchemaStore(schema_store);
2028 
2029   int size = document_id_mapper_->num_elements();
2030   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2031   for (DocumentId document_id = 0; document_id < size; document_id++) {
2032     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2033       // Skip nonexistent documents
2034       continue;
2035     }
2036 
2037     // Guaranteed that the document exists now.
2038     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2039                            filter_cache_->Get(document_id));
2040 
2041     bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
2042                                filter_data->schema_type_id()) != 0;
2043 
2044     // Check if we need to update the FilterCache entry for this document. It
2045     // may have been assigned a different SchemaTypeId in the new SchemaStore.
2046     bool update_filter_cache =
2047         set_schema_result.old_schema_type_ids_changed.count(
2048             filter_data->schema_type_id()) != 0;
2049 
2050     // Check if we need to revalidate this document if the type is now
2051     // incompatible
2052     bool revalidate_document =
2053         set_schema_result.schema_types_incompatible_by_id.count(
2054             filter_data->schema_type_id()) != 0;
2055 
2056     if (update_filter_cache || revalidate_document) {
2057       ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
2058 
2059       if (update_filter_cache) {
2060         ICING_ASSIGN_OR_RETURN(
2061             SchemaTypeId schema_type_id,
2062             schema_store_->GetSchemaTypeId(document.schema()));
2063         ICING_ASSIGN_OR_RETURN(
2064             typename FileBackedVector<DocumentFilterData>::MutableView
2065                 doc_filter_data_view,
2066             filter_cache_->GetMutable(document_id));
2067         doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
2068       }
2069       if (revalidate_document) {
2070         delete_document = !document_validator_.Validate(document).ok();
2071       }
2072     }
2073 
2074     if (delete_document) {
2075       // Document is no longer valid with the new SchemaStore. Mark as deleted
2076       auto delete_status = Delete(document_id, current_time_ms);
2077       if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
2078         // Real error, pass up
2079         return delete_status;
2080       }
2081     }
2082   }
2083 
2084   return libtextclassifier3::Status::OK;
2085 }
2086 
RegenerateScorablePropertyCache(const std::unordered_set<SchemaTypeId> & schema_type_ids)2087 libtextclassifier3::Status DocumentStore::RegenerateScorablePropertyCache(
2088     const std::unordered_set<SchemaTypeId>& schema_type_ids) {
2089   if (schema_type_ids.empty()) {
2090     return libtextclassifier3::Status::OK;
2091   }
2092 
2093   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2094   for (DocumentId document_id = 0;
2095        document_id < document_id_mapper_->num_elements(); ++document_id) {
2096     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2097       continue;
2098     }
2099     // Guaranteed that the document exists now.
2100     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2101                            filter_cache_->Get(document_id));
2102     SchemaTypeId schema_type_id = filter_data->schema_type_id();
2103     if (schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
2104       continue;
2105     }
2106 
2107     ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
2108     int32_t scorable_property_cache_index = kInvalidScorablePropertyCacheIndex;
2109     ICING_ASSIGN_OR_RETURN(
2110         scorable_property_cache_index,
2111         UpdateScorablePropertyCache(document, schema_type_id));
2112 
2113     // Update the score_cache_ with the new scorable property cache index.
2114     ICING_ASSIGN_OR_RETURN(
2115         typename FileBackedVector<DocumentAssociatedScoreData>::MutableView
2116             doc_score_data_view,
2117         score_cache_->GetMutable(document_id));
2118     doc_score_data_view.Get().set_scorable_property_cache_index(
2119         scorable_property_cache_index);
2120   }
2121 
2122   return libtextclassifier3::Status::OK;
2123 }
2124 
2125 // TODO(b/121227117): Implement Optimize()
Optimize()2126 libtextclassifier3::Status DocumentStore::Optimize() {
2127   return libtextclassifier3::Status::OK;
2128 }
2129 
2130 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeInto(const std::string & new_directory,const LanguageSegmenter * lang_segmenter,std::unordered_set<std::string> && potentially_optimizable_blob_handles,OptimizeStatsProto * stats) const2131 DocumentStore::OptimizeInto(
2132     const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
2133     std::unordered_set<std::string>&& potentially_optimizable_blob_handles,
2134     OptimizeStatsProto* stats) const {
2135   // Validates directory
2136   if (new_directory == base_dir_) {
2137     return absl_ports::InvalidArgumentError(
2138         "New directory is the same as the current one.");
2139   }
2140 
2141   ICING_ASSIGN_OR_RETURN(
2142       auto doc_store_create_result,
2143       DocumentStore::Create(
2144           filesystem_, new_directory, &clock_, schema_store_, &feature_flags_,
2145           /*force_recovery_and_revalidate_documents=*/false, pre_mapping_fbv_,
2146           use_persistent_hash_map_, compression_level_,
2147           /*initialize_stats=*/nullptr));
2148   std::unique_ptr<DocumentStore> new_doc_store =
2149       std::move(doc_store_create_result.document_store);
2150 
2151   // Writes all valid docs into new document store (new directory)
2152   int document_cnt = document_id_mapper_->num_elements();
2153   int num_deleted_documents = 0;
2154   int num_expired_documents = 0;
2155   UsageStore::UsageScores default_usage;
2156   OptimizeResult result;
2157   result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
2158 
2159   result.dead_blob_handles = std::move(potentially_optimizable_blob_handles);
2160   std::unordered_map<std::string, std::vector<std::string>>
2161       type_blob_property_map;
2162   if (!result.dead_blob_handles.empty()) {
2163     // Get the blob property map from the schema store.
2164     if (num_documents() == 0) {
2165       return result;
2166     }
2167     auto type_blob_property_map_or = schema_store_->ConstructBlobPropertyMap();
2168     if (!type_blob_property_map_or.ok()) {
2169       // If we fail to retrieve this map when there *are* documents in
2170       // doc store, then something is seriously wrong. Return error.
2171       return type_blob_property_map_or.status();
2172     }
2173     type_blob_property_map = std::move(type_blob_property_map_or).ValueOrDie();
2174   }
2175 
2176   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2177   for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
2178     auto document_or = Get(document_id, /*clear_internal_fields=*/false);
2179     if (absl_ports::IsNotFound(document_or.status())) {
2180       if (IsDeleted(document_id)) {
2181         ++num_deleted_documents;
2182       } else if (!GetNonExpiredDocumentFilterData(document_id,
2183                                                   current_time_ms)) {
2184         ++num_expired_documents;
2185       }
2186       continue;
2187     } else if (!document_or.ok()) {
2188       // Real error, pass up
2189       return absl_ports::Annotate(
2190           document_or.status(),
2191           IcingStringUtil::StringPrintf(
2192               "Failed to retrieve Document for DocumentId %d", document_id));
2193     }
2194 
2195     // Guaranteed to have a document now.
2196     DocumentProto document_to_keep = std::move(document_or).ValueOrDie();
2197     // Remove blobs that still have reference are removed from the
2198     // expired_blob_handles. So that all remaining are dead blob.
2199     RemoveAliveBlobHandles(document_to_keep, type_blob_property_map,
2200                            result.dead_blob_handles);
2201 
2202     libtextclassifier3::StatusOr<PutResult> put_result_or;
2203     if (document_to_keep.internal_fields().length_in_tokens() == 0) {
2204       auto tokenized_document_or = TokenizedDocument::Create(
2205           schema_store_, lang_segmenter, document_to_keep);
2206       if (!tokenized_document_or.ok()) {
2207         return absl_ports::Annotate(
2208             tokenized_document_or.status(),
2209             IcingStringUtil::StringPrintf(
2210                 "Failed to tokenize Document for DocumentId %d", document_id));
2211       }
2212       TokenizedDocument tokenized_document(
2213           std::move(tokenized_document_or).ValueOrDie());
2214       put_result_or = new_doc_store->Put(
2215           std::move(document_to_keep), tokenized_document.num_string_tokens());
2216     } else {
2217       // TODO(b/144458732): Implement a more robust version of
2218       // TC_ASSIGN_OR_RETURN that can support error logging.
2219       put_result_or = new_doc_store->InternalPut(std::move(document_to_keep));
2220     }
2221     if (!put_result_or.ok()) {
2222       ICING_LOG(ERROR) << put_result_or.status().error_message()
2223                        << "Failed to write into new document store";
2224       return put_result_or.status();
2225     }
2226 
2227     DocumentId new_document_id = put_result_or.ValueOrDie().new_document_id;
2228     result.document_id_old_to_new[document_id] = new_document_id;
2229 
2230     // Copy over usage scores.
2231     ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
2232                            usage_store_->GetUsageScores(document_id));
2233     if (!(usage_scores == default_usage)) {
2234       // If the usage scores for this document are the default (no usage),
2235       // then don't bother setting it. No need to possibly allocate storage if
2236       // there's nothing interesting to store.
2237       ICING_RETURN_IF_ERROR(
2238           new_doc_store->SetUsageScores(new_document_id, usage_scores));
2239     }
2240   }
2241   // Construct namespace_id_old_to_new
2242   int namespace_cnt = namespace_mapper_->num_keys();
2243   std::unordered_map<NamespaceId, std::string> old_namespaces =
2244       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
2245   if (namespace_cnt != old_namespaces.size()) {
2246     // This really shouldn't happen. If it really happens, then:
2247     // - It won't block DocumentStore optimization, so don't return error here.
2248     // - Instead, write a warning log here and hint the caller to rebuild index.
2249     ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
2250                        << " vs " << old_namespaces.size();
2251     result.should_rebuild_index = true;
2252   } else {
2253     result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
2254     for (const auto& [old_namespace_id, ns] : old_namespaces) {
2255       if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
2256         // This really shouldn't happen. If it really happens, then:
2257         // - It won't block DocumentStore optimization, so don't return error
2258         //   here.
2259         // - Instead, write a warning log here and hint the caller to rebuild
2260         //   index.
2261         ICING_LOG(WARNING) << "Found unexpected namespace id "
2262                            << old_namespace_id << ". Should be in range 0 to "
2263                            << result.namespace_id_old_to_new.size()
2264                            << " (exclusive).";
2265         result.namespace_id_old_to_new.clear();
2266         result.should_rebuild_index = true;
2267         break;
2268       }
2269 
2270       auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
2271       if (!new_namespace_id_or.ok()) {
2272         if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
2273           continue;
2274         }
2275         // Real error, return it.
2276         return std::move(new_namespace_id_or).status();
2277       }
2278 
2279       NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
2280       // Safe to use bracket to assign given that we've checked the range above.
2281       result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
2282     }
2283   }
2284 
2285   if (stats != nullptr) {
2286     stats->set_num_original_documents(document_cnt);
2287     stats->set_num_deleted_documents(num_deleted_documents);
2288     stats->set_num_expired_documents(num_expired_documents);
2289     stats->set_num_original_namespaces(namespace_cnt);
2290     stats->set_num_deleted_namespaces(
2291         namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
2292   }
2293   ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
2294   return result;
2295 }
2296 
2297 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
GetOptimizeInfo() const2298 DocumentStore::GetOptimizeInfo() const {
2299   OptimizeInfo optimize_info;
2300 
2301   // Figure out our ratio of optimizable/total docs.
2302   int32_t num_documents = document_id_mapper_->num_elements();
2303   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2304   for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
2305        ++document_id) {
2306     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2307       ++optimize_info.optimizable_docs;
2308     }
2309 
2310     ++optimize_info.total_docs;
2311   }
2312 
2313   if (optimize_info.total_docs == 0) {
2314     // Can exit early since there's nothing to calculate.
2315     return optimize_info;
2316   }
2317 
2318   // Get the total element size.
2319   //
2320   // We use file size instead of disk usage here because the files are not
2321   // sparse, so it's more accurate. Disk usage rounds up to the nearest block
2322   // size.
2323   ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
2324                          document_log_->GetElementsFileSize());
2325   ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
2326                          document_id_mapper_->GetElementsFileSize());
2327   ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
2328                          score_cache_->GetElementsFileSize());
2329   ICING_ASSIGN_OR_RETURN(const int64_t scorable_property_cache_file_size,
2330                          scorable_property_cache_->GetElementsFileSize());
2331   ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
2332                          filter_cache_->GetElementsFileSize());
2333   ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
2334                          corpus_score_cache_->GetElementsFileSize());
2335 
2336   // Usage store might be sparse, but we'll still use file size for more
2337   // accurate counting.
2338   ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
2339                          usage_store_->GetElementsFileSize());
2340 
2341   // We use a combined disk usage and file size for the DynamicTrieKeyMapper
2342   // because it's backed by a trie, which has some sparse property bitmaps.
2343   ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
2344                          document_key_mapper_->GetElementsSize());
2345 
2346   // We don't include the namespace_mapper or the corpus_mapper because it's
2347   // not clear if we could recover any space even if Optimize were called.
2348   // Deleting 100s of documents could still leave a few documents of a
2349   // namespace, and then there would be no change.
2350 
2351   int64_t total_size = document_log_file_size + document_key_mapper_size +
2352                        document_id_mapper_file_size + score_cache_file_size +
2353                        scorable_property_cache_file_size +
2354                        filter_cache_file_size + corpus_score_cache_file_size +
2355                        usage_store_file_size;
2356 
2357   optimize_info.estimated_optimizable_bytes =
2358       total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
2359   return optimize_info;
2360 }
2361 
UpdateCorpusAssociatedScoreCache(CorpusId corpus_id,const CorpusAssociatedScoreData & score_data)2362 libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
2363     CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
2364   return corpus_score_cache_->Set(corpus_id, score_data);
2365 }
2366 
UpdateDocumentAssociatedScoreCache(DocumentId document_id,const DocumentAssociatedScoreData & score_data)2367 libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
2368     DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
2369   return score_cache_->Set(document_id, score_data);
2370 }
2371 
UpdateFilterCache(DocumentId document_id,const DocumentFilterData & filter_data)2372 libtextclassifier3::Status DocumentStore::UpdateFilterCache(
2373     DocumentId document_id, const DocumentFilterData& filter_data) {
2374   return filter_cache_->Set(document_id, filter_data);
2375 }
2376 
ClearDerivedData(DocumentId document_id)2377 libtextclassifier3::Status DocumentStore::ClearDerivedData(
2378     DocumentId document_id) {
2379   // We intentionally leave the data in key_mapper_ because locating that data
2380   // requires fetching namespace and uri. Leaving data in key_mapper_ should
2381   // be fine because the data is hashed.
2382 
2383   ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
2384 
2385   // Resets the score cache entry
2386   ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
2387       document_id,
2388       DocumentAssociatedScoreData(
2389           kInvalidCorpusId,
2390           /*document_score=*/-1,
2391           /*creation_timestamp_ms=*/-1,
2392           /*scorable_property_cache_index=*/kInvalidScorablePropertyCacheIndex,
2393           /*length_in_tokens=*/0)));
2394 
2395   // Resets the filter cache entry
2396   ICING_RETURN_IF_ERROR(UpdateFilterCache(
2397       document_id,
2398       DocumentFilterData(kInvalidNamespaceId, /*uri_fingerprint=*/0,
2399                          kInvalidSchemaTypeId,
2400                          /*expiration_timestamp_ms=*/-1)));
2401 
2402   // Clears the usage scores.
2403   return usage_store_->DeleteUsageScores(document_id);
2404 }
2405 
SetUsageScores(DocumentId document_id,const UsageStore::UsageScores & usage_scores)2406 libtextclassifier3::Status DocumentStore::SetUsageScores(
2407     DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
2408   return usage_store_->SetUsageScores(document_id, usage_scores);
2409 }
2410 
2411 libtextclassifier3::StatusOr<
2412     google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
CollectCorpusInfo() const2413 DocumentStore::CollectCorpusInfo() const {
2414   google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
2415   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
2416       schema_store_->GetSchema();
2417   if (!schema_proto_or.ok()) {
2418     return corpus_info;
2419   }
2420   // Maps from CorpusId to the corresponding protocol buffer in the result.
2421   std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
2422   std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
2423       GetNamespaceIdsToNamespaces(namespace_mapper_.get());
2424   const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
2425   int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2426   for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
2427        ++document_id) {
2428     if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2429       continue;
2430     }
2431     ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2432                            filter_cache_->Get(document_id));
2433     ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
2434                            score_cache_->Get(document_id));
2435     const std::string& name_space =
2436         namespace_id_to_namespace[filter_data->namespace_id()];
2437     const std::string& schema =
2438         schema_proto->types()[filter_data->schema_type_id()].schema_type();
2439     auto iter = info_map.find(score_data->corpus_id());
2440     if (iter == info_map.end()) {
2441       DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
2442       entry->set_namespace_(name_space);
2443       entry->set_schema(schema);
2444       iter = info_map.insert({score_data->corpus_id(), entry}).first;
2445     }
2446     iter->second->set_total_documents(iter->second->total_documents() + 1);
2447     iter->second->set_total_token(iter->second->total_token() +
2448                                   score_data->length_in_tokens());
2449   }
2450   return corpus_info;
2451 }
2452 
2453 libtextclassifier3::StatusOr<DocumentDebugInfoProto>
GetDebugInfo(int verbosity) const2454 DocumentStore::GetDebugInfo(int verbosity) const {
2455   DocumentDebugInfoProto debug_info;
2456   *debug_info.mutable_document_storage_info() = GetStorageInfo();
2457   ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
2458   debug_info.set_crc(crc.Get());
2459   if (verbosity > 0) {
2460     ICING_ASSIGN_OR_RETURN(
2461         google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
2462             corpus_info,
2463         CollectCorpusInfo());
2464     *debug_info.mutable_corpus_info() = std::move(corpus_info);
2465   }
2466   return debug_info;
2467 }
2468 
UpdateScorablePropertyCache(const DocumentProto & document,SchemaTypeId schema_type_id)2469 libtextclassifier3::StatusOr<int> DocumentStore::UpdateScorablePropertyCache(
2470     const DocumentProto& document, SchemaTypeId schema_type_id) {
2471   if (!feature_flags_.enable_scorable_properties()) {
2472     return kInvalidScorablePropertyCacheIndex;
2473   }
2474   ICING_ASSIGN_OR_RETURN(
2475       const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*
2476           ordered_scorable_property_info,
2477       schema_store_->GetOrderedScorablePropertyInfo(schema_type_id));
2478   if (ordered_scorable_property_info == nullptr ||
2479       ordered_scorable_property_info->empty()) {
2480     // No scorable property defined under the schema config of the
2481     // schema_type_id.
2482     return kInvalidScorablePropertyCacheIndex;
2483   }
2484   ICING_ASSIGN_OR_RETURN(
2485       std::unique_ptr<ScorablePropertySet> scorable_property_set,
2486       ScorablePropertySet::Create(document, schema_type_id, schema_store_));
2487 
2488   return scorable_property_cache_->Write(
2489       scorable_property_set->GetScorablePropertySetProto());
2490 }
2491 
2492 }  // namespace lib
2493 }  // namespace icing
2494