1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/store/document-store.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string>
22 #include <string_view>
23 #include <unordered_map>
24 #include <unordered_set>
25 #include <utility>
26 #include <vector>
27
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/absl_ports/annotate.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/feature-flags.h"
34 #include "icing/file/file-backed-proto-log.h"
35 #include "icing/file/file-backed-vector.h"
36 #include "icing/file/filesystem.h"
37 #include "icing/file/memory-mapped-file-backed-proto-log.h"
38 #include "icing/file/memory-mapped-file.h"
39 #include "icing/file/portable-file-backed-proto-log.h"
40 #include "icing/legacy/core/icing-string-util.h"
41 #include "icing/proto/debug.pb.h"
42 #include "icing/proto/document.pb.h"
43 #include "icing/proto/document_wrapper.pb.h"
44 #include "icing/proto/internal/scorable_property_set.pb.h"
45 #include "icing/proto/logging.pb.h"
46 #include "icing/proto/optimize.pb.h"
47 #include "icing/proto/persist.pb.h"
48 #include "icing/proto/schema.pb.h"
49 #include "icing/proto/storage.pb.h"
50 #include "icing/proto/usage.pb.h"
51 #include "icing/schema/property-util.h"
52 #include "icing/schema/schema-store.h"
53 #include "icing/schema/scorable_property_manager.h"
54 #include "icing/store/blob-store.h"
55 #include "icing/store/corpus-associated-scoring-data.h"
56 #include "icing/store/corpus-id.h"
57 #include "icing/store/document-associated-score-data.h"
58 #include "icing/store/document-filter-data.h"
59 #include "icing/store/document-id.h"
60 #include "icing/store/document-log-creator.h"
61 #include "icing/store/dynamic-trie-key-mapper.h"
62 #include "icing/store/key-mapper.h"
63 #include "icing/store/namespace-id-fingerprint.h"
64 #include "icing/store/namespace-id.h"
65 #include "icing/store/persistent-hash-map-key-mapper.h"
66 #include "icing/store/usage-store.h"
67 #include "icing/tokenization/language-segmenter.h"
68 #include "icing/util/clock.h"
69 #include "icing/util/crc32.h"
70 #include "icing/util/data-loss.h"
71 #include "icing/util/fingerprint-util.h"
72 #include "icing/util/logging.h"
73 #include "icing/util/scorable_property_set.h"
74 #include "icing/util/status-macros.h"
75 #include "icing/util/tokenized-document.h"
76
77 namespace icing {
78 namespace lib {
79
80 namespace {
81
// Used in DocumentId mapper to mark a document as deleted
constexpr int64_t kDocDeletedFlag = -1;
// Sentinel index meaning a document has no entry in the scorable property
// cache.
constexpr int32_t kInvalidScorablePropertyCacheIndex = -1;
// File / directory names of the derived files. All of them live directly
// under the document store's base directory (see the Make*Filename helpers
// below).
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
constexpr char kUriHashMapperWorkingPath[] = "uri_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
constexpr char kScorablePropertyCacheFilename[] = "scorable_property_cache";
constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
constexpr char kUsageStoreDirectoryName[] = "usage_store";
constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";

// Determined through manual testing to allow for 4 million uris. 4 million
// because we allow up to 4 million DocumentIds.
constexpr int32_t kUriDynamicTrieKeyMapperMaxSize =
    144 * 1024 * 1024;  // 144 MiB

// Capacity of the persistent-hash-map uri mapper, matched to the maximum
// number of documents the store supports.
constexpr int32_t kUriHashKeyMapperMaxNumEntries =
    kMaxDocumentId + 1;  // 1 << 22, 4M
// Average key-value byte size used to size the uri hash mapper:
// - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1
//   byte)
// - Value: DocumentId (4 bytes)
constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId);

// 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
// max of 128 KiB for storage.
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024;     // 384 KiB
112
CreateDocumentWrapper(DocumentProto && document)113 DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
114 DocumentWrapper document_wrapper;
115 *document_wrapper.mutable_document() = std::move(document);
116 return document_wrapper;
117 }
118
MakeHeaderFilename(const std::string & base_dir)119 std::string MakeHeaderFilename(const std::string& base_dir) {
120 return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
121 }
122
MakeUriHashMapperWorkingPath(const std::string & base_dir)123 std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) {
124 return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath);
125 }
126
MakeDocumentIdMapperFilename(const std::string & base_dir)127 std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
128 return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
129 }
130
MakeScoreCacheFilename(const std::string & base_dir)131 std::string MakeScoreCacheFilename(const std::string& base_dir) {
132 return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
133 }
134
MakeScorablePropertyCacheFilename(const std::string & base_dir)135 std::string MakeScorablePropertyCacheFilename(const std::string& base_dir) {
136 return absl_ports::StrCat(base_dir, "/", kScorablePropertyCacheFilename);
137 }
138
MakeCorpusScoreCache(const std::string & base_dir)139 std::string MakeCorpusScoreCache(const std::string& base_dir) {
140 return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
141 }
142
MakeFilterCacheFilename(const std::string & base_dir)143 std::string MakeFilterCacheFilename(const std::string& base_dir) {
144 return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
145 }
146
MakeNamespaceMapperFilename(const std::string & base_dir)147 std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
148 return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
149 }
150
MakeUsageStoreDirectoryName(const std::string & base_dir)151 std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
152 return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
153 }
154
MakeCorpusMapperFilename(const std::string & base_dir)155 std::string MakeCorpusMapperFilename(const std::string& base_dir) {
156 return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
157 }
158
// Computes the timestamp (ms since epoch) at which a document expires.
//
// A `ttl_ms` of 0 is the sentinel for "never expires"; int64_t max,
// interpreted as ms since epoch, lies in the year 292,277,026,596, so it is
// effectively "never". Overflow when adding the ttl to the creation time is
// clamped to the same "never" value.
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
                                       int64_t ttl_ms) {
  constexpr int64_t kNeverExpiresMs = std::numeric_limits<int64_t>::max();
  if (ttl_ms == 0) {
    return kNeverExpiresMs;
  }

  int64_t expiration_timestamp_ms;
  bool overflowed = __builtin_add_overflow(creation_timestamp_ms, ttl_ms,
                                           &expiration_timestamp_ms);
  return overflowed ? kNeverExpiresMs : expiration_timestamp_ms;
}
179
GetRecoveryCause(const DocumentLogCreator::CreateResult & create_result,bool force_recovery_and_revalidate_documents)180 InitializeStatsProto::RecoveryCause GetRecoveryCause(
181 const DocumentLogCreator::CreateResult& create_result,
182 bool force_recovery_and_revalidate_documents) {
183 if (force_recovery_and_revalidate_documents) {
184 return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
185 } else if (create_result.log_create_result.has_data_loss()) {
186 return InitializeStatsProto::DATA_LOSS;
187 } else if (create_result.preexisting_file_version !=
188 DocumentLogCreator::kCurrentVersion) {
189 return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
190 }
191 return InitializeStatsProto::NONE;
192 }
193
GetDataStatus(DataLoss data_loss)194 InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
195 DataLoss data_loss) {
196 switch (data_loss) {
197 case DataLoss::PARTIAL:
198 return InitializeStatsProto::PARTIAL_LOSS;
199 case DataLoss::COMPLETE:
200 return InitializeStatsProto::COMPLETE_LOSS;
201 case DataLoss::NONE:
202 return InitializeStatsProto::NO_DATA_LOSS;
203 }
204 }
205
GetNamespaceIdsToNamespaces(const KeyMapper<NamespaceId> * key_mapper)206 std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
207 const KeyMapper<NamespaceId>* key_mapper) {
208 std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
209
210 std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
211 key_mapper->GetIterator();
212 while (itr->Advance()) {
213 namespace_ids_to_namespaces.insert(
214 {itr->GetValue(), std::string(itr->GetKey())});
215 }
216 return namespace_ids_to_namespaces;
217 }
218
219 libtextclassifier3::StatusOr<std::unique_ptr<
220 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>>
CreateUriMapper(const Filesystem & filesystem,const std::string & base_dir,bool use_persistent_hash_map)221 CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir,
222 bool use_persistent_hash_map) {
223 std::string uri_hash_mapper_working_path =
224 MakeUriHashMapperWorkingPath(base_dir);
225 // Due to historic issue, we use document store's base_dir directly as
226 // DynamicTrieKeyMapper's working directory for uri mapper.
227 // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the
228 // actual files will be put under "<base_dir>/key_mapper_dir/".
229 bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
230 absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str());
231 bool persistent_hash_map_dir_exists =
232 filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str());
233 if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
234 (!use_persistent_hash_map && persistent_hash_map_dir_exists)) {
235 // Return a failure here so that the caller can properly delete and rebuild
236 // this component.
237 return absl_ports::FailedPreconditionError("Key mapper type mismatch");
238 }
239
240 if (use_persistent_hash_map) {
241 return PersistentHashMapKeyMapper<
242 DocumentId, fingerprint_util::FingerprintStringFormatter>::
243 Create(filesystem, std::move(uri_hash_mapper_working_path),
244 /*pre_mapping_fbv=*/false,
245 /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries,
246 /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize);
247 } else {
248 return DynamicTrieKeyMapper<DocumentId,
249 fingerprint_util::FingerprintStringFormatter>::
250 Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize);
251 }
252 }
253
254 // Find the existing blob handles in the given document and remove them from the
255 // dead_blob_handles set. Those are the blob handles that are still in use.
256 //
257 // This method is flag-guarded by the flag enable_blob_store. If the flag is
258 // disabled, the dead_blob_handles must be empty and this method will be a
259 // no-op.
260 //
261 // The type_blob_map is a map from schema type to a set of blob property names.
RemoveAliveBlobHandles(const DocumentProto & document,const std::unordered_map<std::string,std::vector<std::string>> & type_blob_property_map,std::unordered_set<std::string> & dead_blob_handles)262 void RemoveAliveBlobHandles(
263 const DocumentProto& document,
264 const std::unordered_map<std::string, std::vector<std::string>>&
265 type_blob_property_map,
266 std::unordered_set<std::string>& dead_blob_handles) {
267 if (dead_blob_handles.empty() ||
268 type_blob_property_map.find(document.schema()) ==
269 type_blob_property_map.end()) {
270 // This document does not have any blob properties.
271 return;
272 }
273 const std::vector<std::string>& blob_property_paths =
274 type_blob_property_map.at(document.schema());
275
276 for (const std::string& blob_property_path : blob_property_paths) {
277 auto content_or = property_util::ExtractPropertyValuesFromDocument<
278 PropertyProto::BlobHandleProto>(document, blob_property_path);
279 if (content_or.ok()) {
280 for (const PropertyProto::BlobHandleProto& blob_handle :
281 content_or.ValueOrDie()) {
282 dead_blob_handles.erase(BlobStore::BuildBlobHandleStr(blob_handle));
283 }
284 }
285 }
286 }
287
288 } // namespace
289
// Private constructor; use DocumentStore::Create() to obtain a fully
// initialized instance. Only stores the injected dependencies (which must
// outlive this instance) and configuration flags -- no I/O happens here;
// Initialize() does the actual setup.
DocumentStore::DocumentStore(const Filesystem* filesystem,
                             const std::string_view base_dir,
                             const Clock* clock,
                             const SchemaStore* schema_store,
                             const FeatureFlags* feature_flags,
                             bool pre_mapping_fbv, bool use_persistent_hash_map,
                             int32_t compression_level)
    : filesystem_(filesystem),
      base_dir_(base_dir),
      clock_(*clock),
      feature_flags_(*feature_flags),
      schema_store_(schema_store),
      document_validator_(schema_store),
      pre_mapping_fbv_(pre_mapping_fbv),
      use_persistent_hash_map_(use_persistent_hash_map),
      compression_level_(compression_level) {}
306
Put(const DocumentProto & document,int32_t num_tokens,PutDocumentStatsProto * put_document_stats)307 libtextclassifier3::StatusOr<DocumentStore::PutResult> DocumentStore::Put(
308 const DocumentProto& document, int32_t num_tokens,
309 PutDocumentStatsProto* put_document_stats) {
310 return Put(DocumentProto(document), num_tokens, put_document_stats);
311 }
312
// Move overload of Put(): records `num_tokens` on the document's internal
// fields, then hands the document off to InternalPut(), which performs the
// actual insertion.
libtextclassifier3::StatusOr<DocumentStore::PutResult> DocumentStore::Put(
    DocumentProto&& document, int32_t num_tokens,
    PutDocumentStatsProto* put_document_stats) {
  document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
  return InternalPut(std::move(document), put_document_stats);
}
319
~DocumentStore()320 DocumentStore::~DocumentStore() {
321 if (initialized_) {
322 if (!PersistToDisk(PersistType::FULL).ok()) {
323 ICING_LOG(ERROR)
324 << "Error persisting to disk in DocumentStore destructor";
325 }
326 }
327 }
328
// Factory for DocumentStore. Validates the injected dependencies, constructs
// a store, and runs Initialize(), which either loads the existing derived
// files or regenerates them from the document log.
//
// Returns a CreateResult carrying the store plus the data-loss and
// derived-files-regenerated signals the caller needs to decide whether its
// own derived state must be rebuilt.
libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
    const Filesystem* filesystem, const std::string& base_dir,
    const Clock* clock, const SchemaStore* schema_store,
    const FeatureFlags* feature_flags,
    bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv,
    bool use_persistent_hash_map, int32_t compression_level,
    InitializeStatsProto* initialize_stats) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);
  ICING_RETURN_ERROR_IF_NULL(clock);
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  ICING_RETURN_ERROR_IF_NULL(feature_flags);

  // std::unique_ptr + new rather than make_unique because the constructor is
  // private.
  auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
      filesystem, base_dir, clock, schema_store, feature_flags, pre_mapping_fbv,
      use_persistent_hash_map, compression_level));
  ICING_ASSIGN_OR_RETURN(
      InitializeResult initialize_result,
      document_store->Initialize(force_recovery_and_revalidate_documents,
                                 initialize_stats));

  CreateResult create_result;
  create_result.document_store = std::move(document_store);
  create_result.data_loss = initialize_result.data_loss;
  create_result.derived_files_regenerated =
      initialize_result.derived_files_regenerated;
  return create_result;
}
356
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)357 /* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
358 const Filesystem* filesystem, const std::string& base_dir) {
359 // Header
360 const std::string header_filename = MakeHeaderFilename(base_dir);
361 if (!filesystem->DeleteFile(MakeHeaderFilename(base_dir).c_str())) {
362 return absl_ports::InternalError("Couldn't delete header file");
363 }
364
365 // Document key mapper. Doesn't hurt to delete both dynamic trie and
366 // persistent hash map without checking.
367 ICING_RETURN_IF_ERROR(
368 DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
369 ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
370 *filesystem, MakeUriHashMapperWorkingPath(base_dir)));
371
372 // Document id mapper
373 ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
374 *filesystem, MakeDocumentIdMapperFilename(base_dir)));
375
376 // Document associated score cache
377 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
378 *filesystem, MakeScoreCacheFilename(base_dir)));
379
380 // Filter cache
381 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
382 *filesystem, MakeFilterCacheFilename(base_dir)));
383
384 // Namespace mapper
385 ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
386 *filesystem, MakeNamespaceMapperFilename(base_dir)));
387
388 // Corpus mapper
389 ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
390 *filesystem, MakeCorpusMapperFilename(base_dir)));
391
392 // Corpus associated score cache
393 ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
394 *filesystem, MakeCorpusScoreCache(base_dir)));
395
396 // Scorable Property Cache
397 ICING_RETURN_IF_ERROR(
398 MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Delete(
399 *filesystem, MakeScorablePropertyCacheFilename(base_dir)));
400
401 return libtextclassifier3::Status::OK;
402 }
403
404 libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
Initialize(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)405 DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
406 InitializeStatsProto* initialize_stats) {
407 auto create_result_or =
408 DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
409
410 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
411 // that can support error logging.
412 if (!create_result_or.ok()) {
413 ICING_LOG(ERROR) << create_result_or.status().error_message()
414 << "\nFailed to initialize DocumentLog.";
415 return create_result_or.status();
416 }
417 DocumentLogCreator::CreateResult create_result =
418 std::move(create_result_or).ValueOrDie();
419
420 document_log_ = std::move(create_result.log_create_result.proto_log);
421 InitializeStatsProto::RecoveryCause recovery_cause =
422 GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
423
424 bool derived_files_regenerated = false;
425 if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
426 ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
427 << recovery_cause << ", and create result { new_file="
428 << create_result.new_file << ", preeisting_file_version="
429 << create_result.preexisting_file_version << ", data_loss="
430 << create_result.log_create_result.data_loss
431 << "} and kCurrentVersion="
432 << DocumentLogCreator::kCurrentVersion;
433 // We can't rely on any existing derived files. Recreate them from scratch.
434 // Currently happens if:
435 // 1) This is a new log and we don't have derived files yet
436 // 2) Client wanted us to force a regeneration.
437 // 3) Log has some data loss, can't rely on existing derived data.
438 std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
439 libtextclassifier3::Status status =
440 RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
441 if (recovery_cause != InitializeStatsProto::NONE) {
442 // Only consider it a recovery if the client forced a recovery or there
443 // was data loss. Otherwise, this could just be the first time we're
444 // initializing and generating derived files.
445 derived_files_regenerated = true;
446 if (initialize_stats != nullptr) {
447 initialize_stats->set_document_store_recovery_latency_ms(
448 document_recovery_timer->GetElapsedMilliseconds());
449 initialize_stats->set_document_store_recovery_cause(recovery_cause);
450 initialize_stats->set_document_store_data_status(
451 GetDataStatus(create_result.log_create_result.data_loss));
452 }
453 }
454 if (!status.ok()) {
455 ICING_LOG(ERROR)
456 << "Failed to regenerate derived files for DocumentStore";
457 return status;
458 }
459 } else {
460 if (!InitializeExistingDerivedFiles().ok()) {
461 ICING_LOG(WARNING)
462 << "Couldn't find derived files or failed to initialize them, "
463 "regenerating derived files for DocumentStore.";
464 std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
465 derived_files_regenerated = true;
466 libtextclassifier3::Status status = RegenerateDerivedFiles(
467 /*force_recovery_and_revalidate_documents=*/false);
468 if (initialize_stats != nullptr) {
469 initialize_stats->set_document_store_recovery_cause(
470 InitializeStatsProto::IO_ERROR);
471 initialize_stats->set_document_store_recovery_latency_ms(
472 document_recovery_timer->GetElapsedMilliseconds());
473 }
474 if (!status.ok()) {
475 ICING_LOG(ERROR)
476 << "Failed to regenerate derived files for DocumentStore";
477 return status;
478 }
479 }
480 }
481
482 initialized_ = true;
483 if (initialize_stats != nullptr) {
484 initialize_stats->set_num_documents(document_id_mapper_->num_elements());
485 }
486
487 InitializeResult initialize_result = {
488 .data_loss = create_result.log_create_result.data_loss,
489 .derived_files_regenerated = derived_files_regenerated};
490 return initialize_result;
491 }
492
// Loads all derived files produced by a previous session and verifies they
// are mutually consistent.
//
// Returns InternalError if the header is missing/unreadable, has a bad
// magic, any derived component fails to open, or the combined checksum does
// not match the one persisted in the header. In every failure case the
// caller falls back to RegenerateDerivedFiles().
libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
  if (!HeaderExists()) {
    // Without a header, we don't know if things are consistent between each
    // other so the caller should just regenerate everything from ground
    // truth.
    return absl_ports::InternalError("DocumentStore header doesn't exist");
  }

  DocumentStore::Header header;
  if (!filesystem_->Read(MakeHeaderFilename(base_dir_).c_str(), &header,
                         sizeof(header))) {
    return absl_ports::InternalError(
        absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
  }

  if (header.magic != DocumentStore::Header::kMagic) {
    return absl_ports::InternalError(absl_ports::StrCat(
        "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
  }

  // uri -> DocumentId mapper.
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_key_mapper_or =
      CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
  if (!document_key_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
                     << "Failed to initialize KeyMapper";
    return document_key_mapper_or.status();
  }
  document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();

  // DocumentId -> log offset mapper.
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
      *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
      MemoryMappedFile::READ_WRITE_AUTO_SYNC);
  if (!document_id_mapper_or.ok()) {
    ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
                     << "Failed to initialize DocumentIdMapper";
    return document_id_mapper_or.status();
  }
  document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();

  // Per-document scoring data.
  ICING_ASSIGN_OR_RETURN(score_cache_,
                         FileBackedVector<DocumentAssociatedScoreData>::Create(
                             *filesystem_, MakeScoreCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(
      scorable_property_cache_,
      MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Create(
          *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));

  // Per-document filter data (namespace / schema type / expiration).
  ICING_ASSIGN_OR_RETURN(filter_cache_,
                         FileBackedVector<DocumentFilterData>::Create(
                             *filesystem_, MakeFilterCacheFilename(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  ICING_ASSIGN_OR_RETURN(
      namespace_mapper_,
      DynamicTrieKeyMapper<NamespaceId>::Create(
          *filesystem_, MakeNamespaceMapperFilename(base_dir_),
          kNamespaceMapperMaxSize));

  ICING_ASSIGN_OR_RETURN(
      usage_store_,
      UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));

  auto corpus_mapper_or =
      DynamicTrieKeyMapper<CorpusId,
                           fingerprint_util::FingerprintStringFormatter>::
          Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
                 kCorpusMapperMaxSize);
  if (!corpus_mapper_or.ok()) {
    return std::move(corpus_mapper_or).status();
  }
  corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();

  ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
                         FileBackedVector<CorpusAssociatedScoreData>::Create(
                             *filesystem_, MakeCorpusScoreCache(base_dir_),
                             MemoryMappedFile::READ_WRITE_AUTO_SYNC));

  // Ensure the usage store is the correct size.
  ICING_RETURN_IF_ERROR(
      usage_store_->TruncateTo(document_id_mapper_->num_elements()));

  // Verify the loaded components against the checksum persisted in the
  // header; any mismatch means the derived files are stale or corrupt.
  Crc32 expected_checksum(header.checksum);
  ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
  if (checksum != expected_checksum) {
    return absl_ports::InternalError(
        "Combined checksum of DocStore was inconsistent");
  }

  return libtextclassifier3::Status::OK;
}
589
RegenerateDerivedFiles(bool revalidate_documents)590 libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
591 bool revalidate_documents) {
592 ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
593 ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
594 ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
595 ICING_RETURN_IF_ERROR(ResetScorablePropertyCache());
596 ICING_RETURN_IF_ERROR(ResetFilterCache());
597 ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
598 ICING_RETURN_IF_ERROR(ResetCorpusMapper());
599 ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
600
601 // Creates a new UsageStore instance. Note that we don't reset the data in
602 // usage store here because we're not able to regenerate the usage scores.
603 ICING_ASSIGN_OR_RETURN(
604 usage_store_,
605 UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
606
607 // Iterates through document log
608 auto iterator = document_log_->GetIterator();
609 auto iterator_status = iterator.Advance();
610 libtextclassifier3::StatusOr<int64_t> element_size =
611 document_log_->GetElementsFileSize();
612 libtextclassifier3::StatusOr<int64_t> disk_usage =
613 document_log_->GetDiskUsage();
614 if (element_size.ok() && disk_usage.ok()) {
615 ICING_VLOG(1) << "Starting recovery of document store. Document store "
616 "elements file size:"
617 << element_size.ValueOrDie()
618 << ", disk usage=" << disk_usage.ValueOrDie();
619 }
620 while (iterator_status.ok()) {
621 ICING_VLOG(2) << "Attempting to read document at offset="
622 << iterator.GetOffset();
623 libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
624 document_log_->ReadProto(iterator.GetOffset());
625
626 if (absl_ports::IsNotFound(document_wrapper_or.status())) {
627 // The erased document still occupies 1 document id.
628 DocumentId new_document_id = document_id_mapper_->num_elements();
629 ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
630 iterator_status = iterator.Advance();
631 continue;
632 } else if (!document_wrapper_or.ok()) {
633 return document_wrapper_or.status();
634 }
635
636 DocumentWrapper document_wrapper =
637 std::move(document_wrapper_or).ValueOrDie();
638 // Revalidate that this document is still compatible if requested.
639 if (revalidate_documents) {
640 if (!document_validator_.Validate(document_wrapper.document()).ok()) {
641 // Document is no longer valid with the current schema. Mark as
642 // deleted
643 DocumentId new_document_id = document_id_mapper_->num_elements();
644 ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
645 ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
646 continue;
647 }
648 }
649
650 ICING_ASSIGN_OR_RETURN(
651 NamespaceId namespace_id,
652 namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
653 namespace_mapper_->num_keys()));
654
655 // Updates key mapper and document_id mapper with the new document
656 DocumentId new_document_id = document_id_mapper_->num_elements();
657 NamespaceIdFingerprint new_doc_nsid_uri_fingerprint(
658 namespace_id, document_wrapper.document().uri());
659 ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
660 new_doc_nsid_uri_fingerprint.EncodeToCString(), new_document_id));
661 ICING_RETURN_IF_ERROR(
662 document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
663
664 SchemaTypeId schema_type_id;
665 auto schema_type_id_or =
666 schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
667 if (absl_ports::IsNotFound(schema_type_id_or.status())) {
668 // Didn't find a SchemaTypeId. This means that the DocumentStore and
669 // the SchemaStore are out of sync. But DocumentStore can't do
670 // anything about it so just ignore this for now. This should be
671 // detected/handled by the owner of DocumentStore. Set it to some
672 // arbitrary invalid value for now, it'll get updated to the correct
673 // ID later.
674 schema_type_id = -1;
675 } else if (!schema_type_id_or.ok()) {
676 // Real error. Pass it up
677 return schema_type_id_or.status();
678 } else {
679 // We're guaranteed that SchemaTypeId is valid now
680 schema_type_id = schema_type_id_or.ValueOrDie();
681 }
682
683 // Update corpus maps
684 NamespaceIdFingerprint corpus_nsid_schema_fingerprint(
685 namespace_id, document_wrapper.document().schema());
686 ICING_ASSIGN_OR_RETURN(CorpusId corpus_id,
687 corpus_mapper_->GetOrPut(
688 corpus_nsid_schema_fingerprint.EncodeToCString(),
689 corpus_mapper_->num_keys()));
690
691 ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
692 GetCorpusAssociatedScoreDataToUpdate(corpus_id));
693 scoring_data.AddDocument(
694 document_wrapper.document().internal_fields().length_in_tokens());
695
696 ICING_RETURN_IF_ERROR(
697 UpdateCorpusAssociatedScoreCache(corpus_id, scoring_data));
698
699 int32_t scorable_property_cache_index = kInvalidScorablePropertyCacheIndex;
700 // Swallow the error when schema_type_id is not found, and skip updating the
701 // scorable property cache.
702 if (schema_type_id != -1) {
703 ICING_ASSIGN_OR_RETURN(scorable_property_cache_index,
704 UpdateScorablePropertyCache(
705 document_wrapper.document(), schema_type_id));
706 }
707
708 ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
709 new_document_id,
710 DocumentAssociatedScoreData(
711 corpus_id, document_wrapper.document().score(),
712 document_wrapper.document().creation_timestamp_ms(),
713 scorable_property_cache_index,
714 document_wrapper.document().internal_fields().length_in_tokens())));
715
716 int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
717 document_wrapper.document().creation_timestamp_ms(),
718 document_wrapper.document().ttl_ms());
719
720 ICING_RETURN_IF_ERROR(UpdateFilterCache(
721 new_document_id,
722 DocumentFilterData(namespace_id,
723 new_doc_nsid_uri_fingerprint.fingerprint(),
724 schema_type_id, expiration_timestamp_ms)));
725 iterator_status = iterator.Advance();
726 }
727
728 if (!absl_ports::IsOutOfRange(iterator_status)) {
729 ICING_LOG(WARNING)
730 << "Failed to iterate through proto log while regenerating "
731 "derived files";
732 return absl_ports::Annotate(iterator_status,
733 "Failed to iterate through proto log.");
734 }
735
736 // Shrink usage_store_ to the correct size.
737 ICING_RETURN_IF_ERROR(
738 usage_store_->TruncateTo(document_id_mapper_->num_elements()));
739
740 // Write the header
741 ICING_RETURN_IF_ERROR(UpdateChecksum());
742 return libtextclassifier3::Status::OK;
743 }
744
ResetDocumentKeyMapper()745 libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
746 // Only one type of KeyMapper (either DynamicTrieKeyMapper or
747 // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok
748 // to call Delete() for both since Delete() returns OK if any of them doesn't
749 // exist.
750 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
751 document_key_mapper_.reset();
752 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
753 // that can support error logging.
754 libtextclassifier3::Status status =
755 DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
756 if (!status.ok()) {
757 ICING_LOG(ERROR) << status.error_message()
758 << "Failed to delete old dynamic trie key mapper";
759 return status;
760 }
761 status = PersistentHashMapKeyMapper<DocumentId>::Delete(
762 *filesystem_, MakeUriHashMapperWorkingPath(base_dir_));
763 if (!status.ok()) {
764 ICING_LOG(ERROR) << status.error_message()
765 << "Failed to delete old persistent hash map key mapper";
766 return status;
767 }
768
769 // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
770 // that can support error logging.
771 auto document_key_mapper_or =
772 CreateUriMapper(*filesystem_, base_dir_, use_persistent_hash_map_);
773 if (!document_key_mapper_or.ok()) {
774 ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
775 << "Failed to re-init key mapper";
776 return document_key_mapper_or.status();
777 }
778 document_key_mapper_ = std::move(document_key_mapper_or).ValueOrDie();
779 return libtextclassifier3::Status::OK;
780 }
781
ResetDocumentIdMapper()782 libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
783 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
784 document_id_mapper_.reset();
785 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
786 // that can support error logging.
787 libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
788 *filesystem_, MakeDocumentIdMapperFilename(base_dir_));
789 if (!status.ok()) {
790 ICING_LOG(ERROR) << status.error_message()
791 << "Failed to delete old document_id mapper";
792 return status;
793 }
794 // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
795 // that can support error logging.
796 auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
797 *filesystem_, MakeDocumentIdMapperFilename(base_dir_),
798 MemoryMappedFile::READ_WRITE_AUTO_SYNC);
799 if (!document_id_mapper_or.ok()) {
800 ICING_LOG(ERROR) << document_id_mapper_or.status().error_message()
801 << "Failed to re-init document_id mapper";
802 return document_id_mapper_or.status();
803 }
804 document_id_mapper_ = std::move(document_id_mapper_or).ValueOrDie();
805 return libtextclassifier3::Status::OK;
806 }
807
ResetDocumentAssociatedScoreCache()808 libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
809 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
810 score_cache_.reset();
811 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
812 *filesystem_, MakeScoreCacheFilename(base_dir_)));
813 ICING_ASSIGN_OR_RETURN(score_cache_,
814 FileBackedVector<DocumentAssociatedScoreData>::Create(
815 *filesystem_, MakeScoreCacheFilename(base_dir_),
816 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
817 return libtextclassifier3::Status::OK;
818 }
819
ResetScorablePropertyCache()820 libtextclassifier3::Status DocumentStore::ResetScorablePropertyCache() {
821 scorable_property_cache_.reset();
822 ICING_RETURN_IF_ERROR(
823 MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Delete(
824 *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));
825 ICING_ASSIGN_OR_RETURN(
826 scorable_property_cache_,
827 MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>::Create(
828 *filesystem_, MakeScorablePropertyCacheFilename(base_dir_)));
829 return libtextclassifier3::Status::OK;
830 }
831
ResetCorpusAssociatedScoreCache()832 libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
833 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
834 corpus_score_cache_.reset();
835 ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
836 *filesystem_, MakeCorpusScoreCache(base_dir_)));
837 ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
838 FileBackedVector<CorpusAssociatedScoreData>::Create(
839 *filesystem_, MakeCorpusScoreCache(base_dir_),
840 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
841 return libtextclassifier3::Status::OK;
842 }
843
ResetFilterCache()844 libtextclassifier3::Status DocumentStore::ResetFilterCache() {
845 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
846 filter_cache_.reset();
847 ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
848 *filesystem_, MakeFilterCacheFilename(base_dir_)));
849 ICING_ASSIGN_OR_RETURN(filter_cache_,
850 FileBackedVector<DocumentFilterData>::Create(
851 *filesystem_, MakeFilterCacheFilename(base_dir_),
852 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
853 return libtextclassifier3::Status::OK;
854 }
855
ResetNamespaceMapper()856 libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
857 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
858 namespace_mapper_.reset();
859 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
860 // that can support error logging.
861 libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
862 *filesystem_, MakeNamespaceMapperFilename(base_dir_));
863 if (!status.ok()) {
864 ICING_LOG(ERROR) << status.error_message()
865 << "Failed to delete old namespace_id mapper";
866 return status;
867 }
868 ICING_ASSIGN_OR_RETURN(
869 namespace_mapper_,
870 DynamicTrieKeyMapper<NamespaceId>::Create(
871 *filesystem_, MakeNamespaceMapperFilename(base_dir_),
872 kNamespaceMapperMaxSize));
873 return libtextclassifier3::Status::OK;
874 }
875
ResetCorpusMapper()876 libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
877 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
878 corpus_mapper_.reset();
879 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
880 // that can support error logging.
881 libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
882 *filesystem_, MakeCorpusMapperFilename(base_dir_));
883 if (!status.ok()) {
884 ICING_LOG(ERROR) << status.error_message()
885 << "Failed to delete old corpus_id mapper";
886 return status;
887 }
888 auto corpus_mapper_or =
889 DynamicTrieKeyMapper<CorpusId,
890 fingerprint_util::FingerprintStringFormatter>::
891 Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
892 kCorpusMapperMaxSize);
893 if (!corpus_mapper_or.ok()) {
894 return std::move(corpus_mapper_or).status();
895 }
896 corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
897 return libtextclassifier3::Status::OK;
898 }
899
GetChecksum() const900 libtextclassifier3::StatusOr<Crc32> DocumentStore::GetChecksum() const {
901 Crc32 total_checksum;
902
903 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
904 // that can support error logging.
905 auto checksum_or = document_log_->GetChecksum();
906 if (!checksum_or.ok()) {
907 ICING_LOG(ERROR) << checksum_or.status().error_message()
908 << "Failed to compute checksum of DocumentLog";
909 return checksum_or.status();
910 }
911 Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
912
913 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
914 // that can support error logging.
915 checksum_or = document_key_mapper_->GetChecksum();
916 if (!checksum_or.ok()) {
917 ICING_LOG(ERROR) << checksum_or.status().error_message()
918 << "Failed to compute checksum of DocumentKeyMapper";
919 return checksum_or.status();
920 }
921 Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
922
923 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
924 // that can support error logging.
925 checksum_or = document_id_mapper_->GetChecksum();
926 if (!checksum_or.ok()) {
927 ICING_LOG(ERROR) << checksum_or.status().error_message()
928 << "Failed to compute checksum of DocumentIdMapper";
929 return checksum_or.status();
930 }
931 Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
932
933 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
934 // that can support error logging.
935 checksum_or = score_cache_->GetChecksum();
936 if (!checksum_or.ok()) {
937 ICING_LOG(ERROR) << checksum_or.status().error_message()
938 << "Failed to compute checksum of score cache";
939 return checksum_or.status();
940 }
941 Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
942
943 checksum_or = scorable_property_cache_->GetChecksum();
944 if (!checksum_or.ok()) {
945 ICING_LOG(ERROR) << checksum_or.status().error_message()
946 << "Failed to compute checksum of scorable property cache";
947 return checksum_or.status();
948 }
949 Crc32 scorable_property_cache_checksum = std::move(checksum_or).ValueOrDie();
950
951 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
952 // that can support error logging.
953 checksum_or = filter_cache_->GetChecksum();
954 if (!checksum_or.ok()) {
955 ICING_LOG(ERROR) << checksum_or.status().error_message()
956 << "Failed to compute checksum of filter cache";
957 return checksum_or.status();
958 }
959 Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
960
961 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
962 // that can support error logging.
963 checksum_or = namespace_mapper_->GetChecksum();
964 if (!checksum_or.ok()) {
965 ICING_LOG(ERROR) << checksum_or.status().error_message()
966 << "Failed to compute checksum of namespace mapper";
967 return checksum_or.status();
968 }
969 Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
970
971 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
972 // that can support error logging.
973 checksum_or = corpus_mapper_->GetChecksum();
974 if (!checksum_or.ok()) {
975 ICING_LOG(ERROR) << checksum_or.status().error_message()
976 << "Failed to compute checksum of corpus mapper";
977 return checksum_or.status();
978 }
979 Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
980
981 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
982 // that can support error logging.
983 checksum_or = corpus_score_cache_->GetChecksum();
984 if (!checksum_or.ok()) {
985 ICING_LOG(WARNING) << checksum_or.status().error_message()
986 << "Failed to compute checksum of score cache";
987 return checksum_or.status();
988 }
989 Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
990
991 // NOTE: We purposely don't include usage_store checksum here because we can't
992 // regenerate it from ground truth documents. If it gets corrupted, we'll just
993 // clear all usage reports, but we shouldn't throw everything else in the
994 // document store out.
995
996 total_checksum.Append(std::to_string(document_log_checksum.Get()));
997 total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
998 total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
999 total_checksum.Append(std::to_string(score_cache_checksum.Get()));
1000 total_checksum.Append(std::to_string(scorable_property_cache_checksum.Get()));
1001 total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
1002 total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
1003 total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
1004 total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
1005 return total_checksum;
1006 }
1007
UpdateChecksum()1008 libtextclassifier3::StatusOr<Crc32> DocumentStore::UpdateChecksum() {
1009 Crc32 total_checksum;
1010
1011 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1012 // that can support error logging.
1013 auto checksum_or = document_log_->UpdateChecksum();
1014 if (!checksum_or.ok()) {
1015 ICING_LOG(ERROR) << checksum_or.status().error_message()
1016 << "Failed to compute checksum of DocumentLog";
1017 return checksum_or.status();
1018 }
1019 Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
1020
1021 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1022 // that can support error logging.
1023 checksum_or = document_key_mapper_->UpdateChecksum();
1024 if (!checksum_or.ok()) {
1025 ICING_LOG(ERROR) << checksum_or.status().error_message()
1026 << "Failed to compute checksum of DocumentKeyMapper";
1027 return checksum_or.status();
1028 }
1029 Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
1030
1031 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1032 // that can support error logging.
1033 checksum_or = document_id_mapper_->UpdateChecksum();
1034 if (!checksum_or.ok()) {
1035 ICING_LOG(ERROR) << checksum_or.status().error_message()
1036 << "Failed to compute checksum of DocumentIdMapper";
1037 return checksum_or.status();
1038 }
1039 Crc32 document_id_mapper_checksum = std::move(checksum_or).ValueOrDie();
1040
1041 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1042 // that can support error logging.
1043 checksum_or = score_cache_->UpdateChecksum();
1044 if (!checksum_or.ok()) {
1045 ICING_LOG(ERROR) << checksum_or.status().error_message()
1046 << "Failed to compute checksum of score cache";
1047 return checksum_or.status();
1048 }
1049 Crc32 score_cache_checksum = std::move(checksum_or).ValueOrDie();
1050
1051 checksum_or = scorable_property_cache_->UpdateChecksum();
1052 if (!checksum_or.ok()) {
1053 ICING_LOG(ERROR) << checksum_or.status().error_message()
1054 << "Failed to compute checksum of scorable property cache";
1055 return checksum_or.status();
1056 }
1057 Crc32 scorable_property_cache_checksum = std::move(checksum_or).ValueOrDie();
1058
1059 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1060 // that can support error logging.
1061 checksum_or = filter_cache_->UpdateChecksum();
1062 if (!checksum_or.ok()) {
1063 ICING_LOG(ERROR) << checksum_or.status().error_message()
1064 << "Failed to compute checksum of filter cache";
1065 return checksum_or.status();
1066 }
1067 Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
1068
1069 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1070 // that can support error logging.
1071 checksum_or = namespace_mapper_->UpdateChecksum();
1072 if (!checksum_or.ok()) {
1073 ICING_LOG(ERROR) << checksum_or.status().error_message()
1074 << "Failed to compute checksum of namespace mapper";
1075 return checksum_or.status();
1076 }
1077 Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
1078
1079 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1080 // that can support error logging.
1081 checksum_or = corpus_mapper_->UpdateChecksum();
1082 if (!checksum_or.ok()) {
1083 ICING_LOG(ERROR) << checksum_or.status().error_message()
1084 << "Failed to compute checksum of corpus mapper";
1085 return checksum_or.status();
1086 }
1087 Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
1088
1089 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1090 // that can support error logging.
1091 checksum_or = corpus_score_cache_->UpdateChecksum();
1092 if (!checksum_or.ok()) {
1093 ICING_LOG(WARNING) << checksum_or.status().error_message()
1094 << "Failed to compute checksum of score cache";
1095 return checksum_or.status();
1096 }
1097 Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
1098
1099 // NOTE: We purposely don't include usage_store checksum here because we can't
1100 // regenerate it from ground truth documents. If it gets corrupted, we'll just
1101 // clear all usage reports, but we shouldn't throw everything else in the
1102 // document store out.
1103
1104 total_checksum.Append(std::to_string(document_log_checksum.Get()));
1105 total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
1106 total_checksum.Append(std::to_string(document_id_mapper_checksum.Get()));
1107 total_checksum.Append(std::to_string(score_cache_checksum.Get()));
1108 total_checksum.Append(std::to_string(scorable_property_cache_checksum.Get()));
1109 total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
1110 total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
1111 total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
1112 total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
1113
1114 // Write the header
1115 DocumentStore::Header header;
1116 header.magic = DocumentStore::Header::kMagic;
1117 header.checksum = total_checksum.Get();
1118
1119 // This should overwrite the header.
1120 ScopedFd sfd(
1121 filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
1122 if (!sfd.is_valid() ||
1123 !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
1124 !filesystem_->DataSync(sfd.get())) {
1125 return absl_ports::InternalError(absl_ports::StrCat(
1126 "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
1127 }
1128 return total_checksum;
1129 }
1130
HeaderExists()1131 bool DocumentStore::HeaderExists() {
1132 if (!filesystem_->FileExists(MakeHeaderFilename(base_dir_).c_str())) {
1133 return false;
1134 }
1135
1136 int64_t file_size =
1137 filesystem_->GetFileSize(MakeHeaderFilename(base_dir_).c_str());
1138
1139 // If it's been truncated to size 0 before, we consider it to be a new file
1140 return file_size != 0 && file_size != Filesystem::kBadFileSize;
1141 }
1142
// Writes `document` into the store: appends it to the ground-truth document
// log first, then updates every derived structure (key mapper, id mapper,
// corpus maps, score caches, filter cache). If a document with the same
// (namespace, uri) already exists, its usage scores are cloned onto the new
// document and the old one is soft-deleted.
//
// The ordering matters: the ground truth is persisted before any derived
// state so that derived files can always be regenerated from the log.
//
// Args:
//   document: the document to store; consumed by this call.
//   put_document_stats: optional; when non-null, receives size/latency stats.
//
// Returns:
//   PutResult with the new (and, if replaced, old) DocumentId on success.
//   RESOURCE_EXHAUSTED if the DocumentId space is exhausted.
//   INTERNAL on key-mapper read failure; other errors passed through.
libtextclassifier3::StatusOr<DocumentStore::PutResult>
DocumentStore::InternalPut(DocumentProto&& document,
                           PutDocumentStatsProto* put_document_stats) {
  std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
  ICING_RETURN_IF_ERROR(document_validator_.Validate(document));

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_size(document.ByteSizeLong());
  }

  // Copy fields needed before they are moved
  std::string name_space = document.namespace_();
  std::string uri = document.uri();
  std::string schema = document.schema();
  int document_score = document.score();
  int32_t length_in_tokens = document.internal_fields().length_in_tokens();
  int64_t creation_timestamp_ms = document.creation_timestamp_ms();

  // Sets the creation timestamp if caller hasn't specified.
  if (document.creation_timestamp_ms() == 0) {
    creation_timestamp_ms = clock_.GetSystemTimeMilliseconds();
    document.set_creation_timestamp_ms(creation_timestamp_ms);
  }

  int64_t expiration_timestamp_ms =
      CalculateExpirationTimestampMs(creation_timestamp_ms, document.ttl_ms());

  // Update ground truth first
  // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
  // that can support error logging.
  DocumentWrapper document_wrapper = CreateDocumentWrapper(std::move(document));
  auto offset_or = document_log_->WriteProto(document_wrapper);
  if (!offset_or.ok()) {
    ICING_LOG(ERROR) << offset_or.status().error_message()
                     << "Failed to write document";
    return offset_or.status();
  }
  int64_t file_offset = std::move(offset_or).ValueOrDie();

  // Get existing document id. NOT_FOUND is the normal "first insert" case;
  // any other failure is a real error.
  auto old_document_id_or = GetDocumentId(name_space, uri);
  if (!old_document_id_or.ok() &&
      !absl_ports::IsNotFound(old_document_id_or.status())) {
    return absl_ports::InternalError("Failed to read from key mapper");
  }

  // Creates a new document id, updates key mapper and document_id mapper.
  // New ids are assigned densely: the next id is the current element count.
  DocumentId new_document_id = document_id_mapper_->num_elements();
  if (!IsDocumentIdValid(new_document_id)) {
    return absl_ports::ResourceExhaustedError(
        "Exceeded maximum number of documents. Try calling Optimize to reclaim "
        "some space.");
  }
  PutResult put_result;
  put_result.new_document_id = new_document_id;

  // Update namespace maps
  ICING_ASSIGN_OR_RETURN(
      NamespaceId namespace_id,
      namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));

  NamespaceIdFingerprint new_doc_nsid_uri_fingerprint(namespace_id, uri);

  // Updates key mapper and document_id mapper
  ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
      new_doc_nsid_uri_fingerprint.EncodeToCString(), new_document_id));
  ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));

  // Update corpus maps. A corpus is keyed by (namespace id, schema type).
  NamespaceIdFingerprint corpus_nsid_schema_fingerprint(namespace_id, schema);
  ICING_ASSIGN_OR_RETURN(
      CorpusId corpus_id,
      corpus_mapper_->GetOrPut(corpus_nsid_schema_fingerprint.EncodeToCString(),
                               corpus_mapper_->num_keys()));

  ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
                         GetCorpusAssociatedScoreDataToUpdate(corpus_id));
  scoring_data.AddDocument(length_in_tokens);

  ICING_RETURN_IF_ERROR(
      UpdateCorpusAssociatedScoreCache(corpus_id, scoring_data));

  ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                         schema_store_->GetSchemaTypeId(schema));
  ICING_ASSIGN_OR_RETURN(
      int scorable_property_cache_index,
      UpdateScorablePropertyCache(document_wrapper.document(), schema_type_id));

  ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
      new_document_id, DocumentAssociatedScoreData(
                           corpus_id, document_score, creation_timestamp_ms,
                           scorable_property_cache_index, length_in_tokens)));

  ICING_RETURN_IF_ERROR(UpdateFilterCache(
      new_document_id,
      DocumentFilterData(namespace_id,
                         new_doc_nsid_uri_fingerprint.fingerprint(),
                         schema_type_id, expiration_timestamp_ms)));

  if (old_document_id_or.ok()) {
    // The old document exists, copy over the usage scores and delete the old
    // document.
    DocumentId old_document_id = old_document_id_or.ValueOrDie();
    put_result.old_document_id = old_document_id;

    ICING_RETURN_IF_ERROR(
        usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
                                       /*to_document_id=*/new_document_id));

    // Delete the old document. It's fine if it's not found since it might have
    // been deleted previously.
    auto delete_status =
        Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
    if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
      // Real error, pass it up.
      return delete_status;
    }
  }

  if (put_document_stats != nullptr) {
    put_document_stats->set_document_store_latency_ms(
        put_timer->GetElapsedMilliseconds());
  }

  return put_result;
}
1269
Get(const std::string_view name_space,const std::string_view uri,bool clear_internal_fields) const1270 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1271 const std::string_view name_space, const std::string_view uri,
1272 bool clear_internal_fields) const {
1273 // TODO(b/147231617): Make a better way to replace the error message in an
1274 // existing Status.
1275 auto document_id_or = GetDocumentId(name_space, uri);
1276 if (!document_id_or.ok()) {
1277 if (absl_ports::IsNotFound(document_id_or.status())) {
1278 ICING_VLOG(1) << document_id_or.status().error_message();
1279 return absl_ports::NotFoundError(absl_ports::StrCat(
1280 "Document (", name_space, ", ", uri, ") not found."));
1281 }
1282
1283 // Real error. Log it in error level and pass it up.
1284 ICING_LOG(ERROR) << document_id_or.status().error_message();
1285 return std::move(document_id_or).status();
1286 }
1287 DocumentId document_id = document_id_or.ValueOrDie();
1288
1289 // TODO(b/147231617): Make a better way to replace the error message in an
1290 // existing Status.
1291 auto status_or = Get(document_id, clear_internal_fields);
1292 if (!status_or.ok()) {
1293 if (absl_ports::IsNotFound(status_or.status())) {
1294 ICING_VLOG(1) << status_or.status().error_message();
1295 return absl_ports::NotFoundError(absl_ports::StrCat(
1296 "Document (", name_space, ", ", uri, ") not found."));
1297 }
1298
1299 // Real error. Log it in error level.
1300 ICING_LOG(ERROR) << status_or.status().error_message();
1301 }
1302 return status_or;
1303 }
1304
Get(DocumentId document_id,bool clear_internal_fields) const1305 libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
1306 DocumentId document_id, bool clear_internal_fields) const {
1307 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1308 auto document_filter_data_optional =
1309 GetAliveDocumentFilterData(document_id, current_time_ms);
1310 if (!document_filter_data_optional) {
1311 // The document doesn't exist. Let's check if the document id is invalid, we
1312 // will return InvalidArgumentError. Otherwise we should return NOT_FOUND
1313 // error.
1314 if (!IsDocumentIdValid(document_id)) {
1315 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1316 "Document id '%d' invalid.", document_id));
1317 }
1318 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1319 "Document id '%d' doesn't exist", document_id));
1320 }
1321
1322 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1323 if (!document_log_offset_or.ok()) {
1324 // Since we've just checked that our document_id is valid a few lines
1325 // above, there's no reason this should fail and an error should never
1326 // happen.
1327 return absl_ports::InternalError("Failed to find document offset.");
1328 }
1329 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1330
1331 // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
1332 // that can support error logging.
1333 auto document_wrapper_or = document_log_->ReadProto(document_log_offset);
1334 if (!document_wrapper_or.ok()) {
1335 ICING_LOG(ERROR) << document_wrapper_or.status().error_message()
1336 << "Failed to read from document log";
1337 return document_wrapper_or.status();
1338 }
1339 DocumentWrapper document_wrapper =
1340 std::move(document_wrapper_or).ValueOrDie();
1341 if (clear_internal_fields) {
1342 document_wrapper.mutable_document()->clear_internal_fields();
1343 }
1344
1345 return std::move(*document_wrapper.mutable_document());
1346 }
1347
GetScorablePropertySet(DocumentId document_id,int64_t current_time_ms) const1348 std::unique_ptr<ScorablePropertySet> DocumentStore::GetScorablePropertySet(
1349 DocumentId document_id, int64_t current_time_ms) const {
1350 if (!feature_flags_.enable_scorable_properties()) {
1351 return nullptr;
1352 }
1353
1354 // Get scorable property cache index from the score_cache_
1355 libtextclassifier3::StatusOr<const DocumentAssociatedScoreData*>
1356 score_data_or = score_cache_->Get(document_id);
1357 if (!score_data_or.ok()) {
1358 return nullptr;
1359 }
1360 if (score_data_or.ValueOrDie()->scorable_property_cache_index() ==
1361 kInvalidScorablePropertyCacheIndex) {
1362 return nullptr;
1363 }
1364
1365 // Get ScorablePropertySetProto.
1366 libtextclassifier3::StatusOr<ScorablePropertySetProto>
1367 scorable_property_set_proto_or = scorable_property_cache_->Read(
1368 score_data_or.ValueOrDie()->scorable_property_cache_index());
1369 if (!scorable_property_set_proto_or.ok()) {
1370 return nullptr;
1371 }
1372
1373 // Get schema type id.
1374 auto document_filter_data_optional =
1375 GetAliveDocumentFilterData(document_id, current_time_ms);
1376 if (!document_filter_data_optional) {
1377 return nullptr;
1378 }
1379
1380 libtextclassifier3::StatusOr<std::unique_ptr<ScorablePropertySet>>
1381 scorable_property_set_or = ScorablePropertySet::Create(
1382 std::move(scorable_property_set_proto_or.ValueOrDie()),
1383 document_filter_data_optional.value().schema_type_id(),
1384 schema_store_);
1385 if (!scorable_property_set_or.ok()) {
1386 return nullptr;
1387 }
1388 return std::move(scorable_property_set_or.ValueOrDie());
1389 }
1390
GetDocumentId(const std::string_view name_space,const std::string_view uri) const1391 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1392 const std::string_view name_space, const std::string_view uri) const {
1393 auto namespace_id_or = namespace_mapper_->Get(name_space);
1394 libtextclassifier3::Status status = namespace_id_or.status();
1395 if (status.ok()) {
1396 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1397 NamespaceIdFingerprint doc_nsid_uri_fingerprint(namespace_id, uri);
1398 auto document_id_or =
1399 document_key_mapper_->Get(doc_nsid_uri_fingerprint.EncodeToCString());
1400 status = document_id_or.status();
1401 if (status.ok()) {
1402 // Guaranteed to have a DocumentId now
1403 return document_id_or.ValueOrDie();
1404 }
1405 }
1406 return absl_ports::Annotate(
1407 status, absl_ports::StrCat(
1408 "Failed to find DocumentId by key: ", name_space, ", ", uri));
1409 }
1410
GetDocumentId(const NamespaceIdFingerprint & doc_namespace_id_uri_fingerprint) const1411 libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
1412 const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const {
1413 auto document_id_or = document_key_mapper_->Get(
1414 doc_namespace_id_uri_fingerprint.EncodeToCString());
1415 if (document_id_or.ok()) {
1416 return document_id_or.ValueOrDie();
1417 }
1418 return absl_ports::Annotate(
1419 std::move(document_id_or).status(),
1420 "Failed to find DocumentId by namespace id + fingerprint");
1421 }
1422
GetAllNamespaces() const1423 std::vector<std::string> DocumentStore::GetAllNamespaces() const {
1424 std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
1425 GetNamespaceIdsToNamespaces(namespace_mapper_.get());
1426
1427 std::unordered_set<NamespaceId> existing_namespace_ids;
1428 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1429 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1430 ++document_id) {
1431 // filter_cache_->Get can only fail if document_id is < 0
1432 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1433 auto status_or_data = filter_cache_->Get(document_id);
1434 if (!status_or_data.ok()) {
1435 ICING_LOG(ERROR)
1436 << "Error while iterating over filter cache in GetAllNamespaces";
1437 return std::vector<std::string>();
1438 }
1439 const DocumentFilterData* data = status_or_data.ValueOrDie();
1440
1441 if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
1442 existing_namespace_ids.insert(data->namespace_id());
1443 }
1444 }
1445
1446 std::vector<std::string> existing_namespaces;
1447 for (auto itr = existing_namespace_ids.begin();
1448 itr != existing_namespace_ids.end(); ++itr) {
1449 existing_namespaces.push_back(namespace_id_to_namespace.at(*itr));
1450 }
1451 return existing_namespaces;
1452 }
1453
GetAliveDocumentFilterData(DocumentId document_id,int64_t current_time_ms) const1454 std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
1455 DocumentId document_id, int64_t current_time_ms) const {
1456 if (IsDeleted(document_id)) {
1457 return std::nullopt;
1458 }
1459 return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
1460 }
1461
1462 std::optional<DocumentFilterData>
GetNonDeletedDocumentFilterData(DocumentId document_id) const1463 DocumentStore::GetNonDeletedDocumentFilterData(DocumentId document_id) const {
1464 if (IsDeleted(document_id)) {
1465 return std::nullopt;
1466 }
1467
1468 auto filter_data_or = filter_cache_->GetCopy(document_id);
1469 if (!filter_data_or.ok()) {
1470 // This would only happen if document_id is out of range of the
1471 // filter_cache, meaning we got some invalid document_id. Callers should
1472 // already have checked that their document_id is valid or used
1473 // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since the
1474 // document doesn't exist.
1475 return std::nullopt;
1476 }
1477
1478 // At this point, it's guaranteed that the document has not been deleted. It
1479 // could still be expired, but the filter data is guaranteed to be valid here.
1480 return std::move(filter_data_or).ValueOrDie();
1481 }
1482
IsDeleted(DocumentId document_id) const1483 bool DocumentStore::IsDeleted(DocumentId document_id) const {
1484 auto file_offset_or = document_id_mapper_->Get(document_id);
1485 if (!file_offset_or.ok()) {
1486 // This would only happen if document_id is out of range of the
1487 // document_id_mapper, meaning we got some invalid document_id. Callers
1488 // should already have checked that their document_id is valid or used
1489 // DoesDocumentExist(WithStatus). Regardless, return true since the
1490 // document doesn't exist.
1491 return true;
1492 }
1493 int64_t file_offset = *file_offset_or.ValueOrDie();
1494 return file_offset == kDocDeletedFlag;
1495 }
1496
1497 // Returns DocumentFilterData if the document is not expired. Otherwise,
1498 // std::nullopt.
1499 std::optional<DocumentFilterData>
GetNonExpiredDocumentFilterData(DocumentId document_id,int64_t current_time_ms) const1500 DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
1501 int64_t current_time_ms) const {
1502 auto filter_data_or = filter_cache_->GetCopy(document_id);
1503 if (!filter_data_or.ok()) {
1504 // This would only happen if document_id is out of range of the
1505 // filter_cache, meaning we got some invalid document_id. Callers should
1506 // already have checked that their document_id is valid or used
1507 // DoesDocumentExist(WithStatus). Regardless, return std::nullopt since the
1508 // document doesn't exist.
1509 return std::nullopt;
1510 }
1511 DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
1512
1513 // Check if it's past the expiration time
1514 if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
1515 return std::nullopt;
1516 }
1517 return document_filter_data;
1518 }
1519
Delete(const std::string_view name_space,const std::string_view uri,int64_t current_time_ms)1520 libtextclassifier3::Status DocumentStore::Delete(
1521 const std::string_view name_space, const std::string_view uri,
1522 int64_t current_time_ms) {
1523 // Try to get the DocumentId first
1524 auto document_id_or = GetDocumentId(name_space, uri);
1525 if (!document_id_or.ok()) {
1526 return absl_ports::Annotate(
1527 document_id_or.status(),
1528 absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
1529 ", uri: ", uri));
1530 }
1531 return Delete(document_id_or.ValueOrDie(), current_time_ms);
1532 }
1533
Delete(DocumentId document_id,int64_t current_time_ms)1534 libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
1535 int64_t current_time_ms) {
1536 auto document_filter_data_optional =
1537 GetAliveDocumentFilterData(document_id, current_time_ms);
1538 if (!document_filter_data_optional) {
1539 // The document doesn't exist. We should return InvalidArgumentError if the
1540 // document id is invalid. Otherwise we should return NOT_FOUND error.
1541 if (!IsDocumentIdValid(document_id)) {
1542 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
1543 "Document id '%d' invalid.", document_id));
1544 }
1545 return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
1546 "Document id '%d' doesn't exist", document_id));
1547 }
1548
1549 auto document_log_offset_or = document_id_mapper_->Get(document_id);
1550 if (!document_log_offset_or.ok()) {
1551 return absl_ports::InternalError("Failed to find document offset.");
1552 }
1553 int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
1554
1555 // Erases document proto.
1556 ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
1557 return ClearDerivedData(document_id);
1558 }
1559
// Resolves a namespace string to its NamespaceId. Thin pass-through to the
// namespace mapper; propagates the mapper's error for unknown namespaces.
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
    std::string_view name_space) const {
  return namespace_mapper_->Get(name_space);
}
1564
GetCorpusId(const std::string_view name_space,const std::string_view schema) const1565 libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
1566 const std::string_view name_space, const std::string_view schema) const {
1567 ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
1568 namespace_mapper_->Get(name_space));
1569 NamespaceIdFingerprint corpus_nsid_schema_fp(namespace_id, schema);
1570 return corpus_mapper_->Get(corpus_nsid_schema_fp.EncodeToCString());
1571 }
1572
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const std::string_view name_space,const std::string_view schema) const1573 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1574 ResultSpecProto::ResultGroupingType result_group_type,
1575 const std::string_view name_space, const std::string_view schema) const {
1576 auto namespace_id = GetNamespaceId(name_space);
1577 auto schema_type_id = schema_store_->GetSchemaTypeId(schema);
1578 switch (result_group_type) {
1579 case ResultSpecProto::NONE:
1580 return absl_ports::InvalidArgumentError(
1581 "Cannot group by ResultSpecProto::NONE");
1582 case ResultSpecProto::SCHEMA_TYPE:
1583 if (schema_type_id.ok()) {
1584 return schema_type_id.ValueOrDie();
1585 }
1586 break;
1587 case ResultSpecProto::NAMESPACE:
1588 if (namespace_id.ok()) {
1589 return namespace_id.ValueOrDie();
1590 }
1591 break;
1592 case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1593 if (namespace_id.ok() && schema_type_id.ok()) {
1594 // TODO(b/258715421): Temporary workaround to get a
1595 // ResultGroupingEntryId given the Namespace string
1596 // and Schema string.
1597 return namespace_id.ValueOrDie() << 16 | schema_type_id.ValueOrDie();
1598 }
1599 break;
1600 }
1601 return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1602 }
1603
GetResultGroupingEntryId(ResultSpecProto::ResultGroupingType result_group_type,const NamespaceId namespace_id,const SchemaTypeId schema_type_id) const1604 libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
1605 ResultSpecProto::ResultGroupingType result_group_type,
1606 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
1607 switch (result_group_type) {
1608 case ResultSpecProto::NONE:
1609 return absl_ports::InvalidArgumentError(
1610 "Cannot group by ResultSpecProto::NONE");
1611 case ResultSpecProto::SCHEMA_TYPE:
1612 return schema_type_id;
1613 case ResultSpecProto::NAMESPACE:
1614 return namespace_id;
1615 case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
1616 // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
1617 // given the Namespace Id and SchemaType Id.
1618 return namespace_id << 16 | schema_type_id;
1619 }
1620 return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
1621 }
1622
1623 libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const1624 DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
1625 auto score_data_or = score_cache_->GetCopy(document_id);
1626 if (!score_data_or.ok()) {
1627 ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
1628 << " from score_cache_";
1629 return absl_ports::NotFoundError(
1630 std::move(score_data_or).status().error_message());
1631 }
1632
1633 DocumentAssociatedScoreData document_associated_score_data =
1634 std::move(score_data_or).ValueOrDie();
1635 return document_associated_score_data;
1636 }
1637
// Returns a copy of the score data cached for corpus_id, or the underlying
// cache error (e.g. OUT_OF_RANGE for a corpus id that has never been
// written — see GetCorpusAssociatedScoreDataToUpdate).
libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
  return corpus_score_cache_->GetCopy(corpus_id);
}
1642
1643 libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const1644 DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
1645 auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
1646 if (!corpus_scoring_data_or.ok() &&
1647 absl_ports::IsOutOfRange(corpus_scoring_data_or.status())) {
1648 // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
1649 // corpus_score_cache_ for the first time. Return a default
1650 // CorpusAssociatedScoreData object in this case.
1651 return CorpusAssociatedScoreData();
1652 }
1653
1654 return corpus_scoring_data_or;
1655 }
1656
1657 // TODO(b/273826815): Decide on and adopt a consistent pattern for handling
1658 // NOT_FOUND 'errors' returned by our internal classes.
GetUsageScores(DocumentId document_id,int64_t current_time_ms) const1659 std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
1660 DocumentId document_id, int64_t current_time_ms) const {
1661 std::optional<DocumentFilterData> opt =
1662 GetAliveDocumentFilterData(document_id, current_time_ms);
1663 if (!opt) {
1664 return std::nullopt;
1665 }
1666 if (document_id >= usage_store_->num_elements()) {
1667 return std::nullopt;
1668 }
1669 auto usage_scores_or = usage_store_->GetUsageScores(document_id);
1670 if (!usage_scores_or.ok()) {
1671 ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
1672 << usage_scores_or.status().error_message();
1673 return std::nullopt;
1674 }
1675 return std::move(usage_scores_or).ValueOrDie();
1676 }
1677
ReportUsage(const UsageReport & usage_report)1678 libtextclassifier3::Status DocumentStore::ReportUsage(
1679 const UsageReport& usage_report) {
1680 ICING_ASSIGN_OR_RETURN(DocumentId document_id,
1681 GetDocumentId(usage_report.document_namespace(),
1682 usage_report.document_uri()));
1683 // We can use the internal version here because we got our document_id from
1684 // our internal data structures. We would have thrown some error if the
1685 // namespace and/or uri were incorrect.
1686 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1687 if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
1688 // Document was probably deleted or expired.
1689 return absl_ports::NotFoundError(absl_ports::StrCat(
1690 "Couldn't report usage on a nonexistent document: (namespace: '",
1691 usage_report.document_namespace(), "', uri: '",
1692 usage_report.document_uri(), "')"));
1693 }
1694
1695 return usage_store_->AddUsageReport(usage_report, document_id);
1696 }
1697
DeleteByNamespace(std::string_view name_space)1698 DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
1699 std::string_view name_space) {
1700 DeleteByGroupResult result;
1701 auto namespace_id_or = namespace_mapper_->Get(name_space);
1702 if (!namespace_id_or.ok()) {
1703 result.status = absl_ports::Annotate(
1704 namespace_id_or.status(),
1705 absl_ports::StrCat("Failed to find namespace: ", name_space));
1706 return result;
1707 }
1708 NamespaceId namespace_id = namespace_id_or.ValueOrDie();
1709 auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
1710 if (!num_deleted_or.ok()) {
1711 result.status = std::move(num_deleted_or).status();
1712 return result;
1713 }
1714
1715 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1716 if (result.num_docs_deleted <= 0) {
1717 // Treat the fact that no existing documents had this namespace to be the
1718 // same as this namespace not existing at all.
1719 result.status = absl_ports::NotFoundError(
1720 absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
1721 return result;
1722 }
1723
1724 return result;
1725 }
1726
DeleteBySchemaType(std::string_view schema_type)1727 DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
1728 std::string_view schema_type) {
1729 DeleteByGroupResult result;
1730 auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
1731 if (!schema_type_id_or.ok()) {
1732 result.status = absl_ports::Annotate(
1733 schema_type_id_or.status(),
1734 absl_ports::StrCat("Failed to find schema type. schema_type: ",
1735 schema_type));
1736 return result;
1737 }
1738 SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
1739 auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
1740 if (!num_deleted_or.ok()) {
1741 result.status = std::move(num_deleted_or).status();
1742 return result;
1743 }
1744
1745 result.num_docs_deleted = num_deleted_or.ValueOrDie();
1746 if (result.num_docs_deleted <= 0) {
1747 result.status = absl_ports::NotFoundError(absl_ports::StrCat(
1748 "No documents found with schema type '", schema_type, "'"));
1749 return result;
1750 }
1751
1752 return result;
1753 }
1754
BatchDelete(NamespaceId namespace_id,SchemaTypeId schema_type_id)1755 libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
1756 NamespaceId namespace_id, SchemaTypeId schema_type_id) {
1757 // Tracks if there were any existing documents with this namespace that we
1758 // will mark as deleted.
1759 int num_updated_documents = 0;
1760
1761 // Traverse FilterCache and delete all docs that match namespace_id and
1762 // schema_type_id.
1763 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
1764 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
1765 ++document_id) {
1766 // filter_cache_->Get can only fail if document_id is < 0
1767 // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
1768 ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
1769 filter_cache_->Get(document_id));
1770
1771 // Check namespace only when the input namespace id is valid.
1772 if (namespace_id != kInvalidNamespaceId &&
1773 (data->namespace_id() == kInvalidNamespaceId ||
1774 data->namespace_id() != namespace_id)) {
1775 // The document has already been hard-deleted or isn't from the desired
1776 // namespace.
1777 continue;
1778 }
1779
1780 // Check schema type only when the input schema type id is valid.
1781 if (schema_type_id != kInvalidSchemaTypeId &&
1782 (data->schema_type_id() == kInvalidSchemaTypeId ||
1783 data->schema_type_id() != schema_type_id)) {
1784 // The document has already been hard-deleted or doesn't have the
1785 // desired schema type.
1786 continue;
1787 }
1788
1789 // The document has the desired namespace and schema type, it either
1790 // exists or has expired.
1791 libtextclassifier3::Status delete_status =
1792 Delete(document_id, current_time_ms);
1793 if (absl_ports::IsNotFound(delete_status)) {
1794 continue;
1795 } else if (!delete_status.ok()) {
1796 // Real error, pass up.
1797 return delete_status;
1798 }
1799 ++num_updated_documents;
1800 }
1801
1802 return num_updated_documents;
1803 }
1804
// Flushes in-memory state to disk. The amount of work depends on
// persist_type:
//   - LITE: flush only the document log; derived structures are left as-is.
//   - RECOVERY_PROOF: flush the log, then recompute and persist the
//     checksum.
//   - any other value: flush the log and every derived structure, then
//     update the combined checksum in the header file.
libtextclassifier3::Status DocumentStore::PersistToDisk(
    PersistType::Code persist_type) {
  // The document log is flushed for every persist type.
  ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
  if (persist_type == PersistType::LITE) {
    // only persist the document log.
    return libtextclassifier3::Status::OK;
  }
  if (persist_type == PersistType::RECOVERY_PROOF) {
    // UpdateChecksum() returns the checksum value; only its status matters
    // here.
    return UpdateChecksum().status();
  }
  // Full persist: flush all derived structures as well.
  ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
  ICING_RETURN_IF_ERROR(scorable_property_cache_->PersistToDisk());
  ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
  ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
  ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
  ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());

  // Update the combined checksum and write to header file.
  ICING_RETURN_IF_ERROR(UpdateChecksum());
  return libtextclassifier3::Status::OK;
}
1829
GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t> & value_or,int64_t default_value)1830 int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
1831 int64_t default_value) {
1832 return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
1833 }
1834
// Collects the on-disk size of each member structure into a
// DocumentStorageInfoProto. Each field is the member's disk usage in bytes,
// or -1 when the size could not be determined (see GetValueOrDefault).
DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
  DocumentStorageInfoProto storage_info;
  storage_info.set_document_log_size(
      GetValueOrDefault(document_log_->GetDiskUsage(), -1));
  storage_info.set_key_mapper_size(
      GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
  storage_info.set_document_id_mapper_size(
      GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
  storage_info.set_score_cache_size(
      GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
  storage_info.set_scorable_property_cache_size(
      GetValueOrDefault(scorable_property_cache_->GetDiskUsage(), -1));
  storage_info.set_filter_cache_size(
      GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
  storage_info.set_namespace_id_mapper_size(
      GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
  storage_info.set_corpus_mapper_size(
      GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
  storage_info.set_corpus_score_cache_size(
      GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
  return storage_info;
}
1857
// Fills storage_info with per-store and per-namespace counts of alive,
// expired, and deleted documents (plus per-usage-type breakdowns), by
// scanning every document id once. Takes storage_info by value and returns
// the augmented proto. Documents whose filter data, namespace, or usage
// scores cannot be fetched are skipped from the namespace breakdowns (and,
// for non-deleted docs, from the alive/expired totals as well).
DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
    DocumentStorageInfoProto storage_info) const {
  int total_num_alive = 0;
  int total_num_expired = 0;
  int total_num_deleted = 0;
  std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
      GetNamespaceIdsToNamespaces(namespace_mapper_.get());
  std::unordered_map<std::string, NamespaceStorageInfoProto>
      namespace_to_storage_info;

  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0;
       document_id < document_id_mapper_->num_elements(); ++document_id) {
    // Check if it's deleted first.
    if (IsDeleted(document_id)) {
      // We don't have the namespace id of hard deleted documents anymore, so
      // we can't add to our namespace storage info.
      ++total_num_deleted;
      continue;
    }

    // At this point, the document is either alive or expired, we can get
    // namespace info for it.
    auto filter_data_or = filter_cache_->Get(document_id);
    if (!filter_data_or.ok()) {
      ICING_VLOG(1) << "Error trying to get filter data for document store "
                       "storage info counts.";
      continue;
    }
    const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
    auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
    if (itr == namespace_id_to_namespace.end()) {
      ICING_VLOG(1) << "Error trying to find namespace for document store "
                       "storage info counts.";
      continue;
    }
    const std::string& name_space = itr->second;

    // Always set the namespace, if the NamespaceStorageInfoProto didn't exist
    // before, we'll get back a default instance of it.
    NamespaceStorageInfoProto& namespace_storage_info =
        namespace_to_storage_info[name_space];
    namespace_storage_info.set_namespace_(name_space);

    // Get usage scores
    auto usage_scores_or = usage_store_->GetUsageScores(document_id);
    if (!usage_scores_or.ok()) {
      ICING_VLOG(1) << "Error trying to get usage scores for document store "
                       "storage info counts.";
      continue;
    }
    UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();

    // Update our stats. A non-deleted document that is past its expiration
    // timestamp counts as expired; otherwise it is alive. Each usage type
    // with a nonzero count also bumps the matching per-usage-type counter.
    if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
      ++total_num_expired;
      namespace_storage_info.set_num_expired_documents(
          namespace_storage_info.num_expired_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type1(
            namespace_storage_info.num_expired_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type2(
            namespace_storage_info.num_expired_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_expired_documents_usage_type3(
            namespace_storage_info.num_expired_documents_usage_type3() + 1);
      }
    } else {
      ++total_num_alive;
      namespace_storage_info.set_num_alive_documents(
          namespace_storage_info.num_alive_documents() + 1);
      if (usage_scores.usage_type1_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type1(
            namespace_storage_info.num_alive_documents_usage_type1() + 1);
      }
      if (usage_scores.usage_type2_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type2(
            namespace_storage_info.num_alive_documents_usage_type2() + 1);
      }
      if (usage_scores.usage_type3_count > 0) {
        namespace_storage_info.set_num_alive_documents_usage_type3(
            namespace_storage_info.num_alive_documents_usage_type3() + 1);
      }
    }
  }

  // Move the accumulated per-namespace protos into the output and record the
  // store-wide totals.
  for (auto& itr : namespace_to_storage_info) {
    storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
  }
  storage_info.set_num_alive_documents(total_num_alive);
  storage_info.set_num_deleted_documents(total_num_deleted);
  storage_info.set_num_expired_documents(total_num_expired);
  return storage_info;
}
1955
GetStorageInfo() const1956 DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
1957 DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
1958 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1959 if (directory_size != Filesystem::kBadFileSize) {
1960 storage_info.set_document_store_size(directory_size);
1961 } else {
1962 storage_info.set_document_store_size(-1);
1963 }
1964 storage_info.set_num_namespaces(namespace_mapper_->num_keys());
1965 return CalculateDocumentStatusCounts(std::move(storage_info));
1966 }
1967
// Points this store at a new SchemaStore and migrates every document:
// documents still valid under the new schema get their cached SchemaTypeId
// refreshed; documents that no longer validate are deleted.
libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
    const SchemaStore* schema_store) {
  // Update all references to the SchemaStore
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    auto document_or = Get(document_id);
    if (absl_ports::IsNotFound(document_or.status())) {
      // Skip nonexistent documents
      continue;
    } else if (!document_or.ok()) {
      // Real error, pass up
      return absl_ports::Annotate(
          document_or.status(),
          IcingStringUtil::StringPrintf(
              "Failed to retrieve Document for DocumentId %d", document_id));
    }

    // Guaranteed to have a document now.
    DocumentProto document = document_or.ValueOrDie();

    // Revalidate that this document is still compatible
    if (document_validator_.Validate(document).ok()) {
      // Update the SchemaTypeId for this entry, since the new SchemaStore may
      // have assigned this type a different id.
      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                             schema_store_->GetSchemaTypeId(document.schema()));
      ICING_ASSIGN_OR_RETURN(
          typename FileBackedVector<DocumentFilterData>::MutableView
              doc_filter_data_view,
          filter_cache_->GetMutable(document_id));
      doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
    } else {
      // Document is no longer valid with the new SchemaStore. Mark as
      // deleted
      auto delete_status =
          Delete(document.namespace_(), document.uri(), current_time_ms);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up. NOT_FOUND (e.g. already deleted) is fine.
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
2016
// Faster variant of UpdateSchemaStore: uses the SetSchemaResult's per-type
// delta sets to touch only documents whose type was deleted, re-id'd, or
// made incompatible, instead of revalidating every document.
libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
    const SchemaStore* schema_store,
    const SchemaStore::SetSchemaResult& set_schema_result) {
  if (!set_schema_result.success) {
    // No new schema was set, no work to be done
    return libtextclassifier3::Status::OK;
  }

  // Update all references to the SchemaStore
  schema_store_ = schema_store;
  document_validator_.UpdateSchemaStore(schema_store);

  int size = document_id_mapper_->num_elements();
  int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
  for (DocumentId document_id = 0; document_id < size; document_id++) {
    if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
      // Skip nonexistent documents
      continue;
    }

    // Guaranteed that the document exists now.
    ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
                           filter_cache_->Get(document_id));

    // The document's type was removed from the schema entirely.
    bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
                               filter_data->schema_type_id()) != 0;

    // Check if we need to update the FilterCache entry for this document. It
    // may have been assigned a different SchemaTypeId in the new SchemaStore.
    bool update_filter_cache =
        set_schema_result.old_schema_type_ids_changed.count(
            filter_data->schema_type_id()) != 0;

    // Check if we need to revalidate this document if the type is now
    // incompatible
    bool revalidate_document =
        set_schema_result.schema_types_incompatible_by_id.count(
            filter_data->schema_type_id()) != 0;

    if (update_filter_cache || revalidate_document) {
      // Both actions need the full document, so fetch it once here.
      ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));

      if (update_filter_cache) {
        ICING_ASSIGN_OR_RETURN(
            SchemaTypeId schema_type_id,
            schema_store_->GetSchemaTypeId(document.schema()));
        ICING_ASSIGN_OR_RETURN(
            typename FileBackedVector<DocumentFilterData>::MutableView
                doc_filter_data_view,
            filter_cache_->GetMutable(document_id));
        doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
      }
      if (revalidate_document) {
        delete_document = !document_validator_.Validate(document).ok();
      }
    }

    if (delete_document) {
      // Document is no longer valid with the new SchemaStore. Mark as deleted
      auto delete_status = Delete(document_id, current_time_ms);
      if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
        // Real error, pass up. NOT_FOUND (e.g. already deleted) is fine.
        return delete_status;
      }
    }
  }

  return libtextclassifier3::Status::OK;
}
2086
RegenerateScorablePropertyCache(const std::unordered_set<SchemaTypeId> & schema_type_ids)2087 libtextclassifier3::Status DocumentStore::RegenerateScorablePropertyCache(
2088 const std::unordered_set<SchemaTypeId>& schema_type_ids) {
2089 if (schema_type_ids.empty()) {
2090 return libtextclassifier3::Status::OK;
2091 }
2092
2093 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2094 for (DocumentId document_id = 0;
2095 document_id < document_id_mapper_->num_elements(); ++document_id) {
2096 if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2097 continue;
2098 }
2099 // Guaranteed that the document exists now.
2100 ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2101 filter_cache_->Get(document_id));
2102 SchemaTypeId schema_type_id = filter_data->schema_type_id();
2103 if (schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
2104 continue;
2105 }
2106
2107 ICING_ASSIGN_OR_RETURN(DocumentProto document, Get(document_id));
2108 int32_t scorable_property_cache_index = kInvalidScorablePropertyCacheIndex;
2109 ICING_ASSIGN_OR_RETURN(
2110 scorable_property_cache_index,
2111 UpdateScorablePropertyCache(document, schema_type_id));
2112
2113 // Update the score_cache_ with the new scorable property cache index.
2114 ICING_ASSIGN_OR_RETURN(
2115 typename FileBackedVector<DocumentAssociatedScoreData>::MutableView
2116 doc_score_data_view,
2117 score_cache_->GetMutable(document_id));
2118 doc_score_data_view.Get().set_scorable_property_cache_index(
2119 scorable_property_cache_index);
2120 }
2121
2122 return libtextclassifier3::Status::OK;
2123 }
2124
// TODO(b/121227117): Implement Optimize()
libtextclassifier3::Status DocumentStore::Optimize() {
  // Intentionally a no-op for now; compaction is performed by OptimizeInto(),
  // which writes surviving documents into a fresh directory.
  return libtextclassifier3::Status::OK;
}
2129
2130 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeInto(const std::string & new_directory,const LanguageSegmenter * lang_segmenter,std::unordered_set<std::string> && potentially_optimizable_blob_handles,OptimizeStatsProto * stats) const2131 DocumentStore::OptimizeInto(
2132 const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
2133 std::unordered_set<std::string>&& potentially_optimizable_blob_handles,
2134 OptimizeStatsProto* stats) const {
2135 // Validates directory
2136 if (new_directory == base_dir_) {
2137 return absl_ports::InvalidArgumentError(
2138 "New directory is the same as the current one.");
2139 }
2140
2141 ICING_ASSIGN_OR_RETURN(
2142 auto doc_store_create_result,
2143 DocumentStore::Create(
2144 filesystem_, new_directory, &clock_, schema_store_, &feature_flags_,
2145 /*force_recovery_and_revalidate_documents=*/false, pre_mapping_fbv_,
2146 use_persistent_hash_map_, compression_level_,
2147 /*initialize_stats=*/nullptr));
2148 std::unique_ptr<DocumentStore> new_doc_store =
2149 std::move(doc_store_create_result.document_store);
2150
2151 // Writes all valid docs into new document store (new directory)
2152 int document_cnt = document_id_mapper_->num_elements();
2153 int num_deleted_documents = 0;
2154 int num_expired_documents = 0;
2155 UsageStore::UsageScores default_usage;
2156 OptimizeResult result;
2157 result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
2158
2159 result.dead_blob_handles = std::move(potentially_optimizable_blob_handles);
2160 std::unordered_map<std::string, std::vector<std::string>>
2161 type_blob_property_map;
2162 if (!result.dead_blob_handles.empty()) {
2163 // Get the blob property map from the schema store.
2164 if (num_documents() == 0) {
2165 return result;
2166 }
2167 auto type_blob_property_map_or = schema_store_->ConstructBlobPropertyMap();
2168 if (!type_blob_property_map_or.ok()) {
2169 // If we fail to retrieve this map when there *are* documents in
2170 // doc store, then something is seriously wrong. Return error.
2171 return type_blob_property_map_or.status();
2172 }
2173 type_blob_property_map = std::move(type_blob_property_map_or).ValueOrDie();
2174 }
2175
2176 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2177 for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
2178 auto document_or = Get(document_id, /*clear_internal_fields=*/false);
2179 if (absl_ports::IsNotFound(document_or.status())) {
2180 if (IsDeleted(document_id)) {
2181 ++num_deleted_documents;
2182 } else if (!GetNonExpiredDocumentFilterData(document_id,
2183 current_time_ms)) {
2184 ++num_expired_documents;
2185 }
2186 continue;
2187 } else if (!document_or.ok()) {
2188 // Real error, pass up
2189 return absl_ports::Annotate(
2190 document_or.status(),
2191 IcingStringUtil::StringPrintf(
2192 "Failed to retrieve Document for DocumentId %d", document_id));
2193 }
2194
2195 // Guaranteed to have a document now.
2196 DocumentProto document_to_keep = std::move(document_or).ValueOrDie();
2197 // Remove blobs that still have reference are removed from the
2198 // expired_blob_handles. So that all remaining are dead blob.
2199 RemoveAliveBlobHandles(document_to_keep, type_blob_property_map,
2200 result.dead_blob_handles);
2201
2202 libtextclassifier3::StatusOr<PutResult> put_result_or;
2203 if (document_to_keep.internal_fields().length_in_tokens() == 0) {
2204 auto tokenized_document_or = TokenizedDocument::Create(
2205 schema_store_, lang_segmenter, document_to_keep);
2206 if (!tokenized_document_or.ok()) {
2207 return absl_ports::Annotate(
2208 tokenized_document_or.status(),
2209 IcingStringUtil::StringPrintf(
2210 "Failed to tokenize Document for DocumentId %d", document_id));
2211 }
2212 TokenizedDocument tokenized_document(
2213 std::move(tokenized_document_or).ValueOrDie());
2214 put_result_or = new_doc_store->Put(
2215 std::move(document_to_keep), tokenized_document.num_string_tokens());
2216 } else {
2217 // TODO(b/144458732): Implement a more robust version of
2218 // TC_ASSIGN_OR_RETURN that can support error logging.
2219 put_result_or = new_doc_store->InternalPut(std::move(document_to_keep));
2220 }
2221 if (!put_result_or.ok()) {
2222 ICING_LOG(ERROR) << put_result_or.status().error_message()
2223 << "Failed to write into new document store";
2224 return put_result_or.status();
2225 }
2226
2227 DocumentId new_document_id = put_result_or.ValueOrDie().new_document_id;
2228 result.document_id_old_to_new[document_id] = new_document_id;
2229
2230 // Copy over usage scores.
2231 ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
2232 usage_store_->GetUsageScores(document_id));
2233 if (!(usage_scores == default_usage)) {
2234 // If the usage scores for this document are the default (no usage),
2235 // then don't bother setting it. No need to possibly allocate storage if
2236 // there's nothing interesting to store.
2237 ICING_RETURN_IF_ERROR(
2238 new_doc_store->SetUsageScores(new_document_id, usage_scores));
2239 }
2240 }
2241 // Construct namespace_id_old_to_new
2242 int namespace_cnt = namespace_mapper_->num_keys();
2243 std::unordered_map<NamespaceId, std::string> old_namespaces =
2244 GetNamespaceIdsToNamespaces(namespace_mapper_.get());
2245 if (namespace_cnt != old_namespaces.size()) {
2246 // This really shouldn't happen. If it really happens, then:
2247 // - It won't block DocumentStore optimization, so don't return error here.
2248 // - Instead, write a warning log here and hint the caller to rebuild index.
2249 ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
2250 << " vs " << old_namespaces.size();
2251 result.should_rebuild_index = true;
2252 } else {
2253 result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
2254 for (const auto& [old_namespace_id, ns] : old_namespaces) {
2255 if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
2256 // This really shouldn't happen. If it really happens, then:
2257 // - It won't block DocumentStore optimization, so don't return error
2258 // here.
2259 // - Instead, write a warning log here and hint the caller to rebuild
2260 // index.
2261 ICING_LOG(WARNING) << "Found unexpected namespace id "
2262 << old_namespace_id << ". Should be in range 0 to "
2263 << result.namespace_id_old_to_new.size()
2264 << " (exclusive).";
2265 result.namespace_id_old_to_new.clear();
2266 result.should_rebuild_index = true;
2267 break;
2268 }
2269
2270 auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
2271 if (!new_namespace_id_or.ok()) {
2272 if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
2273 continue;
2274 }
2275 // Real error, return it.
2276 return std::move(new_namespace_id_or).status();
2277 }
2278
2279 NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
2280 // Safe to use bracket to assign given that we've checked the range above.
2281 result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
2282 }
2283 }
2284
2285 if (stats != nullptr) {
2286 stats->set_num_original_documents(document_cnt);
2287 stats->set_num_deleted_documents(num_deleted_documents);
2288 stats->set_num_expired_documents(num_expired_documents);
2289 stats->set_num_original_namespaces(namespace_cnt);
2290 stats->set_num_deleted_namespaces(
2291 namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
2292 }
2293 ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
2294 return result;
2295 }
2296
2297 libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
GetOptimizeInfo() const2298 DocumentStore::GetOptimizeInfo() const {
2299 OptimizeInfo optimize_info;
2300
2301 // Figure out our ratio of optimizable/total docs.
2302 int32_t num_documents = document_id_mapper_->num_elements();
2303 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2304 for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
2305 ++document_id) {
2306 if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2307 ++optimize_info.optimizable_docs;
2308 }
2309
2310 ++optimize_info.total_docs;
2311 }
2312
2313 if (optimize_info.total_docs == 0) {
2314 // Can exit early since there's nothing to calculate.
2315 return optimize_info;
2316 }
2317
2318 // Get the total element size.
2319 //
2320 // We use file size instead of disk usage here because the files are not
2321 // sparse, so it's more accurate. Disk usage rounds up to the nearest block
2322 // size.
2323 ICING_ASSIGN_OR_RETURN(const int64_t document_log_file_size,
2324 document_log_->GetElementsFileSize());
2325 ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_file_size,
2326 document_id_mapper_->GetElementsFileSize());
2327 ICING_ASSIGN_OR_RETURN(const int64_t score_cache_file_size,
2328 score_cache_->GetElementsFileSize());
2329 ICING_ASSIGN_OR_RETURN(const int64_t scorable_property_cache_file_size,
2330 scorable_property_cache_->GetElementsFileSize());
2331 ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
2332 filter_cache_->GetElementsFileSize());
2333 ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
2334 corpus_score_cache_->GetElementsFileSize());
2335
2336 // Usage store might be sparse, but we'll still use file size for more
2337 // accurate counting.
2338 ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
2339 usage_store_->GetElementsFileSize());
2340
2341 // We use a combined disk usage and file size for the DynamicTrieKeyMapper
2342 // because it's backed by a trie, which has some sparse property bitmaps.
2343 ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
2344 document_key_mapper_->GetElementsSize());
2345
2346 // We don't include the namespace_mapper or the corpus_mapper because it's
2347 // not clear if we could recover any space even if Optimize were called.
2348 // Deleting 100s of documents could still leave a few documents of a
2349 // namespace, and then there would be no change.
2350
2351 int64_t total_size = document_log_file_size + document_key_mapper_size +
2352 document_id_mapper_file_size + score_cache_file_size +
2353 scorable_property_cache_file_size +
2354 filter_cache_file_size + corpus_score_cache_file_size +
2355 usage_store_file_size;
2356
2357 optimize_info.estimated_optimizable_bytes =
2358 total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
2359 return optimize_info;
2360 }
2361
UpdateCorpusAssociatedScoreCache(CorpusId corpus_id,const CorpusAssociatedScoreData & score_data)2362 libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
2363 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
2364 return corpus_score_cache_->Set(corpus_id, score_data);
2365 }
2366
UpdateDocumentAssociatedScoreCache(DocumentId document_id,const DocumentAssociatedScoreData & score_data)2367 libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
2368 DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
2369 return score_cache_->Set(document_id, score_data);
2370 }
2371
UpdateFilterCache(DocumentId document_id,const DocumentFilterData & filter_data)2372 libtextclassifier3::Status DocumentStore::UpdateFilterCache(
2373 DocumentId document_id, const DocumentFilterData& filter_data) {
2374 return filter_cache_->Set(document_id, filter_data);
2375 }
2376
ClearDerivedData(DocumentId document_id)2377 libtextclassifier3::Status DocumentStore::ClearDerivedData(
2378 DocumentId document_id) {
2379 // We intentionally leave the data in key_mapper_ because locating that data
2380 // requires fetching namespace and uri. Leaving data in key_mapper_ should
2381 // be fine because the data is hashed.
2382
2383 ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
2384
2385 // Resets the score cache entry
2386 ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
2387 document_id,
2388 DocumentAssociatedScoreData(
2389 kInvalidCorpusId,
2390 /*document_score=*/-1,
2391 /*creation_timestamp_ms=*/-1,
2392 /*scorable_property_cache_index=*/kInvalidScorablePropertyCacheIndex,
2393 /*length_in_tokens=*/0)));
2394
2395 // Resets the filter cache entry
2396 ICING_RETURN_IF_ERROR(UpdateFilterCache(
2397 document_id,
2398 DocumentFilterData(kInvalidNamespaceId, /*uri_fingerprint=*/0,
2399 kInvalidSchemaTypeId,
2400 /*expiration_timestamp_ms=*/-1)));
2401
2402 // Clears the usage scores.
2403 return usage_store_->DeleteUsageScores(document_id);
2404 }
2405
SetUsageScores(DocumentId document_id,const UsageStore::UsageScores & usage_scores)2406 libtextclassifier3::Status DocumentStore::SetUsageScores(
2407 DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
2408 return usage_store_->SetUsageScores(document_id, usage_scores);
2409 }
2410
2411 libtextclassifier3::StatusOr<
2412 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
CollectCorpusInfo() const2413 DocumentStore::CollectCorpusInfo() const {
2414 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
2415 libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
2416 schema_store_->GetSchema();
2417 if (!schema_proto_or.ok()) {
2418 return corpus_info;
2419 }
2420 // Maps from CorpusId to the corresponding protocol buffer in the result.
2421 std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
2422 std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
2423 GetNamespaceIdsToNamespaces(namespace_mapper_.get());
2424 const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
2425 int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
2426 for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
2427 ++document_id) {
2428 if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
2429 continue;
2430 }
2431 ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
2432 filter_cache_->Get(document_id));
2433 ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
2434 score_cache_->Get(document_id));
2435 const std::string& name_space =
2436 namespace_id_to_namespace[filter_data->namespace_id()];
2437 const std::string& schema =
2438 schema_proto->types()[filter_data->schema_type_id()].schema_type();
2439 auto iter = info_map.find(score_data->corpus_id());
2440 if (iter == info_map.end()) {
2441 DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
2442 entry->set_namespace_(name_space);
2443 entry->set_schema(schema);
2444 iter = info_map.insert({score_data->corpus_id(), entry}).first;
2445 }
2446 iter->second->set_total_documents(iter->second->total_documents() + 1);
2447 iter->second->set_total_token(iter->second->total_token() +
2448 score_data->length_in_tokens());
2449 }
2450 return corpus_info;
2451 }
2452
2453 libtextclassifier3::StatusOr<DocumentDebugInfoProto>
GetDebugInfo(int verbosity) const2454 DocumentStore::GetDebugInfo(int verbosity) const {
2455 DocumentDebugInfoProto debug_info;
2456 *debug_info.mutable_document_storage_info() = GetStorageInfo();
2457 ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
2458 debug_info.set_crc(crc.Get());
2459 if (verbosity > 0) {
2460 ICING_ASSIGN_OR_RETURN(
2461 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
2462 corpus_info,
2463 CollectCorpusInfo());
2464 *debug_info.mutable_corpus_info() = std::move(corpus_info);
2465 }
2466 return debug_info;
2467 }
2468
UpdateScorablePropertyCache(const DocumentProto & document,SchemaTypeId schema_type_id)2469 libtextclassifier3::StatusOr<int> DocumentStore::UpdateScorablePropertyCache(
2470 const DocumentProto& document, SchemaTypeId schema_type_id) {
2471 if (!feature_flags_.enable_scorable_properties()) {
2472 return kInvalidScorablePropertyCacheIndex;
2473 }
2474 ICING_ASSIGN_OR_RETURN(
2475 const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*
2476 ordered_scorable_property_info,
2477 schema_store_->GetOrderedScorablePropertyInfo(schema_type_id));
2478 if (ordered_scorable_property_info == nullptr ||
2479 ordered_scorable_property_info->empty()) {
2480 // No scorable property defined under the schema config of the
2481 // schema_type_id.
2482 return kInvalidScorablePropertyCacheIndex;
2483 }
2484 ICING_ASSIGN_OR_RETURN(
2485 std::unique_ptr<ScorablePropertySet> scorable_property_set,
2486 ScorablePropertySet::Create(document, schema_type_id, schema_store_));
2487
2488 return scorable_property_cache_->Write(
2489 scorable_property_set->GetScorablePropertySetProto());
2490 }
2491
2492 } // namespace lib
2493 } // namespace icing
2494