1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_DOCUMENT_STORE_H_ 16 #define ICING_STORE_DOCUMENT_STORE_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <optional> 21 #include <string> 22 #include <string_view> 23 #include <unordered_set> 24 #include <vector> 25 26 #include "icing/text_classifier/lib3/utils/base/status.h" 27 #include "icing/text_classifier/lib3/utils/base/statusor.h" 28 #include "icing/feature-flags.h" 29 #include "icing/file/file-backed-vector.h" 30 #include "icing/file/filesystem.h" 31 #include "icing/file/memory-mapped-file-backed-proto-log.h" 32 #include "icing/file/portable-file-backed-proto-log.h" 33 #include "icing/proto/debug.pb.h" 34 #include "icing/proto/document.pb.h" 35 #include "icing/proto/document_wrapper.pb.h" 36 #include "icing/proto/internal/scorable_property_set.pb.h" 37 #include "icing/proto/logging.pb.h" 38 #include "icing/proto/optimize.pb.h" 39 #include "icing/proto/persist.pb.h" 40 #include "icing/proto/search.pb.h" 41 #include "icing/proto/storage.pb.h" 42 #include "icing/proto/usage.pb.h" 43 #include "icing/schema/schema-store.h" 44 #include "icing/store/corpus-associated-scoring-data.h" 45 #include "icing/store/corpus-id.h" 46 #include "icing/store/document-associated-score-data.h" 47 #include "icing/store/document-filter-data.h" 48 #include "icing/store/document-id.h" 49 #include "icing/store/key-mapper.h" 50 #include 
"icing/store/namespace-id-fingerprint.h" 51 #include "icing/store/namespace-id.h" 52 #include "icing/store/usage-store.h" 53 #include "icing/tokenization/language-segmenter.h" 54 #include "icing/util/clock.h" 55 #include "icing/util/crc32.h" 56 #include "icing/util/data-loss.h" 57 #include "icing/util/document-validator.h" 58 #include "icing/util/fingerprint-util.h" 59 #include "icing/util/scorable_property_set.h" 60 61 namespace icing { 62 namespace lib { 63 64 // Provides storage interfaces for documents. 65 class DocumentStore { 66 public: 67 struct Header { 68 // Previously used magic numbers, please avoid reusing those: 69 // [0x1b99c8b0, 0x3e005b5e] 70 static constexpr int32_t kMagic = 0x8a32cd1f; 71 72 // Holds the magic as a quick sanity check against file corruption. 73 int32_t magic; 74 75 // Checksum of the DocumentStore's sub-component's checksums. 76 uint32_t checksum; 77 }; 78 79 struct OptimizeInfo { 80 // The estimated size in bytes of the optimizable docs. We don't track the 81 // size of each document, so we estimate by taking the size of the entire 82 // DocumentStore and dividing that by the total number of documents we have. 83 // So we end up with an average document size. 84 int64_t estimated_optimizable_bytes = 0; 85 86 // Number of total documents the DocumentStore tracks. 87 int32_t total_docs = 0; 88 89 // Number of optimizable (deleted + expired) docs the DocumentStore tracks. 90 int32_t optimizable_docs = 0; 91 }; 92 93 struct DeleteByGroupResult { 94 // Status representing whether or not the operation succeeded. See the 95 // comments above the function that returns this result to determine what 96 // possible statuses could be returned. 97 libtextclassifier3::Status status; 98 99 int num_docs_deleted = 0; 100 }; 101 102 struct CreateResult { 103 // A successfully initialized document store. 104 std::unique_ptr<DocumentStore> document_store; 105 106 // The data status after initializing from a previous state. 
Data loss can 107 // happen if the file is corrupted or some previously added data was 108 // unpersisted. This may be used to signal that any derived data off of the 109 // document store may need to be regenerated. 110 DataLoss data_loss; 111 112 // A boolean flag indicating if derived files of the document store have 113 // been regenerated or not. This is usually a signal for callers to detect 114 // if any id assignment has changed (e.g. NamespaceId). 115 bool derived_files_regenerated; 116 }; 117 118 // Not copyable 119 DocumentStore(const DocumentStore&) = delete; 120 DocumentStore& operator=(const DocumentStore&) = delete; 121 122 // Persists and updates checksum of subcomponents. 123 ~DocumentStore(); 124 125 // Factory method to create, initialize, and return a DocumentStore. The base 126 // directory is used to persist document store files. If document store was 127 // previously initialized with this directory, it will reload the files saved 128 // by the last instance. 129 // 130 // force_recovery_and_revalidate_documents=true will pre-emptively throw out 131 // the derived files and validate each document while recreating them. This 132 // can be used to indicate that the schema (and type ids) may have changed and 133 // those changes might not have been applied to the document store. 134 // 135 // If initialize_stats is present, the fields related to DocumentStore will be 136 // populated. 137 // 138 // Does not take any ownership, and all pointers except initialize_stats must 139 // refer to valid objects that outlive the one constructed. 140 // 141 // TODO(cassiewang): Consider returning a status indicating that derived files 142 // were regenerated. This may be helpful in logs. 
143 // 144 // Returns: 145 // A DocumentStore::CreateResult on success 146 // FAILED_PRECONDITION on any null pointer input 147 // INTERNAL_ERROR on IO error 148 static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( 149 const Filesystem* filesystem, const std::string& base_dir, 150 const Clock* clock, const SchemaStore* schema_store, 151 const FeatureFlags* feature_flags, 152 bool force_recovery_and_revalidate_documents, bool pre_mapping_fbv, 153 bool use_persistent_hash_map, int32_t compression_level, 154 InitializeStatsProto* initialize_stats); 155 156 // Discards all derived data in the document store. 157 // 158 // Returns: 159 // OK on success or nothing to discard 160 // INTERNAL_ERROR on any I/O errors 161 static libtextclassifier3::Status DiscardDerivedFiles( 162 const Filesystem* filesystem, const std::string& base_dir); 163 164 // Returns the maximum DocumentId that the DocumentStore has assigned. If 165 // there has not been any DocumentIds assigned, i.e. the DocumentStore is 166 // empty, then kInvalidDocumentId is returned. This does not filter out 167 // DocumentIds of deleted or expired documents. last_added_document_id()168 DocumentId last_added_document_id() const { 169 if (document_id_mapper_->num_elements() == 0) { 170 return kInvalidDocumentId; 171 } 172 return document_id_mapper_->num_elements() - 1; 173 } 174 175 // Returns the number of documents. The result does not filter out DocumentIds 176 // of deleted or expired documents. num_documents()177 int num_documents() const { return document_id_mapper_->num_elements(); } 178 179 // Puts the document into document store. 180 // 181 // If put_document_stats is present, the fields related to DocumentStore will 182 // be populated. 183 // 184 // Returns: 185 // - On success, a PutResult with the DocumentId of the newly added document 186 // and the old DocumentId before replacement. If this is a new document, 187 // then old DocumentId will be kInvalidDocumentId. 
188 // - RESOURCE_EXHAUSTED if exceeds maximum number of allowed documents 189 // - FAILED_PRECONDITION if schema hasn't been set yet 190 // - NOT_FOUND if the schema_type or a property config of the document 191 // doesn't exist in schema 192 // - INTERNAL_ERROR on IO error 193 struct PutResult { 194 DocumentId old_document_id = kInvalidDocumentId; 195 DocumentId new_document_id = kInvalidDocumentId; 196 was_replacementPutResult197 bool was_replacement() const { 198 return old_document_id != kInvalidDocumentId; 199 } 200 }; 201 libtextclassifier3::StatusOr<PutResult> Put( 202 const DocumentProto& document, int32_t num_tokens = 0, 203 PutDocumentStatsProto* put_document_stats = nullptr); 204 libtextclassifier3::StatusOr<PutResult> Put( 205 DocumentProto&& document, int32_t num_tokens = 0, 206 PutDocumentStatsProto* put_document_stats = nullptr); 207 208 // Finds and returns the document identified by the given key (namespace + 209 // uri). If 'clear_internal_fields' is true, document level data that's 210 // generated internally by DocumentStore is cleared. 211 // 212 // Returns: 213 // The document found on success 214 // NOT_FOUND if the key doesn't exist or document has been deleted 215 // INTERNAL_ERROR on IO error 216 libtextclassifier3::StatusOr<DocumentProto> Get( 217 std::string_view name_space, std::string_view uri, 218 bool clear_internal_fields = true) const; 219 220 // Finds and returns the document identified by the given document id. If 221 // 'clear_internal_fields' is true, document level data that's generated 222 // internally by DocumentStore is cleared. 
223 // 224 // Returns: 225 // The document found on success 226 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 227 // maximum value 228 // NOT_FOUND if the document doesn't exist or has been deleted 229 // INTERNAL_ERROR on IO error 230 libtextclassifier3::StatusOr<DocumentProto> Get( 231 DocumentId document_id, bool clear_internal_fields = true) const; 232 233 // Returns the ScorablePropertySet of the document specified by the 234 // DocumentId. 235 // 236 // Returns: 237 // - ScorablePropertySet on success 238 // - nullptr when the ScorablePropertySet fails to be created, it could be 239 // due to that: 240 // - |document_id| is invalid, or 241 // - no ScorablePropertySetProto is found for the document in the cache 242 // - internal IO error 243 std::unique_ptr<ScorablePropertySet> GetScorablePropertySet( 244 DocumentId document_id, int64_t current_time_ms) const; 245 246 // Returns all namespaces which have at least 1 active document (not deleted 247 // or expired). Order of namespaces is undefined. 248 std::vector<std::string> GetAllNamespaces() const; 249 250 // Deletes the document identified by the given namespace and uri. The 251 // document proto will be erased immediately. 252 // 253 // NOTE: 254 // Space is not reclaimed for deleted documents until Optimize() is 255 // called. 256 // 257 // Returns: 258 // OK on success 259 // NOT_FOUND if no document exists with namespace, uri 260 // INTERNAL_ERROR on IO error 261 libtextclassifier3::Status Delete(std::string_view name_space, 262 std::string_view uri, 263 int64_t current_time_ms); 264 265 // Deletes the document identified by the given document_id. The document 266 // proto will be erased immediately. 267 // 268 // NOTE: 269 // Space is not reclaimed for deleted documents until Optimize() is 270 // called. 271 // 272 // Returns: 273 // OK on success 274 // NOT_FOUND if the document doesn't exist (i.e. 
deleted or expired) 275 // INTERNAL_ERROR on IO error 276 // INVALID_ARGUMENT if document_id is invalid. 277 libtextclassifier3::Status Delete(DocumentId document_id, 278 int64_t current_time_ms); 279 280 // Returns the NamespaceId of the string namespace 281 // 282 // Returns: 283 // NamespaceId on success 284 // NOT_FOUND if the namespace doesn't exist 285 // INTERNAL_ERROR on IO error 286 libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId( 287 std::string_view name_space) const; 288 289 // Helper method to find a DocumentId that is associated with the given 290 // namespace and uri. 291 // 292 // NOTE: The DocumentId may refer to a invalid document (deleted 293 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 294 // refers to a valid Document. 295 // 296 // Returns: 297 // A DocumentId on success 298 // NOT_FOUND if the key doesn't exist 299 // INTERNAL_ERROR on IO error 300 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 301 std::string_view name_space, std::string_view uri) const; 302 303 // Helper method to find a DocumentId that is associated with the given 304 // NamespaceIdFingerprint. 305 // 306 // NOTE: The DocumentId may refer to a invalid document (deleted 307 // or expired). Callers can call DoesDocumentExist(document_id) to ensure it 308 // refers to a valid Document. 309 // 310 // Returns: 311 // A DocumentId on success 312 // NOT_FOUND if the key doesn't exist 313 // INTERNAL_ERROR on IO error 314 libtextclassifier3::StatusOr<DocumentId> GetDocumentId( 315 const NamespaceIdFingerprint& doc_namespace_id_uri_fingerprint) const; 316 317 // Returns the CorpusId associated with the given namespace and schema. 
318 // 319 // Returns: 320 // A CorpusId on success 321 // NOT_FOUND if the key doesn't exist 322 // INTERNAL_ERROR on IO error 323 libtextclassifier3::StatusOr<CorpusId> GetCorpusId( 324 const std::string_view name_space, const std::string_view schema) const; 325 326 // Returns the ResultGroupingEntryId associated with the given namespace 327 // and schema. 328 // 329 // NOTE: ResultGroupingEntryIds that are generated by calls with different 330 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 331 // are only guarenteed to be unique within their own ResultGroupingType. 332 // 333 // Returns: 334 // A ResultGroupingEntryId on success 335 // NOT_FOUND if the key doesn't exist 336 // INTERNAL_ERROR on IO error 337 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 338 ResultSpecProto::ResultGroupingType result_group_type, 339 const std::string_view name_space, const std::string_view schema) const; 340 341 // Returns the ResultGrouping Entry Id associated with the given NamespaceId 342 // and SchemaTypeId 343 // 344 // NOTE: ResultGroupingEntryIds that are generated by calls with different 345 // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds 346 // are only guarenteed to be unique within their own ResultGroupingType. 347 // 348 // Returns: 349 // A ResultGroupingEntryId on success 350 // NOT_FOUND if the key doesn't exist 351 // INTERNAL_ERROR on IO error 352 libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId( 353 ResultSpecProto::ResultGroupingType result_group_type, 354 const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const; 355 356 // Returns the DocumentAssociatedScoreData of the document specified by the 357 // DocumentId. 
358 // 359 // Returns: 360 // DocumentAssociatedScoreData on success 361 // NOT_FOUND if the document or the score data is not found 362 libtextclassifier3::StatusOr<DocumentAssociatedScoreData> 363 GetDocumentAssociatedScoreData(DocumentId document_id) const; 364 365 // Returns the CorpusAssociatedScoreData of the corpus specified by the 366 // corpus_id. 367 // 368 // NOTE: This does not check if the corpus exists and will return the 369 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 370 // that corpus have been deleted. 371 // 372 // Returns: 373 // CorpusAssociatedScoreData on success 374 // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen 375 // CorpusIds 376 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 377 GetCorpusAssociatedScoreData(CorpusId corpus_id) const; 378 379 // Gets the document filter data if a document exists and is not expired. 380 // Otherwise, will get a false optional. 381 // 382 // Existence means it hasn't been deleted and it hasn't expired yet. 383 // 384 // Returns: 385 // True:DocumentFilterData if the given document exists. 386 // False if the given document doesn't exist. 387 std::optional<DocumentFilterData> GetAliveDocumentFilterData( 388 DocumentId document_id, int64_t current_time_ms) const; 389 390 // Gets the document filter data if a document has not been deleted. If the 391 // document is expired but not deleted, will still return a valid document 392 // filter data. Otherwise, will get a false optional. 393 // 394 // Returns: 395 // True:DocumentFilterData if the given document exists. 396 // False if the given document has been deleted. 397 std::optional<DocumentFilterData> GetNonDeletedDocumentFilterData( 398 DocumentId document_id) const; 399 400 // Gets the SchemaTypeId of a document. 401 // 402 // Returns: 403 // SchemaTypeId on success 404 // kInvalidSchemaTypeId if the document is deleted or expired. 
GetSchemaTypeId(DocumentId document_id,int64_t current_time_ms)405 SchemaTypeId GetSchemaTypeId(DocumentId document_id, 406 int64_t current_time_ms) const { 407 std::optional<DocumentFilterData> document_filter_data_optional = 408 GetAliveDocumentFilterData(document_id, current_time_ms); 409 if (document_filter_data_optional) { 410 return document_filter_data_optional.value().schema_type_id(); 411 } else { 412 return kInvalidSchemaTypeId; 413 } 414 } 415 416 // Gets the usage scores of a document. 417 // 418 // Returns: 419 // UsageScores on success 420 // nullopt if there are no usage scores stored for the requested docid. 421 std::optional<UsageStore::UsageScores> GetUsageScores( 422 DocumentId document_id, int64_t current_time_ms) const; 423 424 // Reports usage. The corresponding usage scores of the specified document in 425 // the report will be updated. 426 // 427 // Returns: 428 // OK on success 429 // NOT_FOUND if the [namesapce + uri] key in the report doesn't exist 430 // INTERNAL_ERROR on I/O errors. 431 libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); 432 433 // Deletes all documents belonging to the given namespace. The documents will 434 // be erased immediately. 435 // 436 // NOTE: 437 // Space is not reclaimed for deleted documents until Optimize() is 438 // called. 439 // 440 // Returns: 441 // OK on success 442 // NOT_FOUND if namespace doesn't exist 443 // INTERNAL_ERROR on IO error 444 DeleteByGroupResult DeleteByNamespace(std::string_view name_space); 445 446 // Deletes all documents belonging to the given schema type. The documents 447 // will be erased immediately. 448 // 449 // NOTE: 450 // Space is not reclaimed for deleted documents until Optimize() is 451 // called. 
452 // 453 // Returns: 454 // OK on success 455 // NOT_FOUND if schema_type doesn't exist 456 // INTERNAL_ERROR on IO error 457 DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); 458 459 // Syncs all the data and metadata changes to disk. 460 // 461 // Returns: 462 // OK on success 463 // INTERNAL on I/O error 464 libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); 465 466 // Calculates the StorageInfo for the Document Store. 467 // 468 // If an IO error occurs while trying to calculate the value for a field, then 469 // that field will be set to -1. 470 DocumentStorageInfoProto GetStorageInfo() const; 471 472 // Update any derived data off of the SchemaStore with the new SchemaStore. 473 // This may include pointers, SchemaTypeIds, etc. 474 // 475 // NOTE: This function may delete documents. A document may be invalidated by 476 // the new SchemaStore, such as failing validation or having its schema type 477 // deleted from the schema. 478 // 479 // This is best used if the caller is unsure about what's changed in the 480 // SchemaStore, and wants to update all information no matter what. If the 481 // caller does know what has changed, then it's recommended to call 482 // OptimizedUpdateSchemaStore. 483 // 484 // Returns; 485 // OK on success 486 // INTERNAL_ERROR on IO error 487 libtextclassifier3::Status UpdateSchemaStore(const SchemaStore* schema_store); 488 489 // Performs the same funtionality as UpdateSchemaStore, but this can be more 490 // optimized in terms of less disk reads and less work if we know exactly 491 // what's changed between the old and new SchemaStore. 492 // 493 // Returns; 494 // OK on success 495 // INTERNAL_ERROR on IO error 496 libtextclassifier3::Status OptimizedUpdateSchemaStore( 497 const SchemaStore* schema_store, 498 const SchemaStore::SetSchemaResult& set_schema_result); 499 500 // Re-generates the scorable property cache for documents with the given 501 // schema types. 
502 // 503 // Returns: 504 // OK on success 505 // INTERNAL_ERROR on IO error 506 libtextclassifier3::Status RegenerateScorablePropertyCache( 507 const std::unordered_set<SchemaTypeId>& schema_type_ids); 508 509 // Reduces internal file sizes by reclaiming space of deleted documents and 510 // regenerating derived files. 511 // 512 // NOTE: The tasks in this method are too expensive to be executed in 513 // real-time. The caller should decide how frequently and when to call this 514 // method based on device usage. 515 // 516 // Returns: 517 // OK on success 518 // INTERNAL_ERROR on IO error 519 libtextclassifier3::Status Optimize(); 520 521 struct OptimizeResult { 522 // A vector that maps old document id to new document id. 523 std::vector<DocumentId> document_id_old_to_new; 524 525 // A vector that maps old namespace id to new namespace id. Will be empty if 526 // should_rebuild_index is set to true. 527 std::vector<NamespaceId> namespace_id_old_to_new; 528 529 // A boolean flag that hints the caller (usually IcingSearchEngine) if it 530 // should rebuild index instead of adopting the id changes via the 2 vectors 531 // above. It will be set to true if finding any id inconsistency. 532 bool should_rebuild_index = false; 533 534 // A set of blob handles that are dead and need to be removed. 535 std::unordered_set<std::string> dead_blob_handles; 536 }; 537 // Copy data from current base directory into a new directory. Any outdated or 538 // deleted data won't be copied. During the process, document/namespace ids 539 // will be reassigned so any files / classes that are based on old 540 // document/namespace ids may be outdated. 541 // 542 // stats will be set if non-null. 543 // 544 // NOTE: The tasks in this method are too expensive to be executed in 545 // real-time. The caller should decide how frequently and when to call this 546 // method based on device usage. 
547 // 548 // Returns: 549 // OptimizeResult which contains a vector mapping from old document id to 550 // new document id and another vector mapping from old namespace id to new 551 // namespace id, on success 552 // INVALID_ARGUMENT if new_directory is same as current base directory 553 // INTERNAL_ERROR on IO error 554 libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto( 555 const std::string& new_directory, const LanguageSegmenter* lang_segmenter, 556 std::unordered_set<std::string>&& expired_blob_handles, 557 OptimizeStatsProto* stats = nullptr) const; 558 559 // Calculates status for a potential Optimize call. Includes how many docs 560 // there are vs how many would be optimized away. And also includes an 561 // estimated size gains, in bytes, if Optimize were called. 562 // 563 // Returns: 564 // OptimizeInfo on success 565 // INTERNAL_ERROR on IO error 566 libtextclassifier3::StatusOr<OptimizeInfo> GetOptimizeInfo() const; 567 568 // Update, replace and persist the header file. Creates the header file if it 569 // doesn't exist. 570 // 571 // Returns: 572 // OK on success 573 // INTERNAL on I/O error 574 libtextclassifier3::StatusOr<Crc32> UpdateChecksum(); 575 576 // Calculates and returns the checksum of the document store. 577 // 578 // Returns: 579 // OK on success 580 // INTERNAL on I/O error 581 libtextclassifier3::StatusOr<Crc32> GetChecksum() const; 582 583 // Get debug information for the document store. 584 // verbosity <= 0, simplest debug information 585 // verbosity > 0, also return the total number of documents and tokens in each 586 // (namespace, schema type) pair. 587 // 588 // Returns: 589 // DocumentDebugInfoProto on success 590 // INTERNAL_ERROR on IO errors, crc compute error 591 libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo( 592 int verbosity) const; 593 594 private: 595 // Use DocumentStore::Create() to instantiate. 
596 explicit DocumentStore(const Filesystem* filesystem, 597 std::string_view base_dir, const Clock* clock, 598 const SchemaStore* schema_store, 599 const FeatureFlags* feature_flags, 600 bool pre_mapping_fbv, bool use_persistent_hash_map, 601 int32_t compression_level); 602 603 const Filesystem* const filesystem_; 604 const std::string base_dir_; 605 const Clock& clock_; 606 const FeatureFlags& feature_flags_; // Does not own. 607 608 // Handles the ground truth schema and all of the derived data off of the 609 // schema 610 const SchemaStore* schema_store_; 611 612 // Used to validate incoming documents 613 DocumentValidator document_validator_; 614 615 // Flag indicating whether memory map max possible file size for underlying 616 // FileBackedVector before growing the actual file size. 617 bool pre_mapping_fbv_; 618 619 // Flag indicating whether use persistent hash map as the key mapper (if 620 // false, then fall back to dynamic trie key mapper). Note: we only use 621 // persistent hash map for uri mapper if it is true. 622 bool use_persistent_hash_map_; 623 624 const int32_t compression_level_; 625 626 // A log used to store all documents, it serves as a ground truth of doc 627 // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 628 std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; 629 630 // Key (namespace + uri) to DocumentId mapping 631 std::unique_ptr< 632 KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>> 633 document_key_mapper_; 634 635 // DocumentId to file offset mapping 636 std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_; 637 638 // A cache of document associated scores. The ground truth of the scores is 639 // DocumentProto stored in document_log_. 
This cache contains: 640 // - CorpusId 641 // - Document score 642 // - Document creation timestamp in seconds 643 // - Document length in number of tokens 644 // - Index of the ScorablePropertySetProto at the scorable_property_cache_ 645 std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_; 646 647 // A cache of document scorable properties. The ground truth of the data is 648 // DocumentProto stored in document_log_. 649 std::unique_ptr<MemoryMappedFileBackedProtoLog<ScorablePropertySetProto>> 650 scorable_property_cache_; 651 652 // A cache of data, indexed by DocumentId, used to filter documents. Currently 653 // contains: 654 // - NamespaceId 655 // - SchemaTypeId 656 // - Expiration timestamp in seconds 657 std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_; 658 659 // A cache of corpus associated scores. The ground truth of the scores is 660 // DocumentProto stored in document_log_. This cache contains: 661 // - Number of documents belonging to the corpus score 662 // - The sum of the documents' lengths, in number of tokens. 663 std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>> 664 corpus_score_cache_; 665 666 // Maps namespaces to a densely-assigned unique id. Namespaces are assigned an 667 // id when the first document belonging to that namespace is added to the 668 // DocumentStore. Namespaces may be removed from the mapper during compaction. 669 std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_; 670 671 // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned 672 // unique id. A coprus is assigned an 673 // id when the first document belonging to that corpus is added to the 674 // DocumentStore. Corpus ids may be removed from the mapper during compaction. 675 std::unique_ptr< 676 KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>> 677 corpus_mapper_; 678 679 // A storage class that caches all usage scores. Usage scores are not 680 // considered as ground truth. 
Usage scores are associated with document ids 681 // so they need to be updated when document ids change. 682 std::unique_ptr<UsageStore> usage_store_; 683 684 // Used internally to indicate whether the class has been initialized. This is 685 // to guard against cases where the object has been created, but Initialize 686 // fails in the constructor. If we have successfully exited the constructor, 687 // then this field can be ignored. Clients of DocumentStore should not need to 688 // worry about this field. 689 bool initialized_ = false; 690 691 struct InitializeResult { 692 DataLoss data_loss; 693 694 // A boolean flag indicating if derived files of the document store have 695 // been regenerated or not. This is usually a signal for callers to detect 696 // if any id assignment has changed (e.g. NamespaceId). 697 bool derived_files_regenerated; 698 }; 699 libtextclassifier3::StatusOr<InitializeResult> Initialize( 700 bool force_recovery_and_revalidate_documents, 701 InitializeStatsProto* initialize_stats); 702 703 // Creates sub-components and verifies the integrity of each sub-component. 704 // This assumes that the the underlying files already exist, and will return 705 // an error if it doesn't find what it's expecting. 706 // 707 // Returns an error if subcomponents failed to initialize successfully. 708 // INTERNAL_ERROR on IO error 709 libtextclassifier3::Status InitializeExistingDerivedFiles(); 710 711 // Re-generates all files derived from the ground truth: the document log. 712 // 713 // revalidate_documents=true will also cause each document to be revalidated 714 // the schema as it is read out of the document log. 715 // 716 // NOTE: if this function fails, the only thing we can do is to retry it until 717 // it succeeds or prevent the initialization of a DocumentStore. The 718 // DocumentStore object wouldn't work reliably if this fails. 719 // 720 // Steps: 721 // 1. Delete all derived files. 722 // 2. 
Iterate through document log, put data into new key mapper and 723 // document_id 724 // mapper. 725 // 3. Create header and store the updated combined checksum 726 libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); 727 728 // Resets the unique_ptr to the document_key_mapper, deletes the underlying 729 // file, and re-creates a new instance of the document_key_mapper . 730 // 731 // Returns OK or any IO errors. 732 libtextclassifier3::Status ResetDocumentKeyMapper(); 733 734 // Resets the unique_ptr to the document_id_mapper, deletes the underlying 735 // file, and re-creates a new instance of the document_id_mapper. 736 // 737 // Returns OK or any IO errors. 738 libtextclassifier3::Status ResetDocumentIdMapper(); 739 740 // Resets the unique_ptr to the score_cache, deletes the underlying file, and 741 // re-creates a new instance of the score_cache. 742 // 743 // Returns OK or any IO errors. 744 libtextclassifier3::Status ResetDocumentAssociatedScoreCache(); 745 746 // Resets the unique_ptr to the |scorable_property_cache_|, deletes the 747 // underlying file, and re-creates a new instance of it. 748 // 749 // Returns OK or any IO errors. 750 libtextclassifier3::Status ResetScorablePropertyCache(); 751 752 // Resets the unique_ptr to the corpus_score_cache, deletes the underlying 753 // file, and re-creates a new instance of the corpus_score_cache. 754 // 755 // Returns OK or any IO errors. 756 libtextclassifier3::Status ResetCorpusAssociatedScoreCache(); 757 758 // Resets the unique_ptr to the filter_cache, deletes the underlying file, and 759 // re-creates a new instance of the filter_cache. 760 // 761 // Returns OK or any IO errors. 762 libtextclassifier3::Status ResetFilterCache(); 763 764 // Resets the unique_ptr to the namespace_mapper, deletes the underlying file, 765 // and re-creates a new instance of the namespace_mapper. 766 // 767 // Returns OK or any IO errors. 
768 libtextclassifier3::Status ResetNamespaceMapper(); 769 770 // Resets the unique_ptr to the corpus_mapper, deletes the underlying file, 771 // and re-creates a new instance of the corpus_mapper. 772 // 773 // Returns OK or any IO errors. 774 libtextclassifier3::Status ResetCorpusMapper(); 775 776 // Checks if the header exists already. This does not create the header file 777 // if it doesn't exist. 778 bool HeaderExists(); 779 780 libtextclassifier3::StatusOr<PutResult> InternalPut( 781 DocumentProto&& document, 782 PutDocumentStatsProto* put_document_stats = nullptr); 783 784 // Helper function to do batch deletes. Documents with the given 785 // "namespace_id" and "schema_type_id" will be deleted. If callers don't need 786 // to specify the namespace or schema type, pass in kInvalidNamespaceId or 787 // kInvalidSchemaTypeId. The document protos with their derived data will be 788 // erased / cleared immediately. 789 // 790 // NOTE: Space is not reclaimed in the derived files until Optimize() is 791 // called. 792 // 793 // Returns: 794 // Number of documents that were actually updated to be deleted 795 // INTERNAL_ERROR on IO error 796 libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, 797 SchemaTypeId schema_type_id); 798 799 // Returns the CorpusAssociatedScoreData of the corpus specified by the 800 // corpus_id. 801 // 802 // If the corpus_id has never been seen before, it returns a 803 // CorpusAssociatedScoreData with properties set to default values. 804 // 805 // NOTE: This does not check if the corpus exists and will return the 806 // CorpusAssociatedScoreData of the corpus even if all documents belonging to 807 // that corpus have been deleted. 808 // 809 // Returns: 810 // CorpusAssociatedScoreData on success 811 libtextclassifier3::StatusOr<CorpusAssociatedScoreData> 812 GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; 813 814 // Check if a document exists. 
Existence means it hasn't been deleted and it 815 // hasn't expired yet. 816 // 817 // Returns: 818 // OK if the document exists 819 // INVALID_ARGUMENT if document_id is less than 0 or greater than the 820 // maximum value 821 // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) 822 // INTERNAL_ERROR on IO error 823 libtextclassifier3::Status DoesDocumentExistWithStatus( 824 DocumentId document_id) const; 825 826 // Checks if a document has been deleted. 827 // 828 // This is for internal-use only because we assume that the document_id is 829 // already valid. If you're unsure if the document_id is valid, use 830 // DoesDocumentExist(document_id) instead, which will perform those additional 831 // checks. 832 bool IsDeleted(DocumentId document_id) const; 833 834 // Checks if a document has expired. 835 // 836 // This is for internal-use only because we assume that the document_id is 837 // already valid. If you're unsure if the document_id is valid, use 838 // DoesDocumentExist(document_id) instead, which will perform those additional 839 // checks. 840 // 841 // Returns: 842 // The document's DocumentFilterData if the given document isn't expired. 843 // std::nullopt if the given document is expired. 844 std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData( 845 DocumentId document_id, int64_t current_time_ms) const; 846 847 // Updates the entry in the score cache for document_id. 848 libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( 849 DocumentId document_id, const DocumentAssociatedScoreData& score_data); 850 851 // Updates the entry in the corpus score cache for corpus_id. 852 libtextclassifier3::Status UpdateCorpusAssociatedScoreCache( 853 CorpusId corpus_id, const CorpusAssociatedScoreData& score_data); 854 855 // Updates the entry in the filter cache for document_id.
856 libtextclassifier3::Status UpdateFilterCache( 857 DocumentId document_id, const DocumentFilterData& filter_data); 858 859 // Helper method to clear the derived data of a document. 860 libtextclassifier3::Status ClearDerivedData(DocumentId document_id); 861 862 // Sets usage scores for the given document. 863 libtextclassifier3::Status SetUsageScores( 864 DocumentId document_id, const UsageStore::UsageScores& usage_scores); 865 866 // Returns: 867 // - on success, a DocumentStorageInfoProto with the fields relating to the 868 // size of Document Store member variables populated. 869 // - INTERNAL on failure to get file size. NOTE(review): the declared return type is a plain DocumentStorageInfoProto, not a StatusOr, so this error cannot be reported through the return value - confirm how failures are surfaced. 870 DocumentStorageInfoProto GetMemberStorageInfo() const; 871 872 // Returns: 873 // - on success, the storage_info that was passed in but with the number of 874 // alive, deleted and expired documents also set. 875 // - OUT_OF_RANGE, this should never happen. This could only be returned if 876 // the document_id_mapper somehow became larger than the filter cache. NOTE(review): the declared return type is a plain DocumentStorageInfoProto, not a StatusOr, so OUT_OF_RANGE cannot be reported through the return value - confirm how this case is handled. 877 DocumentStorageInfoProto CalculateDocumentStatusCounts( 878 DocumentStorageInfoProto storage_info) const; 879 880 // Returns: 881 // - on success, a RepeatedPtrField for CorpusInfo collected. 882 // - OUT_OF_RANGE, this should never happen. 883 libtextclassifier3::StatusOr< 884 google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> 885 CollectCorpusInfo() const; 886 887 // Extracts the ScorablePropertySetProto from the |document| and adds it to 888 // the |scorable_property_cache_|. 889 // 890 // Returns: 891 // - Index of the newly inserted ScorablePropertySetProto in the 892 // |scorable_property_cache_|. 893 // - kInvalidScorablePropertyCacheIndex if the schema contains no 894 // scorable properties. 895 // - INVALID_ARGUMENT if |schema_type_id| is invalid, or the converted 896 // ScorablePropertySetProto exceeds the size limit of 16MiB. 897 // - INTERNAL_ERROR on IO error.
898 libtextclassifier3::StatusOr<int> UpdateScorablePropertyCache( 899 const DocumentProto& document, SchemaTypeId schema_type_id); 900 }; 901 902 } // namespace lib 903 } // namespace icing 904 905 #endif // ICING_STORE_DOCUMENT_STORE_H_ 906