1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_ 16 #define ICING_FILE_PERSISTENT_HASH_MAP_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/file/file-backed-vector.h" 25 #include "icing/file/filesystem.h" 26 #include "icing/file/memory-mapped-file.h" 27 #include "icing/file/persistent-storage.h" 28 #include "icing/util/crc32.h" 29 30 namespace icing { 31 namespace lib { 32 33 // Low level persistent hash map. 34 // It supports variant length serialized key + fixed length serialized value. 35 // Key and value can be any type, but callers should serialize key/value by 36 // themselves and pass raw bytes into the hash map, and the serialized key 37 // should not contain termination character '\0'. 38 class PersistentHashMap : public PersistentStorage { 39 public: 40 // For iterating through persistent hash map. The order is not guaranteed. 41 // 42 // Not thread-safe. 43 // 44 // Change in underlying persistent hash map invalidates iterator. 45 class Iterator { 46 public: 47 // Advance to the next entry. 48 // 49 // Returns: 50 // True on success, otherwise false. 51 bool Advance(); 52 GetIndex()53 int32_t GetIndex() const { return curr_kv_idx_; } 54 55 // Get the key. 56 // 57 // REQUIRES: The preceding call for Advance() is true. GetKey()58 std::string_view GetKey() const { 59 return std::string_view(map_->kv_storage_->array() + curr_kv_idx_, 60 curr_key_len_); 61 } 62 63 // Get the memory mapped address of the value. 64 // 65 // REQUIRES: The preceding call for Advance() is true. GetValue()66 const void* GetValue() const { 67 return static_cast<const void*>(map_->kv_storage_->array() + 68 curr_kv_idx_ + curr_key_len_ + 1); 69 } 70 71 private: Iterator(const PersistentHashMap * map)72 explicit Iterator(const PersistentHashMap* map) 73 : map_(map), curr_kv_idx_(0), curr_key_len_(0) {} 74 75 // Does not own 76 const PersistentHashMap* map_; 77 78 int32_t curr_kv_idx_; 79 int32_t curr_key_len_; 80 81 friend class PersistentHashMap; 82 }; 83 84 // Metadata file layout: <Crcs><Info> 85 static constexpr int32_t kCrcsMetadataFileOffset = 0; 86 static constexpr int32_t kInfoMetadataFileOffset = 87 static_cast<int32_t>(sizeof(Crcs)); 88 89 struct Info { 90 static constexpr int32_t kMagic = 0x653afd7b; 91 92 int32_t magic; 93 int32_t value_type_size; 94 int32_t max_load_factor_percent; 95 int32_t num_deleted_entries; 96 int32_t num_deleted_key_value_bytes; 97 GetChecksumInfo98 Crc32 GetChecksum() const { 99 return Crc32( 100 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 101 } 102 } __attribute__((packed)); 103 static_assert(sizeof(Info) == 20, ""); 104 105 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 106 static_assert(kMetadataFileSize == 32, ""); 107 108 // Bucket 109 class Bucket { 110 public: 111 // Absolute max # of buckets allowed. Since we're using FileBackedVector to 112 // store buckets, add some static_asserts to ensure numbers here are 113 // compatible with FileBackedVector. 114 static constexpr int32_t kMaxNumBuckets = 1 << 24; 115 116 explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex) head_entry_index_(head_entry_index)117 : head_entry_index_(head_entry_index) {} 118 119 // For FileBackedVector 120 bool operator==(const Bucket& other) const { 121 return head_entry_index_ == other.head_entry_index_; 122 } 123 head_entry_index()124 int32_t head_entry_index() const { return head_entry_index_; } set_head_entry_index(int32_t head_entry_index)125 void set_head_entry_index(int32_t head_entry_index) { 126 head_entry_index_ = head_entry_index; 127 } 128 129 private: 130 int32_t head_entry_index_; 131 } __attribute__((packed)); 132 static_assert(sizeof(Bucket) == 4, ""); 133 static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize, 134 "Bucket type size is inconsistent with FileBackedVector " 135 "element type size"); 136 static_assert(Bucket::kMaxNumBuckets <= 137 (FileBackedVector<Bucket>::kMaxFileSize - 138 FileBackedVector<Bucket>::Header::kHeaderSize) / 139 FileBackedVector<Bucket>::kElementTypeSize, 140 "Max # of buckets cannot fit into FileBackedVector"); 141 142 // Entry 143 class Entry { 144 public: 145 // Absolute max # of entries allowed. Since we're using FileBackedVector to 146 // store entries, add some static_asserts to ensure numbers here are 147 // compatible with FileBackedVector. 148 // 149 // Still the actual max # of entries are determined by key-value storage, 150 // since length of the key varies and affects # of actual key-value pairs 151 // that can be stored. 152 static constexpr int32_t kMaxNumEntries = 1 << 23; 153 static constexpr int32_t kMaxIndex = kMaxNumEntries - 1; 154 static constexpr int32_t kInvalidIndex = -1; 155 Entry(int32_t key_value_index,int32_t next_entry_index)156 explicit Entry(int32_t key_value_index, int32_t next_entry_index) 157 : key_value_index_(key_value_index), 158 next_entry_index_(next_entry_index) {} 159 160 bool operator==(const Entry& other) const { 161 return key_value_index_ == other.key_value_index_ && 162 next_entry_index_ == other.next_entry_index_; 163 } 164 key_value_index()165 int32_t key_value_index() const { return key_value_index_; } set_key_value_index(int32_t key_value_index)166 void set_key_value_index(int32_t key_value_index) { 167 key_value_index_ = key_value_index; 168 } 169 next_entry_index()170 int32_t next_entry_index() const { return next_entry_index_; } set_next_entry_index(int32_t next_entry_index)171 void set_next_entry_index(int32_t next_entry_index) { 172 next_entry_index_ = next_entry_index; 173 } 174 175 private: 176 int32_t key_value_index_; 177 int32_t next_entry_index_; 178 } __attribute__((packed)); 179 static_assert(sizeof(Entry) == 8, ""); 180 static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize, 181 "Entry type size is inconsistent with FileBackedVector " 182 "element type size"); 183 static_assert(Entry::kMaxNumEntries <= 184 (FileBackedVector<Entry>::kMaxFileSize - 185 FileBackedVector<Entry>::Header::kHeaderSize) / 186 FileBackedVector<Entry>::kElementTypeSize, 187 "Max # of entries cannot fit into FileBackedVector"); 188 189 // Key-value serialized type 190 static constexpr int32_t kMaxKVTotalByteSize = 1 << 28; 191 static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1; 192 static constexpr int32_t kInvalidKVIndex = -1; 193 static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize, 194 "Char type size is inconsistent with FileBackedVector element " 195 "type size"); 196 static_assert(kMaxKVTotalByteSize <= 197 FileBackedVector<char>::kMaxFileSize - 198 FileBackedVector<char>::Header::kHeaderSize, 199 "Max total byte size of key value pairs cannot fit into " 200 "FileBackedVector"); 201 202 static constexpr int32_t kMaxValueTypeSize = 1 << 10; 203 204 struct Options { 205 static constexpr int32_t kDefaultMaxLoadFactorPercent = 100; 206 static constexpr int32_t kDefaultAverageKVByteSize = 32; 207 static constexpr int32_t kDefaultInitNumBuckets = 1 << 13; 208 209 explicit Options( 210 int32_t value_type_size_in, 211 int32_t max_num_entries_in = Entry::kMaxNumEntries, 212 int32_t max_load_factor_percent_in = kDefaultMaxLoadFactorPercent, 213 int32_t average_kv_byte_size_in = kDefaultAverageKVByteSize, 214 int32_t init_num_buckets_in = kDefaultInitNumBuckets, 215 bool pre_mapping_fbv_in = false) value_type_sizeOptions216 : value_type_size(value_type_size_in), 217 max_num_entries(max_num_entries_in), 218 max_load_factor_percent(max_load_factor_percent_in), 219 average_kv_byte_size(average_kv_byte_size_in), 220 init_num_buckets(init_num_buckets_in), 221 pre_mapping_fbv(pre_mapping_fbv_in) {} 222 223 bool IsValid() const; 224 225 // (fixed) size of the serialized value type for hash map. 226 int32_t value_type_size; 227 228 // Max # of entries, default Entry::kMaxNumEntries. 229 int32_t max_num_entries; 230 231 // Percentage of the max loading for the hash map. If load_factor_percent 232 // exceeds max_load_factor_percent, then rehash will be invoked (and # of 233 // buckets will be doubled). 234 // load_factor_percent = 100 * num_keys / num_buckets 235 // 236 // Note that load_factor_percent exceeding 100 is considered valid. 237 int32_t max_load_factor_percent; 238 239 // Average byte size of a key value pair. It is used to estimate kv_storage_ 240 // pre_mapping_mmap_size. 241 int32_t average_kv_byte_size; 242 243 // Initial # of buckets for the persistent hash map. It should be 2's power. 244 // It is used when creating new persistent hash map and ignored when 245 // creating the instance from existing files. 246 int32_t init_num_buckets; 247 248 // Flag indicating whether memory map max possible file size for underlying 249 // FileBackedVector before growing the actual file size. 250 bool pre_mapping_fbv; 251 }; 252 253 static constexpr WorkingPathType kWorkingPathType = 254 WorkingPathType::kDirectory; 255 static constexpr std::string_view kFilePrefix = "persistent_hash_map"; 256 257 // Creates a new PersistentHashMap to read/write/delete key value pairs. 258 // 259 // filesystem: Object to make system level calls 260 // working_path: Specifies the working path for PersistentStorage. 261 // PersistentHashMap uses working path as working directory and 262 // all related files will be stored under this directory. It 263 // takes full ownership and of working_path_, including 264 // creation/deletion. It is the caller's responsibility to 265 // specify correct working path and avoid mixing different 266 // persistent storages together under the same path. Also the 267 // caller has the ownership for the parent directory of 268 // working_path_, and it is responsible for parent directory 269 // creation/deletion. See PersistentStorage for more details 270 // about the concept of working_path. 271 // options: Options instance. 272 // 273 // Returns: 274 // INVALID_ARGUMENT_ERROR if any value in options is invalid. 275 // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 276 // checksum or any other inconsistency. 277 // INTERNAL_ERROR on I/O errors. 278 // Any FileBackedVector errors. 279 static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> 280 Create(const Filesystem& filesystem, std::string working_path, 281 Options options); 282 283 // Deletes PersistentHashMap under working_path. 284 // 285 // Returns: 286 // - OK on success 287 // - INTERNAL_ERROR on I/O error Discard(const Filesystem & filesystem,std::string working_path)288 static libtextclassifier3::Status Discard(const Filesystem& filesystem, 289 std::string working_path) { 290 return PersistentStorage::Discard(filesystem, working_path, 291 kWorkingPathType); 292 } 293 294 ~PersistentHashMap() override; 295 296 // Update a key value pair. If key does not exist, then insert (key, value) 297 // into the storage. Otherwise overwrite the value into the storage. 298 // 299 // REQUIRES: the buffer pointed to by value must be of value_size() 300 // 301 // Returns: 302 // OK on success 303 // RESOURCE_EXHAUSTED_ERROR if # of entries reach options_.max_num_entries 304 // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') 305 // INTERNAL_ERROR on I/O error or any data inconsistency 306 // Any FileBackedVector errors 307 libtextclassifier3::Status Put(std::string_view key, const void* value); 308 309 // If key does not exist, then insert (key, next_value) into the storage. 310 // Otherwise, copy the hash map value into next_value. 311 // 312 // REQUIRES: the buffer pointed to by next_value must be of value_size() 313 // 314 // Returns: 315 // OK on success 316 // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') 317 // INTERNAL_ERROR on I/O error or any data inconsistency 318 // Any FileBackedVector errors 319 libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value); 320 321 // Get the value by key from the storage. If key exists, then copy the hash 322 // map value into into value buffer. Otherwise, return NOT_FOUND_ERROR. 323 // 324 // REQUIRES: the buffer pointed to by value must be of value_size() 325 // 326 // Returns: 327 // OK on success 328 // NOT_FOUND_ERROR if the key doesn't exist 329 // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') 330 // INTERNAL_ERROR on I/O error or any data inconsistency 331 // Any FileBackedVector errors 332 libtextclassifier3::Status Get(std::string_view key, void* value) const; 333 334 // Delete the key value pair from the storage. If key doesn't exist, then do 335 // nothing and return NOT_FOUND_ERROR. 336 // 337 // Returns: 338 // OK on success 339 // NOT_FOUND_ERROR if the key doesn't exist 340 // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0') 341 // INTERNAL_ERROR on I/O error or any data inconsistency 342 // Any FileBackedVector errors 343 libtextclassifier3::Status Delete(std::string_view key); 344 GetIterator()345 Iterator GetIterator() const { return Iterator(this); } 346 347 // Calculates and returns the disk usage (metadata + 3 storages total file 348 // size) in bytes. 349 // 350 // Returns: 351 // Disk usage on success 352 // INTERNAL_ERROR on I/O error 353 libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; 354 355 // Returns the total file size of the all the elements held in the persistent 356 // hash map. File size is in bytes. This excludes the size of any internal 357 // metadata, i.e. crcs/info of persistent hash map, file backed vector's 358 // header. 359 // 360 // Returns: 361 // File size on success 362 // INTERNAL_ERROR on I/O error 363 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; 364 size()365 int32_t size() const { 366 return entry_storage_->num_elements() - info().num_deleted_entries; 367 } 368 empty()369 bool empty() const { return size() == 0; } 370 num_buckets()371 int32_t num_buckets() const { return bucket_storage_->num_elements(); } 372 373 private: 374 struct EntryIndexPair { 375 int32_t target_entry_index; 376 int32_t prev_entry_index; 377 EntryIndexPairEntryIndexPair378 explicit EntryIndexPair(int32_t target_entry_index_in, 379 int32_t prev_entry_index_in) 380 : target_entry_index(target_entry_index_in), 381 prev_entry_index(prev_entry_index_in) {} 382 }; 383 PersistentHashMap(const Filesystem & filesystem,std::string && working_path,Options && options,MemoryMappedFile && metadata_mmapped_file,std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,std::unique_ptr<FileBackedVector<Entry>> entry_storage,std::unique_ptr<FileBackedVector<char>> kv_storage)384 explicit PersistentHashMap( 385 const Filesystem& filesystem, std::string&& working_path, 386 Options&& options, MemoryMappedFile&& metadata_mmapped_file, 387 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage, 388 std::unique_ptr<FileBackedVector<Entry>> entry_storage, 389 std::unique_ptr<FileBackedVector<char>> kv_storage) 390 : PersistentStorage(filesystem, std::move(working_path), 391 kWorkingPathType), 392 options_(std::move(options)), 393 metadata_mmapped_file_(std::make_unique<MemoryMappedFile>( 394 std::move(metadata_mmapped_file))), 395 bucket_storage_(std::move(bucket_storage)), 396 entry_storage_(std::move(entry_storage)), 397 kv_storage_(std::move(kv_storage)), 398 is_info_dirty_(false), 399 is_storage_dirty_(false) {} 400 401 static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> 402 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 403 Options&& options); 404 405 static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>> 406 InitializeExistingFiles(const Filesystem& filesystem, 407 std::string&& working_path, Options&& options); 408 409 libtextclassifier3::Status PersistStoragesToDisk() override; 410 411 libtextclassifier3::Status PersistMetadataToDisk() override; 412 WriteMetadata()413 libtextclassifier3::Status WriteMetadata() override { 414 // PersistentHashMap::Header is mmapped. Therefore, writes occur when the 415 // metadata is modified. So just return OK. 416 return libtextclassifier3::Status::OK; 417 } 418 419 libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override; 420 421 libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override; 422 423 libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override; 424 425 // Find the index of the target entry (that contains the key) from a bucket 426 // (specified by bucket index). Also return the previous entry index, since 427 // Delete() needs it to update the linked list and head entry index. The 428 // caller should specify the desired bucket index. 429 // 430 // Returns: 431 // std::pair<int32_t, int32_t>: target entry index and previous entry index 432 // on success. If not found, then target entry 433 // index will be Entry::kInvalidIndex 434 // INTERNAL_ERROR if any content inconsistency 435 // Any FileBackedVector errors 436 libtextclassifier3::StatusOr<EntryIndexPair> FindEntryIndexByKey( 437 int32_t bucket_idx, std::string_view key) const; 438 439 // Copy the hash map value of the entry into value buffer. 440 // 441 // REQUIRES: entry_idx should be valid. 442 // REQUIRES: the buffer pointed to by value must be of value_size() 443 // 444 // Returns: 445 // OK on success 446 // Any FileBackedVector errors 447 libtextclassifier3::Status CopyEntryValue(int32_t entry_idx, 448 void* value) const; 449 450 // Insert a new key value pair into a bucket (specified by the bucket index). 451 // The caller should specify the desired bucket index and make sure that the 452 // key is not present in the hash map before calling. 453 // 454 // Returns: 455 // OK on success 456 // Any FileBackedVector errors 457 libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key, 458 const void* value); 459 460 // Rehash function. If force_rehash is true or the hash map loading is greater 461 // than max_load_factor, then it will rehash all keys. 462 // 463 // Returns: 464 // OK on success 465 // INTERNAL_ERROR on I/O error or any data inconsistency 466 // Any FileBackedVector errors 467 libtextclassifier3::Status RehashIfNecessary(bool force_rehash); 468 crcs()469 Crcs& crcs() override { 470 return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 471 kCrcsMetadataFileOffset); 472 } 473 crcs()474 const Crcs& crcs() const override { 475 return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 476 kCrcsMetadataFileOffset); 477 } 478 info()479 Info& info() { 480 return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 481 kInfoMetadataFileOffset); 482 } 483 info()484 const Info& info() const { 485 return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 486 kInfoMetadataFileOffset); 487 } 488 SetInfoDirty()489 void SetInfoDirty() { is_info_dirty_ = true; } 490 491 // When the storage is dirty, then the checksum in the info is invalid and 492 // must be recalculated. Therefore, also mark the info as dirty. SetDirty()493 void SetDirty() { 494 SetInfoDirty(); 495 is_storage_dirty_ = true; 496 } 497 is_info_dirty()498 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()499 bool is_storage_dirty() const { return is_storage_dirty_; } 500 501 Options options_; 502 503 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 504 505 // Storages 506 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_; 507 std::unique_ptr<FileBackedVector<Entry>> entry_storage_; 508 std::unique_ptr<FileBackedVector<char>> kv_storage_; 509 510 bool is_info_dirty_; 511 bool is_storage_dirty_; 512 }; 513 514 } // namespace lib 515 } // namespace icing 516 517 #endif // ICING_FILE_PERSISTENT_HASH_MAP_H_ 518