xref: /aosp_15_r20/external/icing/icing/file/persistent-hash-map.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_
16 #define ICING_FILE_PERSISTENT_HASH_MAP_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/file/file-backed-vector.h"
25 #include "icing/file/filesystem.h"
26 #include "icing/file/memory-mapped-file.h"
27 #include "icing/file/persistent-storage.h"
28 #include "icing/util/crc32.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // Low level persistent hash map.
34 // It supports variant length serialized key + fixed length serialized value.
35 // Key and value can be any type, but callers should serialize key/value by
36 // themselves and pass raw bytes into the hash map, and the serialized key
37 // should not contain termination character '\0'.
38 class PersistentHashMap : public PersistentStorage {
39  public:
40   // For iterating through persistent hash map. The order is not guaranteed.
41   //
42   // Not thread-safe.
43   //
44   // Change in underlying persistent hash map invalidates iterator.
45   class Iterator {
46    public:
47     // Advance to the next entry.
48     //
49     // Returns:
50     //   True on success, otherwise false.
51     bool Advance();
52 
GetIndex()53     int32_t GetIndex() const { return curr_kv_idx_; }
54 
55     // Get the key.
56     //
57     // REQUIRES: The preceding call for Advance() is true.
GetKey()58     std::string_view GetKey() const {
59       return std::string_view(map_->kv_storage_->array() + curr_kv_idx_,
60                               curr_key_len_);
61     }
62 
63     // Get the memory mapped address of the value.
64     //
65     // REQUIRES: The preceding call for Advance() is true.
GetValue()66     const void* GetValue() const {
67       return static_cast<const void*>(map_->kv_storage_->array() +
68                                       curr_kv_idx_ + curr_key_len_ + 1);
69     }
70 
71    private:
Iterator(const PersistentHashMap * map)72     explicit Iterator(const PersistentHashMap* map)
73         : map_(map), curr_kv_idx_(0), curr_key_len_(0) {}
74 
75     // Does not own
76     const PersistentHashMap* map_;
77 
78     int32_t curr_kv_idx_;
79     int32_t curr_key_len_;
80 
81     friend class PersistentHashMap;
82   };
83 
84   // Metadata file layout: <Crcs><Info>
85   static constexpr int32_t kCrcsMetadataFileOffset = 0;
86   static constexpr int32_t kInfoMetadataFileOffset =
87       static_cast<int32_t>(sizeof(Crcs));
88 
89   struct Info {
90     static constexpr int32_t kMagic = 0x653afd7b;
91 
92     int32_t magic;
93     int32_t value_type_size;
94     int32_t max_load_factor_percent;
95     int32_t num_deleted_entries;
96     int32_t num_deleted_key_value_bytes;
97 
GetChecksumInfo98     Crc32 GetChecksum() const {
99       return Crc32(
100           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
101     }
102   } __attribute__((packed));
103   static_assert(sizeof(Info) == 20, "");
104 
105   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
106   static_assert(kMetadataFileSize == 32, "");
107 
108   // Bucket
109   class Bucket {
110    public:
111     // Absolute max # of buckets allowed. Since we're using FileBackedVector to
112     // store buckets, add some static_asserts to ensure numbers here are
113     // compatible with FileBackedVector.
114     static constexpr int32_t kMaxNumBuckets = 1 << 24;
115 
116     explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex)
head_entry_index_(head_entry_index)117         : head_entry_index_(head_entry_index) {}
118 
119     // For FileBackedVector
120     bool operator==(const Bucket& other) const {
121       return head_entry_index_ == other.head_entry_index_;
122     }
123 
head_entry_index()124     int32_t head_entry_index() const { return head_entry_index_; }
set_head_entry_index(int32_t head_entry_index)125     void set_head_entry_index(int32_t head_entry_index) {
126       head_entry_index_ = head_entry_index;
127     }
128 
129    private:
130     int32_t head_entry_index_;
131   } __attribute__((packed));
132   static_assert(sizeof(Bucket) == 4, "");
133   static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
134                 "Bucket type size is inconsistent with FileBackedVector "
135                 "element type size");
136   static_assert(Bucket::kMaxNumBuckets <=
137                     (FileBackedVector<Bucket>::kMaxFileSize -
138                      FileBackedVector<Bucket>::Header::kHeaderSize) /
139                         FileBackedVector<Bucket>::kElementTypeSize,
140                 "Max # of buckets cannot fit into FileBackedVector");
141 
142   // Entry
143   class Entry {
144    public:
145     // Absolute max # of entries allowed. Since we're using FileBackedVector to
146     // store entries, add some static_asserts to ensure numbers here are
147     // compatible with FileBackedVector.
148     //
149     // Still the actual max # of entries are determined by key-value storage,
150     // since length of the key varies and affects # of actual key-value pairs
151     // that can be stored.
152     static constexpr int32_t kMaxNumEntries = 1 << 23;
153     static constexpr int32_t kMaxIndex = kMaxNumEntries - 1;
154     static constexpr int32_t kInvalidIndex = -1;
155 
Entry(int32_t key_value_index,int32_t next_entry_index)156     explicit Entry(int32_t key_value_index, int32_t next_entry_index)
157         : key_value_index_(key_value_index),
158           next_entry_index_(next_entry_index) {}
159 
160     bool operator==(const Entry& other) const {
161       return key_value_index_ == other.key_value_index_ &&
162              next_entry_index_ == other.next_entry_index_;
163     }
164 
key_value_index()165     int32_t key_value_index() const { return key_value_index_; }
set_key_value_index(int32_t key_value_index)166     void set_key_value_index(int32_t key_value_index) {
167       key_value_index_ = key_value_index;
168     }
169 
next_entry_index()170     int32_t next_entry_index() const { return next_entry_index_; }
set_next_entry_index(int32_t next_entry_index)171     void set_next_entry_index(int32_t next_entry_index) {
172       next_entry_index_ = next_entry_index;
173     }
174 
175    private:
176     int32_t key_value_index_;
177     int32_t next_entry_index_;
178   } __attribute__((packed));
179   static_assert(sizeof(Entry) == 8, "");
180   static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize,
181                 "Entry type size is inconsistent with FileBackedVector "
182                 "element type size");
183   static_assert(Entry::kMaxNumEntries <=
184                     (FileBackedVector<Entry>::kMaxFileSize -
185                      FileBackedVector<Entry>::Header::kHeaderSize) /
186                         FileBackedVector<Entry>::kElementTypeSize,
187                 "Max # of entries cannot fit into FileBackedVector");
188 
189   // Key-value serialized type
190   static constexpr int32_t kMaxKVTotalByteSize = 1 << 28;
191   static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1;
192   static constexpr int32_t kInvalidKVIndex = -1;
193   static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize,
194                 "Char type size is inconsistent with FileBackedVector element "
195                 "type size");
196   static_assert(kMaxKVTotalByteSize <=
197                     FileBackedVector<char>::kMaxFileSize -
198                         FileBackedVector<char>::Header::kHeaderSize,
199                 "Max total byte size of key value pairs cannot fit into "
200                 "FileBackedVector");
201 
202   static constexpr int32_t kMaxValueTypeSize = 1 << 10;
203 
204   struct Options {
205     static constexpr int32_t kDefaultMaxLoadFactorPercent = 100;
206     static constexpr int32_t kDefaultAverageKVByteSize = 32;
207     static constexpr int32_t kDefaultInitNumBuckets = 1 << 13;
208 
209     explicit Options(
210         int32_t value_type_size_in,
211         int32_t max_num_entries_in = Entry::kMaxNumEntries,
212         int32_t max_load_factor_percent_in = kDefaultMaxLoadFactorPercent,
213         int32_t average_kv_byte_size_in = kDefaultAverageKVByteSize,
214         int32_t init_num_buckets_in = kDefaultInitNumBuckets,
215         bool pre_mapping_fbv_in = false)
value_type_sizeOptions216         : value_type_size(value_type_size_in),
217           max_num_entries(max_num_entries_in),
218           max_load_factor_percent(max_load_factor_percent_in),
219           average_kv_byte_size(average_kv_byte_size_in),
220           init_num_buckets(init_num_buckets_in),
221           pre_mapping_fbv(pre_mapping_fbv_in) {}
222 
223     bool IsValid() const;
224 
225     // (fixed) size of the serialized value type for hash map.
226     int32_t value_type_size;
227 
228     // Max # of entries, default Entry::kMaxNumEntries.
229     int32_t max_num_entries;
230 
231     // Percentage of the max loading for the hash map. If load_factor_percent
232     // exceeds max_load_factor_percent, then rehash will be invoked (and # of
233     // buckets will be doubled).
234     //   load_factor_percent = 100 * num_keys / num_buckets
235     //
236     // Note that load_factor_percent exceeding 100 is considered valid.
237     int32_t max_load_factor_percent;
238 
239     // Average byte size of a key value pair. It is used to estimate kv_storage_
240     // pre_mapping_mmap_size.
241     int32_t average_kv_byte_size;
242 
243     // Initial # of buckets for the persistent hash map. It should be 2's power.
244     // It is used when creating new persistent hash map and ignored when
245     // creating the instance from existing files.
246     int32_t init_num_buckets;
247 
248     // Flag indicating whether memory map max possible file size for underlying
249     // FileBackedVector before growing the actual file size.
250     bool pre_mapping_fbv;
251   };
252 
253   static constexpr WorkingPathType kWorkingPathType =
254       WorkingPathType::kDirectory;
255   static constexpr std::string_view kFilePrefix = "persistent_hash_map";
256 
257   // Creates a new PersistentHashMap to read/write/delete key value pairs.
258   //
259   // filesystem: Object to make system level calls
260   // working_path: Specifies the working path for PersistentStorage.
261   //               PersistentHashMap uses working path as working directory and
262   //               all related files will be stored under this directory. It
263   //               takes full ownership and of working_path_, including
264   //               creation/deletion. It is the caller's responsibility to
265   //               specify correct working path and avoid mixing different
266   //               persistent storages together under the same path. Also the
267   //               caller has the ownership for the parent directory of
268   //               working_path_, and it is responsible for parent directory
269   //               creation/deletion. See PersistentStorage for more details
270   //               about the concept of working_path.
271   // options: Options instance.
272   //
273   // Returns:
274   //   INVALID_ARGUMENT_ERROR if any value in options is invalid.
275   //   FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
276   //                             checksum or any other inconsistency.
277   //   INTERNAL_ERROR on I/O errors.
278   //   Any FileBackedVector errors.
279   static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
280   Create(const Filesystem& filesystem, std::string working_path,
281          Options options);
282 
283   // Deletes PersistentHashMap under working_path.
284   //
285   // Returns:
286   //   - OK on success
287   //   - INTERNAL_ERROR on I/O error
Discard(const Filesystem & filesystem,std::string working_path)288   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
289                                             std::string working_path) {
290     return PersistentStorage::Discard(filesystem, working_path,
291                                       kWorkingPathType);
292   }
293 
294   ~PersistentHashMap() override;
295 
296   // Update a key value pair. If key does not exist, then insert (key, value)
297   // into the storage. Otherwise overwrite the value into the storage.
298   //
299   // REQUIRES: the buffer pointed to by value must be of value_size()
300   //
301   // Returns:
302   //   OK on success
303   //   RESOURCE_EXHAUSTED_ERROR if # of entries reach options_.max_num_entries
304   //   INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
305   //   INTERNAL_ERROR on I/O error or any data inconsistency
306   //   Any FileBackedVector errors
307   libtextclassifier3::Status Put(std::string_view key, const void* value);
308 
309   // If key does not exist, then insert (key, next_value) into the storage.
310   // Otherwise, copy the hash map value into next_value.
311   //
312   // REQUIRES: the buffer pointed to by next_value must be of value_size()
313   //
314   // Returns:
315   //   OK on success
316   //   INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
317   //   INTERNAL_ERROR on I/O error or any data inconsistency
318   //   Any FileBackedVector errors
319   libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value);
320 
321   // Get the value by key from the storage. If key exists, then copy the hash
322   // map value into into value buffer. Otherwise, return NOT_FOUND_ERROR.
323   //
324   // REQUIRES: the buffer pointed to by value must be of value_size()
325   //
326   // Returns:
327   //   OK on success
328   //   NOT_FOUND_ERROR if the key doesn't exist
329   //   INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
330   //   INTERNAL_ERROR on I/O error or any data inconsistency
331   //   Any FileBackedVector errors
332   libtextclassifier3::Status Get(std::string_view key, void* value) const;
333 
334   // Delete the key value pair from the storage. If key doesn't exist, then do
335   // nothing and return NOT_FOUND_ERROR.
336   //
337   // Returns:
338   //   OK on success
339   //   NOT_FOUND_ERROR if the key doesn't exist
340   //   INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
341   //   INTERNAL_ERROR on I/O error or any data inconsistency
342   //   Any FileBackedVector errors
343   libtextclassifier3::Status Delete(std::string_view key);
344 
GetIterator()345   Iterator GetIterator() const { return Iterator(this); }
346 
347   // Calculates and returns the disk usage (metadata + 3 storages total file
348   // size) in bytes.
349   //
350   // Returns:
351   //   Disk usage on success
352   //   INTERNAL_ERROR on I/O error
353   libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
354 
355   // Returns the total file size of the all the elements held in the persistent
356   // hash map. File size is in bytes. This excludes the size of any internal
357   // metadata, i.e. crcs/info of persistent hash map, file backed vector's
358   // header.
359   //
360   // Returns:
361   //   File size on success
362   //   INTERNAL_ERROR on I/O error
363   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
364 
size()365   int32_t size() const {
366     return entry_storage_->num_elements() - info().num_deleted_entries;
367   }
368 
empty()369   bool empty() const { return size() == 0; }
370 
num_buckets()371   int32_t num_buckets() const { return bucket_storage_->num_elements(); }
372 
373  private:
374   struct EntryIndexPair {
375     int32_t target_entry_index;
376     int32_t prev_entry_index;
377 
EntryIndexPairEntryIndexPair378     explicit EntryIndexPair(int32_t target_entry_index_in,
379                             int32_t prev_entry_index_in)
380         : target_entry_index(target_entry_index_in),
381           prev_entry_index(prev_entry_index_in) {}
382   };
383 
PersistentHashMap(const Filesystem & filesystem,std::string && working_path,Options && options,MemoryMappedFile && metadata_mmapped_file,std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,std::unique_ptr<FileBackedVector<Entry>> entry_storage,std::unique_ptr<FileBackedVector<char>> kv_storage)384   explicit PersistentHashMap(
385       const Filesystem& filesystem, std::string&& working_path,
386       Options&& options, MemoryMappedFile&& metadata_mmapped_file,
387       std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
388       std::unique_ptr<FileBackedVector<Entry>> entry_storage,
389       std::unique_ptr<FileBackedVector<char>> kv_storage)
390       : PersistentStorage(filesystem, std::move(working_path),
391                           kWorkingPathType),
392         options_(std::move(options)),
393         metadata_mmapped_file_(std::make_unique<MemoryMappedFile>(
394             std::move(metadata_mmapped_file))),
395         bucket_storage_(std::move(bucket_storage)),
396         entry_storage_(std::move(entry_storage)),
397         kv_storage_(std::move(kv_storage)),
398         is_info_dirty_(false),
399         is_storage_dirty_(false) {}
400 
401   static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
402   InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
403                      Options&& options);
404 
405   static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
406   InitializeExistingFiles(const Filesystem& filesystem,
407                           std::string&& working_path, Options&& options);
408 
409   libtextclassifier3::Status PersistStoragesToDisk() override;
410 
411   libtextclassifier3::Status PersistMetadataToDisk() override;
412 
WriteMetadata()413   libtextclassifier3::Status WriteMetadata() override {
414     // PersistentHashMap::Header is mmapped. Therefore, writes occur when the
415     // metadata is modified. So just return OK.
416     return libtextclassifier3::Status::OK;
417   }
418 
419   libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override;
420 
421   libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override;
422 
423   libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override;
424 
425   // Find the index of the target entry (that contains the key) from a bucket
426   // (specified by bucket index). Also return the previous entry index, since
427   // Delete() needs it to update the linked list and head entry index. The
428   // caller should specify the desired bucket index.
429   //
430   // Returns:
431   //   std::pair<int32_t, int32_t>: target entry index and previous entry index
432   //                                on success. If not found, then target entry
433   //                                index will be Entry::kInvalidIndex
434   //   INTERNAL_ERROR if any content inconsistency
435   //   Any FileBackedVector errors
436   libtextclassifier3::StatusOr<EntryIndexPair> FindEntryIndexByKey(
437       int32_t bucket_idx, std::string_view key) const;
438 
439   // Copy the hash map value of the entry into value buffer.
440   //
441   // REQUIRES: entry_idx should be valid.
442   // REQUIRES: the buffer pointed to by value must be of value_size()
443   //
444   // Returns:
445   //   OK on success
446   //   Any FileBackedVector errors
447   libtextclassifier3::Status CopyEntryValue(int32_t entry_idx,
448                                             void* value) const;
449 
450   // Insert a new key value pair into a bucket (specified by the bucket index).
451   // The caller should specify the desired bucket index and make sure that the
452   // key is not present in the hash map before calling.
453   //
454   // Returns:
455   //   OK on success
456   //   Any FileBackedVector errors
457   libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key,
458                                     const void* value);
459 
460   // Rehash function. If force_rehash is true or the hash map loading is greater
461   // than max_load_factor, then it will rehash all keys.
462   //
463   // Returns:
464   //   OK on success
465   //   INTERNAL_ERROR on I/O error or any data inconsistency
466   //   Any FileBackedVector errors
467   libtextclassifier3::Status RehashIfNecessary(bool force_rehash);
468 
crcs()469   Crcs& crcs() override {
470     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
471                                     kCrcsMetadataFileOffset);
472   }
473 
crcs()474   const Crcs& crcs() const override {
475     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
476                                           kCrcsMetadataFileOffset);
477   }
478 
info()479   Info& info() {
480     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
481                                     kInfoMetadataFileOffset);
482   }
483 
info()484   const Info& info() const {
485     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
486                                           kInfoMetadataFileOffset);
487   }
488 
SetInfoDirty()489   void SetInfoDirty() { is_info_dirty_ = true; }
490 
491   // When the storage is dirty, then the checksum in the info is invalid and
492   // must be recalculated. Therefore, also mark the info as dirty.
SetDirty()493   void SetDirty() {
494     SetInfoDirty();
495     is_storage_dirty_ = true;
496   }
497 
is_info_dirty()498   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()499   bool is_storage_dirty() const { return is_storage_dirty_; }
500 
501   Options options_;
502 
503   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
504 
505   // Storages
506   std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_;
507   std::unique_ptr<FileBackedVector<Entry>> entry_storage_;
508   std::unique_ptr<FileBackedVector<char>> kv_storage_;
509 
510   bool is_info_dirty_;
511   bool is_storage_dirty_;
512 };
513 
514 }  // namespace lib
515 }  // namespace icing
516 
517 #endif  // ICING_FILE_PERSISTENT_HASH_MAP_H_
518