xref: /aosp_15_r20/external/icing/icing/file/persistent-hash-map.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/file/persistent-hash-map.h"
16 
17 #include <cstdint>
18 #include <cstring>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/str_cat.h"
28 #include "icing/file/file-backed-vector.h"
29 #include "icing/file/memory-mapped-file.h"
30 #include "icing/file/persistent-storage.h"
31 #include "icing/util/crc32.h"
32 #include "icing/util/status-macros.h"
33 
34 namespace icing {
35 namespace lib {
36 
37 namespace {
38 
39 // Helper function to check if there is no termination character '\0' in the
40 // key.
ValidateKey(std::string_view key)41 libtextclassifier3::Status ValidateKey(std::string_view key) {
42   if (key.find('\0') != std::string_view::npos) {  // NOLINT
43     return absl_ports::InvalidArgumentError(
44         "Key cannot contain termination character '\\0'");
45   }
46   return libtextclassifier3::Status::OK;
47 }
48 
49 // Helper function to convert the key to bucket index by hash.
50 //
51 // Returns:
52 //   int32_t: A valid bucket index with range [0, num_buckets - 1].
53 //   INTERNAL_ERROR if num_buckets == 0
HashKeyToBucketIndex(std::string_view key,int32_t num_buckets)54 libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
55     std::string_view key, int32_t num_buckets) {
56   if (num_buckets == 0) {
57     return absl_ports::InternalError("Should not have empty bucket");
58   }
59   return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
60 }
61 
62 // The following 4 methods are helper functions to get the correct path of
63 // metadata/bucket/entry/key-value storages, according to the given working
64 // directory path.
GetMetadataFilePath(std::string_view working_path)65 std::string GetMetadataFilePath(std::string_view working_path) {
66   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
67                             ".m");
68 }
69 
GetBucketStorageFilePath(std::string_view working_path)70 std::string GetBucketStorageFilePath(std::string_view working_path) {
71   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
72                             ".b");
73 }
74 
GetEntryStorageFilePath(std::string_view working_path)75 std::string GetEntryStorageFilePath(std::string_view working_path) {
76   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
77                             ".e");
78 }
79 
GetKeyValueStorageFilePath(std::string_view working_path)80 std::string GetKeyValueStorageFilePath(std::string_view working_path) {
81   return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
82                             ".k");
83 }
84 
85 // Calculates how many buckets we need given num_entries and
86 // max_load_factor_percent. Round it up to 2's power.
87 //
88 // REQUIRES: 0 < num_entries <= Entry::kMaxNumEntries &&
89 //           max_load_factor_percent > 0
CalculateNumBucketsRequired(int32_t num_entries,int32_t max_load_factor_percent)90 int32_t CalculateNumBucketsRequired(int32_t num_entries,
91                                     int32_t max_load_factor_percent) {
92   // Calculate ceil(num_entries * 100 / max_load_factor_percent)
93   int32_t num_entries_100 = num_entries * 100;
94   int32_t num_buckets_required =
95       num_entries_100 / max_load_factor_percent +
96       (num_entries_100 % max_load_factor_percent == 0 ? 0 : 1);
97   if ((num_buckets_required & (num_buckets_required - 1)) != 0) {
98     // not 2's power
99     return 1 << (32 - __builtin_clz(num_buckets_required));
100   }
101   return num_buckets_required;
102 }
103 
104 }  // namespace
105 
IsValid() const106 bool PersistentHashMap::Options::IsValid() const {
107   if (!(value_type_size > 0 && value_type_size <= kMaxValueTypeSize &&
108         max_num_entries > 0 && max_num_entries <= Entry::kMaxNumEntries &&
109         max_load_factor_percent > 0 && average_kv_byte_size > 0 &&
110         init_num_buckets > 0 && init_num_buckets <= Bucket::kMaxNumBuckets)) {
111     return false;
112   }
113 
114   // We've ensured (static_assert) that storing kMaxNumBuckets buckets won't
115   // exceed FileBackedVector::kMaxFileSize, so only need to verify # of buckets
116   // required won't exceed kMaxNumBuckets.
117   if (CalculateNumBucketsRequired(max_num_entries, max_load_factor_percent) >
118       Bucket::kMaxNumBuckets) {
119     return false;
120   }
121 
122   // Verify # of key value pairs can fit into kv_storage.
123   if (average_kv_byte_size > kMaxKVTotalByteSize / max_num_entries) {
124     return false;
125   }
126 
127   // Verify init_num_buckets is 2's power. Requiring init_num_buckets to be 2^n
128   // guarantees that num_buckets will eventually grow to be exactly
129   // max_num_buckets since CalculateNumBucketsRequired rounds it up to 2^n.
130   if ((init_num_buckets & (init_num_buckets - 1)) != 0) {
131     return false;
132   }
133 
134   return true;
135 }
136 
137 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
Create(const Filesystem & filesystem,std::string working_path,Options options)138 PersistentHashMap::Create(const Filesystem& filesystem,
139                           std::string working_path, Options options) {
140   if (!options.IsValid()) {
141     return absl_ports::InvalidArgumentError(
142         "Invalid PersistentHashMap options");
143   }
144 
145   if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
146       !filesystem.FileExists(GetBucketStorageFilePath(working_path).c_str()) ||
147       !filesystem.FileExists(GetEntryStorageFilePath(working_path).c_str()) ||
148       !filesystem.FileExists(
149           GetKeyValueStorageFilePath(working_path).c_str())) {
150     // Discard working_path if any of them is missing, and reinitialize.
151     if (filesystem.DirectoryExists(working_path.c_str())) {
152       ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
153     }
154     return InitializeNewFiles(filesystem, std::move(working_path),
155                               std::move(options));
156   }
157   return InitializeExistingFiles(filesystem, std::move(working_path),
158                                  std::move(options));
159 }
160 
~PersistentHashMap()161 PersistentHashMap::~PersistentHashMap() {
162   if (!PersistToDisk().ok()) {
163     ICING_LOG(WARNING)
164         << "Failed to persist hash map to disk while destructing "
165         << working_path_;
166   }
167 }
168 
Put(std::string_view key,const void * value)169 libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
170                                                   const void* value) {
171   SetDirty();
172 
173   ICING_RETURN_IF_ERROR(ValidateKey(key));
174   ICING_ASSIGN_OR_RETURN(
175       int32_t bucket_idx,
176       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
177 
178   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
179                          FindEntryIndexByKey(bucket_idx, key));
180   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
181     // If not found, then insert new key value pair.
182     return Insert(bucket_idx, key, value);
183   }
184 
185   // Otherwise, overwrite the value.
186   ICING_ASSIGN_OR_RETURN(const Entry* entry,
187                          entry_storage_->Get(idx_pair.target_entry_index));
188 
189   int32_t kv_len = key.length() + 1 + info().value_type_size;
190   int32_t value_offset = key.length() + 1;
191   ICING_ASSIGN_OR_RETURN(
192       typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
193       kv_storage_->GetMutable(entry->key_value_index(), kv_len));
194   // It is the same key and value_size is fixed, so we can directly overwrite
195   // serialized value.
196   mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
197                           info().value_type_size);
198 
199   return libtextclassifier3::Status::OK;
200 }
201 
GetOrPut(std::string_view key,void * next_value)202 libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
203                                                        void* next_value) {
204   ICING_RETURN_IF_ERROR(ValidateKey(key));
205   ICING_ASSIGN_OR_RETURN(
206       int32_t bucket_idx,
207       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
208 
209   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
210                          FindEntryIndexByKey(bucket_idx, key));
211   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
212     // If not found, then insert new key value pair.
213     SetDirty();
214     return Insert(bucket_idx, key, next_value);
215   }
216 
217   // Otherwise, copy the hash map value into next_value.
218   return CopyEntryValue(idx_pair.target_entry_index, next_value);
219 }
220 
Get(std::string_view key,void * value) const221 libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
222                                                   void* value) const {
223   ICING_RETURN_IF_ERROR(ValidateKey(key));
224   ICING_ASSIGN_OR_RETURN(
225       int32_t bucket_idx,
226       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
227 
228   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
229                          FindEntryIndexByKey(bucket_idx, key));
230   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
231     return absl_ports::NotFoundError(absl_ports::StrCat(
232         "Key not found in PersistentHashMap ", working_path_));
233   }
234 
235   return CopyEntryValue(idx_pair.target_entry_index, value);
236 }
237 
Delete(std::string_view key)238 libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) {
239   SetDirty();
240 
241   ICING_RETURN_IF_ERROR(ValidateKey(key));
242   ICING_ASSIGN_OR_RETURN(
243       int32_t bucket_idx,
244       HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
245 
246   ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
247                          FindEntryIndexByKey(bucket_idx, key));
248   if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
249     return absl_ports::NotFoundError(absl_ports::StrCat(
250         "Key not found in PersistentHashMap ", working_path_));
251   }
252 
253   ICING_ASSIGN_OR_RETURN(
254       typename FileBackedVector<Entry>::MutableView mutable_target_entry,
255       entry_storage_->GetMutable(idx_pair.target_entry_index));
256   if (idx_pair.prev_entry_index == Entry::kInvalidIndex) {
257     // If prev_entry_idx is Entry::kInvalidIndex, then target_entry must be the
258     // head element of the entry linked list, and we have to update
259     // bucket->head_entry_index_.
260     //
261     // Before: target_entry (head) -> next_entry -> ...
262     // After: next_entry (head) -> ...
263     ICING_ASSIGN_OR_RETURN(
264         typename FileBackedVector<Bucket>::MutableView mutable_bucket,
265         bucket_storage_->GetMutable(bucket_idx));
266     if (mutable_bucket.Get().head_entry_index() !=
267         idx_pair.target_entry_index) {
268       return absl_ports::InternalError(
269           "Bucket head entry index is inconsistent with the actual entry linked"
270           "list head. This shouldn't happen");
271     }
272     mutable_bucket.Get().set_head_entry_index(
273         mutable_target_entry.Get().next_entry_index());
274   } else {
275     // Otherwise, connect prev_entry and next_entry, to remove target_entry from
276     // the entry linked list.
277     //
278     // Before: ... -> prev_entry -> target_entry -> next_entry -> ...
279     // After: ... -> prev_entry -> next_entry -> ...
280     ICING_ASSIGN_OR_RETURN(
281         typename FileBackedVector<Entry>::MutableView mutable_prev_entry,
282         entry_storage_->GetMutable(idx_pair.prev_entry_index));
283     mutable_prev_entry.Get().set_next_entry_index(
284         mutable_target_entry.Get().next_entry_index());
285   }
286 
287   // Zero out the key value bytes. It is necessary for iterator to iterate
288   // through kv_storage and handle deleted keys properly.
289   int32_t kv_len = key.length() + 1 + info().value_type_size;
290   ICING_RETURN_IF_ERROR(kv_storage_->Set(
291       mutable_target_entry.Get().key_value_index(), kv_len, '\0'));
292 
293   // Invalidate target_entry
294   mutable_target_entry.Get().set_key_value_index(kInvalidKVIndex);
295   mutable_target_entry.Get().set_next_entry_index(Entry::kInvalidIndex);
296 
297   ++(info().num_deleted_entries);
298 
299   return libtextclassifier3::Status::OK;
300 }
301 
GetDiskUsage() const302 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
303   ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
304                          bucket_storage_->GetDiskUsage());
305   ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
306                          entry_storage_->GetDiskUsage());
307   ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
308                          kv_storage_->GetDiskUsage());
309 
310   int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
311                   kv_storage_disk_usage;
312   Filesystem::IncrementByOrSetInvalid(
313       filesystem_.GetDiskUsage(GetMetadataFilePath(working_path_).c_str()),
314       &total);
315 
316   if (total < 0 || total == Filesystem::kBadFileSize) {
317     return absl_ports::InternalError(
318         "Failed to get disk usage of PersistentHashMap");
319   }
320   return total;
321 }
322 
GetElementsSize() const323 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
324     const {
325   ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
326                          bucket_storage_->GetElementsFileSize());
327   ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
328                          entry_storage_->GetElementsFileSize());
329   ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
330                          kv_storage_->GetElementsFileSize());
331   return bucket_storage_elements_size + entry_storage_elements_size +
332          kv_storage_elements_size;
333 }
334 
335 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeNewFiles(const Filesystem & filesystem,std::string && working_path,Options && options)336 PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
337                                       std::string&& working_path,
338                                       Options&& options) {
339   // PersistentHashMap uses working_path as working directory path.
340   // Create working directory.
341   if (!filesystem.CreateDirectory(working_path.c_str())) {
342     return absl_ports::InternalError(
343         absl_ports::StrCat("Failed to create directory: ", working_path));
344   }
345 
346   int32_t max_num_buckets_required =
347       std::max(options.init_num_buckets,
348                CalculateNumBucketsRequired(options.max_num_entries,
349                                            options.max_load_factor_percent));
350 
351   // Initialize bucket_storage
352   int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
353   int32_t max_file_size =
354       pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
355   ICING_ASSIGN_OR_RETURN(
356       std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
357       FileBackedVector<Bucket>::Create(
358           filesystem, GetBucketStorageFilePath(working_path),
359           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
360           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
361 
362   // Initialize entry_storage
363   pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
364   max_file_size =
365       pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
366   ICING_ASSIGN_OR_RETURN(
367       std::unique_ptr<FileBackedVector<Entry>> entry_storage,
368       FileBackedVector<Entry>::Create(
369           filesystem, GetEntryStorageFilePath(working_path),
370           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
371           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
372 
373   // Initialize kv_storage
374   pre_mapping_mmap_size =
375       options.average_kv_byte_size * options.max_num_entries;
376   max_file_size =
377       pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
378   ICING_ASSIGN_OR_RETURN(
379       std::unique_ptr<FileBackedVector<char>> kv_storage,
380       FileBackedVector<char>::Create(
381           filesystem, GetKeyValueStorageFilePath(working_path),
382           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
383           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
384 
385   // Initialize buckets.
386   ICING_RETURN_IF_ERROR(bucket_storage->Set(
387       /*idx=*/0, /*len=*/options.init_num_buckets, Bucket()));
388   ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
389 
390   // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
391   // call GrowAndRemapIfNecessary to grow the underlying file.
392   ICING_ASSIGN_OR_RETURN(
393       MemoryMappedFile metadata_mmapped_file,
394       MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
395                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
396                                /*max_file_size=*/kMetadataFileSize,
397                                /*pre_mapping_file_offset=*/0,
398                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
399   ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
400       /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
401 
402   // Create instance.
403   auto new_persistent_hash_map =
404       std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
405           filesystem, std::move(working_path), std::move(options),
406           std::move(metadata_mmapped_file), std::move(bucket_storage),
407           std::move(entry_storage), std::move(kv_storage)));
408   // Initialize info content by writing mapped memory directly.
409   Info& info_ref = new_persistent_hash_map->info();
410   info_ref.magic = Info::kMagic;
411   info_ref.value_type_size = new_persistent_hash_map->options_.value_type_size;
412   info_ref.max_load_factor_percent =
413       new_persistent_hash_map->options_.max_load_factor_percent;
414   info_ref.num_deleted_entries = 0;
415   info_ref.num_deleted_key_value_bytes = 0;
416 
417   // Initialize new PersistentStorage. The initial checksums will be computed
418   // and set via InitializeNewStorage.
419   ICING_RETURN_IF_ERROR(new_persistent_hash_map->InitializeNewStorage());
420 
421   return new_persistent_hash_map;
422 }
423 
424 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeExistingFiles(const Filesystem & filesystem,std::string && working_path,Options && options)425 PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
426                                            std::string&& working_path,
427                                            Options&& options) {
428   // Initialize metadata file
429   ICING_ASSIGN_OR_RETURN(
430       MemoryMappedFile metadata_mmapped_file,
431       MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
432                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
433                                /*max_file_size=*/kMetadataFileSize,
434                                /*pre_mapping_file_offset=*/0,
435                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
436   if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
437     return absl_ports::FailedPreconditionError("Incorrect metadata file size");
438   }
439 
440   int32_t max_num_buckets_required = CalculateNumBucketsRequired(
441       options.max_num_entries, options.max_load_factor_percent);
442 
443   // Initialize bucket_storage
444   int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
445   int32_t max_file_size =
446       pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
447   ICING_ASSIGN_OR_RETURN(
448       std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
449       FileBackedVector<Bucket>::Create(
450           filesystem, GetBucketStorageFilePath(working_path),
451           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
452           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
453 
454   // Initialize entry_storage
455   pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
456   max_file_size =
457       pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
458   ICING_ASSIGN_OR_RETURN(
459       std::unique_ptr<FileBackedVector<Entry>> entry_storage,
460       FileBackedVector<Entry>::Create(
461           filesystem, GetEntryStorageFilePath(working_path),
462           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
463           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
464 
465   // Initialize kv_storage
466   pre_mapping_mmap_size =
467       options.average_kv_byte_size * options.max_num_entries;
468   max_file_size =
469       pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
470   ICING_ASSIGN_OR_RETURN(
471       std::unique_ptr<FileBackedVector<char>> kv_storage,
472       FileBackedVector<char>::Create(
473           filesystem, GetKeyValueStorageFilePath(working_path),
474           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
475           options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
476 
477   // Create instance.
478   auto persistent_hash_map =
479       std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
480           filesystem, std::move(working_path), std::move(options),
481           std::move(metadata_mmapped_file), std::move(bucket_storage),
482           std::move(entry_storage), std::move(kv_storage)));
483 
484   // Initialize existing PersistentStorage. Checksums will be validated.
485   ICING_RETURN_IF_ERROR(persistent_hash_map->InitializeExistingStorage());
486 
487   // Validate other values of info and options.
488   // Current # of entries should not exceed options_.max_num_entries
489   // We compute max_file_size of 3 storages by options_.max_num_entries. Since
490   // we won't recycle space of deleted entries (and key-value bytes), they're
491   // still occupying space in storages. Even if # of "active" entries doesn't
492   // exceed options_.max_num_entries, the new kvp to be inserted still
493   // potentially exceeds max_file_size.
494   // Therefore, we should use entry_storage_->num_elements() instead of # of
495   // "active" entries
496   // (i.e. entry_storage_->num_elements() - info_ptr->num_deleted_entries) to
497   // check. This feature avoids storages being grown extremely large when there
498   // are many Delete() and Put() operations.
499   if (persistent_hash_map->entry_storage_->num_elements() >
500       persistent_hash_map->options_.max_num_entries) {
501     return absl_ports::FailedPreconditionError(
502         "Current # of entries exceeds max num entries");
503   }
504 
505   // Magic should be the same.
506   if (persistent_hash_map->info().magic != Info::kMagic) {
507     return absl_ports::FailedPreconditionError(
508         "PersistentHashMap header magic mismatch");
509   }
510 
511   // Value type size should be consistent.
512   if (persistent_hash_map->options_.value_type_size !=
513       persistent_hash_map->info().value_type_size) {
514     return absl_ports::FailedPreconditionError("Incorrect value type size");
515   }
516 
517   // Allow max_load_factor_percent_ change.
518   if (persistent_hash_map->options_.max_load_factor_percent !=
519       persistent_hash_map->info().max_load_factor_percent) {
520     ICING_VLOG(2) << "Changing max_load_factor_percent from "
521                   << persistent_hash_map->info().max_load_factor_percent
522                   << " to "
523                   << persistent_hash_map->options_.max_load_factor_percent;
524 
525     persistent_hash_map->SetInfoDirty();
526     persistent_hash_map->info().max_load_factor_percent =
527         persistent_hash_map->options_.max_load_factor_percent;
528     ICING_RETURN_IF_ERROR(
529         persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false));
530 
531     ICING_RETURN_IF_ERROR(persistent_hash_map->PersistToDisk());
532   }
533 
534   return persistent_hash_map;
535 }
536 
PersistStoragesToDisk()537 libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk() {
538   if (is_initialized_ && !is_storage_dirty()) {
539     return libtextclassifier3::Status::OK;
540   }
541 
542   ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
543   ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
544   ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
545   is_storage_dirty_ = false;
546   return libtextclassifier3::Status::OK;
547 }
548 
PersistMetadataToDisk()549 libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk() {
550   // We can skip persisting metadata to disk only if both info and storage are
551   // clean.
552   if (is_initialized_ && !is_info_dirty() && !is_storage_dirty()) {
553     return libtextclassifier3::Status::OK;
554   }
555 
556   // Changes should have been applied to the underlying file when using
557   // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
558   // extra safety step to ensure they are written out.
559   ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
560   is_info_dirty_ = false;
561   return libtextclassifier3::Status::OK;
562 }
563 
564 libtextclassifier3::StatusOr<Crc32>
UpdateStoragesChecksum()565 PersistentHashMap::UpdateStoragesChecksum() {
566   if (is_initialized_ && !is_storage_dirty()) {
567     return Crc32(crcs().component_crcs.storages_crc);
568   }
569 
570   // Compute crcs
571   ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
572                          bucket_storage_->UpdateChecksum());
573   ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
574                          entry_storage_->UpdateChecksum());
575   ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage_->UpdateChecksum());
576   return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
577                kv_storage_crc.Get());
578 }
579 
GetInfoChecksum() const580 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::GetInfoChecksum() const {
581   if (is_initialized_ && !is_info_dirty()) {
582     return Crc32(crcs().component_crcs.info_crc);
583   }
584   return info().GetChecksum();
585 }
586 
GetStoragesChecksum() const587 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::GetStoragesChecksum()
588     const {
589   if (is_initialized_ && !is_storage_dirty()) {
590     return Crc32(crcs().component_crcs.storages_crc);
591   }
592 
593   // Compute crcs
594   Crc32 bucket_storage_crc = bucket_storage_->GetChecksum();
595   Crc32 entry_storage_crc = entry_storage_->GetChecksum();
596   Crc32 kv_storage_crc = kv_storage_->GetChecksum();
597   return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
598                kv_storage_crc.Get());
599 }
600 
601 libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair>
FindEntryIndexByKey(int32_t bucket_idx,std::string_view key) const602 PersistentHashMap::FindEntryIndexByKey(int32_t bucket_idx,
603                                        std::string_view key) const {
604   // Iterate all entries in the bucket, compare with key, and return the entry
605   // index if exists.
606   ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
607                          bucket_storage_->Get(bucket_idx));
608 
609   int32_t prev_entry_idx = Entry::kInvalidIndex;
610   int32_t curr_entry_idx = bucket->head_entry_index();
611   while (curr_entry_idx != Entry::kInvalidIndex) {
612     ICING_ASSIGN_OR_RETURN(const Entry* entry,
613                            entry_storage_->Get(curr_entry_idx));
614     if (entry->key_value_index() == kInvalidKVIndex) {
615       ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
616                           "hash map bucket. This shouldn't happen";
617       return absl_ports::InternalError("Unexpected invalid key value index");
618     }
619     ICING_ASSIGN_OR_RETURN(const char* kv_arr,
620                            kv_storage_->Get(entry->key_value_index()));
621     if (key.compare(kv_arr) == 0) {
622       return EntryIndexPair(curr_entry_idx, prev_entry_idx);
623     }
624 
625     prev_entry_idx = curr_entry_idx;
626     curr_entry_idx = entry->next_entry_index();
627   }
628 
629   return EntryIndexPair(curr_entry_idx, prev_entry_idx);
630 }
631 
CopyEntryValue(int32_t entry_idx,void * value) const632 libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
633     int32_t entry_idx, void* value) const {
634   ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
635 
636   ICING_ASSIGN_OR_RETURN(const char* kv_arr,
637                          kv_storage_->Get(entry->key_value_index()));
638   int32_t value_offset = strlen(kv_arr) + 1;
639   memcpy(value, kv_arr + value_offset, info().value_type_size);
640 
641   return libtextclassifier3::Status::OK;
642 }
643 
Insert(int32_t bucket_idx,std::string_view key,const void * value)644 libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
645                                                      std::string_view key,
646                                                      const void* value) {
647   SetDirty();
648 
649   // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries,
650   // then return error.
651   // We compute max_file_size of 3 storages by options_.max_num_entries. Since
652   // we won't recycle space of deleted entries (and key-value bytes), they're
653   // still occupying space in storages. Even if # of "active" entries (i.e.
654   // size()) doesn't exceed options_.max_num_entries, the new kvp to be inserted
655   // still potentially exceeds max_file_size.
656   // Therefore, we should use entry_storage_->num_elements() instead of size()
657   // to check. This feature avoids storages being grown extremely large when
658   // there are many Delete() and Put() operations.
659   if (entry_storage_->num_elements() > options_.max_num_entries - 1) {
660     return absl_ports::ResourceExhaustedError("Cannot insert new entry");
661   }
662 
663   ICING_ASSIGN_OR_RETURN(
664       typename FileBackedVector<Bucket>::MutableView mutable_bucket,
665       bucket_storage_->GetMutable(bucket_idx));
666 
667   // Append new key value.
668   int32_t new_kv_idx = kv_storage_->num_elements();
669   int32_t kv_len = key.size() + 1 + info().value_type_size;
670   int32_t value_offset = key.size() + 1;
671   ICING_ASSIGN_OR_RETURN(
672       typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
673       kv_storage_->Allocate(kv_len));
674   mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
675   mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
676   mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
677                               reinterpret_cast<const char*>(value),
678                               info().value_type_size);
679 
680   // Append new entry.
681   int32_t new_entry_idx = entry_storage_->num_elements();
682   ICING_RETURN_IF_ERROR(entry_storage_->Append(
683       Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
684   mutable_bucket.Get().set_head_entry_index(new_entry_idx);
685 
686   return RehashIfNecessary(/*force_rehash=*/false);
687 }
688 
RehashIfNecessary(bool force_rehash)689 libtextclassifier3::Status PersistentHashMap::RehashIfNecessary(
690     bool force_rehash) {
691   int32_t new_num_bucket = bucket_storage_->num_elements();
692   while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 &&
693          size() > static_cast<int64_t>(new_num_bucket) *
694                       info().max_load_factor_percent / 100) {
695     new_num_bucket *= 2;
696   }
697 
698   if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) {
699     return libtextclassifier3::Status::OK;
700   }
701 
702   SetDirty();
703 
704   // Resize and reset buckets.
705   ICING_RETURN_IF_ERROR(
706       bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex)));
707 
708   // Iterate all key value pairs in kv_storage, rehash and insert.
709   Iterator iter = GetIterator();
710   int32_t entry_idx = 0;
711   while (iter.Advance()) {
712     ICING_ASSIGN_OR_RETURN(int32_t bucket_idx,
713                            HashKeyToBucketIndex(iter.GetKey(), new_num_bucket));
714     ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
715                            bucket_storage_->GetMutable(bucket_idx));
716 
717     // Update entry and bucket.
718     ICING_RETURN_IF_ERROR(entry_storage_->Set(
719         entry_idx,
720         Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index())));
721     mutable_bucket.Get().set_head_entry_index(entry_idx);
722 
723     ++entry_idx;
724   }
725 
726   // Since there will be some deleted entries, after rehashing entry_storage_
727   // # of vector elements may be greater than the actual # of entries.
728   // Therefore, we have to truncate entry_storage_ to the correct size.
729   if (entry_idx < entry_storage_->num_elements()) {
730     ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx));
731   }
732 
733   info().num_deleted_entries = 0;
734 
735   return libtextclassifier3::Status::OK;
736 }
737 
Advance()738 bool PersistentHashMap::Iterator::Advance() {
739   // Jump over the current key value pair before advancing to the next valid
740   // key value pair. In the first round (after construction), curr_key_len_
741   // is 0, so don't jump over anything.
742   if (curr_key_len_ != 0) {
743     curr_kv_idx_ += curr_key_len_ + 1 + map_->info().value_type_size;
744     curr_key_len_ = 0;
745   }
746 
747   // By skipping null chars, we will be automatically handling deleted entries
748   // (which are zeroed out during deletion).
749   for (const char* curr_kv_ptr = map_->kv_storage_->array() + curr_kv_idx_;
750        curr_kv_idx_ < map_->kv_storage_->num_elements();
751        ++curr_kv_ptr, ++curr_kv_idx_) {
752     if (*curr_kv_ptr != '\0') {
753       curr_key_len_ = strlen(curr_kv_ptr);
754       return true;
755     }
756   }
757   return false;
758 }
759 
760 }  // namespace lib
761 }  // namespace icing
762