1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/file/persistent-hash-map.h"
16
17 #include <cstdint>
18 #include <cstring>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/str_cat.h"
28 #include "icing/file/file-backed-vector.h"
29 #include "icing/file/memory-mapped-file.h"
30 #include "icing/file/persistent-storage.h"
31 #include "icing/util/crc32.h"
32 #include "icing/util/status-macros.h"
33
34 namespace icing {
35 namespace lib {
36
37 namespace {
38
39 // Helper function to check if there is no termination character '\0' in the
40 // key.
ValidateKey(std::string_view key)41 libtextclassifier3::Status ValidateKey(std::string_view key) {
42 if (key.find('\0') != std::string_view::npos) { // NOLINT
43 return absl_ports::InvalidArgumentError(
44 "Key cannot contain termination character '\\0'");
45 }
46 return libtextclassifier3::Status::OK;
47 }
48
49 // Helper function to convert the key to bucket index by hash.
50 //
51 // Returns:
52 // int32_t: A valid bucket index with range [0, num_buckets - 1].
53 // INTERNAL_ERROR if num_buckets == 0
HashKeyToBucketIndex(std::string_view key,int32_t num_buckets)54 libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
55 std::string_view key, int32_t num_buckets) {
56 if (num_buckets == 0) {
57 return absl_ports::InternalError("Should not have empty bucket");
58 }
59 return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
60 }
61
62 // The following 4 methods are helper functions to get the correct path of
63 // metadata/bucket/entry/key-value storages, according to the given working
64 // directory path.
GetMetadataFilePath(std::string_view working_path)65 std::string GetMetadataFilePath(std::string_view working_path) {
66 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
67 ".m");
68 }
69
GetBucketStorageFilePath(std::string_view working_path)70 std::string GetBucketStorageFilePath(std::string_view working_path) {
71 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
72 ".b");
73 }
74
GetEntryStorageFilePath(std::string_view working_path)75 std::string GetEntryStorageFilePath(std::string_view working_path) {
76 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
77 ".e");
78 }
79
GetKeyValueStorageFilePath(std::string_view working_path)80 std::string GetKeyValueStorageFilePath(std::string_view working_path) {
81 return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
82 ".k");
83 }
84
85 // Calculates how many buckets we need given num_entries and
86 // max_load_factor_percent. Round it up to 2's power.
87 //
88 // REQUIRES: 0 < num_entries <= Entry::kMaxNumEntries &&
89 // max_load_factor_percent > 0
CalculateNumBucketsRequired(int32_t num_entries,int32_t max_load_factor_percent)90 int32_t CalculateNumBucketsRequired(int32_t num_entries,
91 int32_t max_load_factor_percent) {
92 // Calculate ceil(num_entries * 100 / max_load_factor_percent)
93 int32_t num_entries_100 = num_entries * 100;
94 int32_t num_buckets_required =
95 num_entries_100 / max_load_factor_percent +
96 (num_entries_100 % max_load_factor_percent == 0 ? 0 : 1);
97 if ((num_buckets_required & (num_buckets_required - 1)) != 0) {
98 // not 2's power
99 return 1 << (32 - __builtin_clz(num_buckets_required));
100 }
101 return num_buckets_required;
102 }
103
104 } // namespace
105
IsValid() const106 bool PersistentHashMap::Options::IsValid() const {
107 if (!(value_type_size > 0 && value_type_size <= kMaxValueTypeSize &&
108 max_num_entries > 0 && max_num_entries <= Entry::kMaxNumEntries &&
109 max_load_factor_percent > 0 && average_kv_byte_size > 0 &&
110 init_num_buckets > 0 && init_num_buckets <= Bucket::kMaxNumBuckets)) {
111 return false;
112 }
113
114 // We've ensured (static_assert) that storing kMaxNumBuckets buckets won't
115 // exceed FileBackedVector::kMaxFileSize, so only need to verify # of buckets
116 // required won't exceed kMaxNumBuckets.
117 if (CalculateNumBucketsRequired(max_num_entries, max_load_factor_percent) >
118 Bucket::kMaxNumBuckets) {
119 return false;
120 }
121
122 // Verify # of key value pairs can fit into kv_storage.
123 if (average_kv_byte_size > kMaxKVTotalByteSize / max_num_entries) {
124 return false;
125 }
126
127 // Verify init_num_buckets is 2's power. Requiring init_num_buckets to be 2^n
128 // guarantees that num_buckets will eventually grow to be exactly
129 // max_num_buckets since CalculateNumBucketsRequired rounds it up to 2^n.
130 if ((init_num_buckets & (init_num_buckets - 1)) != 0) {
131 return false;
132 }
133
134 return true;
135 }
136
137 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
Create(const Filesystem & filesystem,std::string working_path,Options options)138 PersistentHashMap::Create(const Filesystem& filesystem,
139 std::string working_path, Options options) {
140 if (!options.IsValid()) {
141 return absl_ports::InvalidArgumentError(
142 "Invalid PersistentHashMap options");
143 }
144
145 if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
146 !filesystem.FileExists(GetBucketStorageFilePath(working_path).c_str()) ||
147 !filesystem.FileExists(GetEntryStorageFilePath(working_path).c_str()) ||
148 !filesystem.FileExists(
149 GetKeyValueStorageFilePath(working_path).c_str())) {
150 // Discard working_path if any of them is missing, and reinitialize.
151 if (filesystem.DirectoryExists(working_path.c_str())) {
152 ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
153 }
154 return InitializeNewFiles(filesystem, std::move(working_path),
155 std::move(options));
156 }
157 return InitializeExistingFiles(filesystem, std::move(working_path),
158 std::move(options));
159 }
160
~PersistentHashMap()161 PersistentHashMap::~PersistentHashMap() {
162 if (!PersistToDisk().ok()) {
163 ICING_LOG(WARNING)
164 << "Failed to persist hash map to disk while destructing "
165 << working_path_;
166 }
167 }
168
Put(std::string_view key,const void * value)169 libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
170 const void* value) {
171 SetDirty();
172
173 ICING_RETURN_IF_ERROR(ValidateKey(key));
174 ICING_ASSIGN_OR_RETURN(
175 int32_t bucket_idx,
176 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
177
178 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
179 FindEntryIndexByKey(bucket_idx, key));
180 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
181 // If not found, then insert new key value pair.
182 return Insert(bucket_idx, key, value);
183 }
184
185 // Otherwise, overwrite the value.
186 ICING_ASSIGN_OR_RETURN(const Entry* entry,
187 entry_storage_->Get(idx_pair.target_entry_index));
188
189 int32_t kv_len = key.length() + 1 + info().value_type_size;
190 int32_t value_offset = key.length() + 1;
191 ICING_ASSIGN_OR_RETURN(
192 typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
193 kv_storage_->GetMutable(entry->key_value_index(), kv_len));
194 // It is the same key and value_size is fixed, so we can directly overwrite
195 // serialized value.
196 mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
197 info().value_type_size);
198
199 return libtextclassifier3::Status::OK;
200 }
201
GetOrPut(std::string_view key,void * next_value)202 libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
203 void* next_value) {
204 ICING_RETURN_IF_ERROR(ValidateKey(key));
205 ICING_ASSIGN_OR_RETURN(
206 int32_t bucket_idx,
207 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
208
209 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
210 FindEntryIndexByKey(bucket_idx, key));
211 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
212 // If not found, then insert new key value pair.
213 SetDirty();
214 return Insert(bucket_idx, key, next_value);
215 }
216
217 // Otherwise, copy the hash map value into next_value.
218 return CopyEntryValue(idx_pair.target_entry_index, next_value);
219 }
220
Get(std::string_view key,void * value) const221 libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
222 void* value) const {
223 ICING_RETURN_IF_ERROR(ValidateKey(key));
224 ICING_ASSIGN_OR_RETURN(
225 int32_t bucket_idx,
226 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
227
228 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
229 FindEntryIndexByKey(bucket_idx, key));
230 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
231 return absl_ports::NotFoundError(absl_ports::StrCat(
232 "Key not found in PersistentHashMap ", working_path_));
233 }
234
235 return CopyEntryValue(idx_pair.target_entry_index, value);
236 }
237
Delete(std::string_view key)238 libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) {
239 SetDirty();
240
241 ICING_RETURN_IF_ERROR(ValidateKey(key));
242 ICING_ASSIGN_OR_RETURN(
243 int32_t bucket_idx,
244 HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
245
246 ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
247 FindEntryIndexByKey(bucket_idx, key));
248 if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
249 return absl_ports::NotFoundError(absl_ports::StrCat(
250 "Key not found in PersistentHashMap ", working_path_));
251 }
252
253 ICING_ASSIGN_OR_RETURN(
254 typename FileBackedVector<Entry>::MutableView mutable_target_entry,
255 entry_storage_->GetMutable(idx_pair.target_entry_index));
256 if (idx_pair.prev_entry_index == Entry::kInvalidIndex) {
257 // If prev_entry_idx is Entry::kInvalidIndex, then target_entry must be the
258 // head element of the entry linked list, and we have to update
259 // bucket->head_entry_index_.
260 //
261 // Before: target_entry (head) -> next_entry -> ...
262 // After: next_entry (head) -> ...
263 ICING_ASSIGN_OR_RETURN(
264 typename FileBackedVector<Bucket>::MutableView mutable_bucket,
265 bucket_storage_->GetMutable(bucket_idx));
266 if (mutable_bucket.Get().head_entry_index() !=
267 idx_pair.target_entry_index) {
268 return absl_ports::InternalError(
269 "Bucket head entry index is inconsistent with the actual entry linked"
270 "list head. This shouldn't happen");
271 }
272 mutable_bucket.Get().set_head_entry_index(
273 mutable_target_entry.Get().next_entry_index());
274 } else {
275 // Otherwise, connect prev_entry and next_entry, to remove target_entry from
276 // the entry linked list.
277 //
278 // Before: ... -> prev_entry -> target_entry -> next_entry -> ...
279 // After: ... -> prev_entry -> next_entry -> ...
280 ICING_ASSIGN_OR_RETURN(
281 typename FileBackedVector<Entry>::MutableView mutable_prev_entry,
282 entry_storage_->GetMutable(idx_pair.prev_entry_index));
283 mutable_prev_entry.Get().set_next_entry_index(
284 mutable_target_entry.Get().next_entry_index());
285 }
286
287 // Zero out the key value bytes. It is necessary for iterator to iterate
288 // through kv_storage and handle deleted keys properly.
289 int32_t kv_len = key.length() + 1 + info().value_type_size;
290 ICING_RETURN_IF_ERROR(kv_storage_->Set(
291 mutable_target_entry.Get().key_value_index(), kv_len, '\0'));
292
293 // Invalidate target_entry
294 mutable_target_entry.Get().set_key_value_index(kInvalidKVIndex);
295 mutable_target_entry.Get().set_next_entry_index(Entry::kInvalidIndex);
296
297 ++(info().num_deleted_entries);
298
299 return libtextclassifier3::Status::OK;
300 }
301
GetDiskUsage() const302 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
303 ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
304 bucket_storage_->GetDiskUsage());
305 ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
306 entry_storage_->GetDiskUsage());
307 ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
308 kv_storage_->GetDiskUsage());
309
310 int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
311 kv_storage_disk_usage;
312 Filesystem::IncrementByOrSetInvalid(
313 filesystem_.GetDiskUsage(GetMetadataFilePath(working_path_).c_str()),
314 &total);
315
316 if (total < 0 || total == Filesystem::kBadFileSize) {
317 return absl_ports::InternalError(
318 "Failed to get disk usage of PersistentHashMap");
319 }
320 return total;
321 }
322
GetElementsSize() const323 libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
324 const {
325 ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
326 bucket_storage_->GetElementsFileSize());
327 ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
328 entry_storage_->GetElementsFileSize());
329 ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
330 kv_storage_->GetElementsFileSize());
331 return bucket_storage_elements_size + entry_storage_elements_size +
332 kv_storage_elements_size;
333 }
334
335 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeNewFiles(const Filesystem & filesystem,std::string && working_path,Options && options)336 PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
337 std::string&& working_path,
338 Options&& options) {
339 // PersistentHashMap uses working_path as working directory path.
340 // Create working directory.
341 if (!filesystem.CreateDirectory(working_path.c_str())) {
342 return absl_ports::InternalError(
343 absl_ports::StrCat("Failed to create directory: ", working_path));
344 }
345
346 int32_t max_num_buckets_required =
347 std::max(options.init_num_buckets,
348 CalculateNumBucketsRequired(options.max_num_entries,
349 options.max_load_factor_percent));
350
351 // Initialize bucket_storage
352 int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
353 int32_t max_file_size =
354 pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
355 ICING_ASSIGN_OR_RETURN(
356 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
357 FileBackedVector<Bucket>::Create(
358 filesystem, GetBucketStorageFilePath(working_path),
359 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
360 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
361
362 // Initialize entry_storage
363 pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
364 max_file_size =
365 pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
366 ICING_ASSIGN_OR_RETURN(
367 std::unique_ptr<FileBackedVector<Entry>> entry_storage,
368 FileBackedVector<Entry>::Create(
369 filesystem, GetEntryStorageFilePath(working_path),
370 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
371 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
372
373 // Initialize kv_storage
374 pre_mapping_mmap_size =
375 options.average_kv_byte_size * options.max_num_entries;
376 max_file_size =
377 pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
378 ICING_ASSIGN_OR_RETURN(
379 std::unique_ptr<FileBackedVector<char>> kv_storage,
380 FileBackedVector<char>::Create(
381 filesystem, GetKeyValueStorageFilePath(working_path),
382 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
383 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
384
385 // Initialize buckets.
386 ICING_RETURN_IF_ERROR(bucket_storage->Set(
387 /*idx=*/0, /*len=*/options.init_num_buckets, Bucket()));
388 ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
389
390 // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
391 // call GrowAndRemapIfNecessary to grow the underlying file.
392 ICING_ASSIGN_OR_RETURN(
393 MemoryMappedFile metadata_mmapped_file,
394 MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
395 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
396 /*max_file_size=*/kMetadataFileSize,
397 /*pre_mapping_file_offset=*/0,
398 /*pre_mapping_mmap_size=*/kMetadataFileSize));
399 ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
400 /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
401
402 // Create instance.
403 auto new_persistent_hash_map =
404 std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
405 filesystem, std::move(working_path), std::move(options),
406 std::move(metadata_mmapped_file), std::move(bucket_storage),
407 std::move(entry_storage), std::move(kv_storage)));
408 // Initialize info content by writing mapped memory directly.
409 Info& info_ref = new_persistent_hash_map->info();
410 info_ref.magic = Info::kMagic;
411 info_ref.value_type_size = new_persistent_hash_map->options_.value_type_size;
412 info_ref.max_load_factor_percent =
413 new_persistent_hash_map->options_.max_load_factor_percent;
414 info_ref.num_deleted_entries = 0;
415 info_ref.num_deleted_key_value_bytes = 0;
416
417 // Initialize new PersistentStorage. The initial checksums will be computed
418 // and set via InitializeNewStorage.
419 ICING_RETURN_IF_ERROR(new_persistent_hash_map->InitializeNewStorage());
420
421 return new_persistent_hash_map;
422 }
423
424 /* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
InitializeExistingFiles(const Filesystem & filesystem,std::string && working_path,Options && options)425 PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
426 std::string&& working_path,
427 Options&& options) {
428 // Initialize metadata file
429 ICING_ASSIGN_OR_RETURN(
430 MemoryMappedFile metadata_mmapped_file,
431 MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
432 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
433 /*max_file_size=*/kMetadataFileSize,
434 /*pre_mapping_file_offset=*/0,
435 /*pre_mapping_mmap_size=*/kMetadataFileSize));
436 if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
437 return absl_ports::FailedPreconditionError("Incorrect metadata file size");
438 }
439
440 int32_t max_num_buckets_required = CalculateNumBucketsRequired(
441 options.max_num_entries, options.max_load_factor_percent);
442
443 // Initialize bucket_storage
444 int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
445 int32_t max_file_size =
446 pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
447 ICING_ASSIGN_OR_RETURN(
448 std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
449 FileBackedVector<Bucket>::Create(
450 filesystem, GetBucketStorageFilePath(working_path),
451 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
452 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
453
454 // Initialize entry_storage
455 pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
456 max_file_size =
457 pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
458 ICING_ASSIGN_OR_RETURN(
459 std::unique_ptr<FileBackedVector<Entry>> entry_storage,
460 FileBackedVector<Entry>::Create(
461 filesystem, GetEntryStorageFilePath(working_path),
462 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
463 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
464
465 // Initialize kv_storage
466 pre_mapping_mmap_size =
467 options.average_kv_byte_size * options.max_num_entries;
468 max_file_size =
469 pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
470 ICING_ASSIGN_OR_RETURN(
471 std::unique_ptr<FileBackedVector<char>> kv_storage,
472 FileBackedVector<char>::Create(
473 filesystem, GetKeyValueStorageFilePath(working_path),
474 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
475 options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
476
477 // Create instance.
478 auto persistent_hash_map =
479 std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
480 filesystem, std::move(working_path), std::move(options),
481 std::move(metadata_mmapped_file), std::move(bucket_storage),
482 std::move(entry_storage), std::move(kv_storage)));
483
484 // Initialize existing PersistentStorage. Checksums will be validated.
485 ICING_RETURN_IF_ERROR(persistent_hash_map->InitializeExistingStorage());
486
487 // Validate other values of info and options.
488 // Current # of entries should not exceed options_.max_num_entries
489 // We compute max_file_size of 3 storages by options_.max_num_entries. Since
490 // we won't recycle space of deleted entries (and key-value bytes), they're
491 // still occupying space in storages. Even if # of "active" entries doesn't
492 // exceed options_.max_num_entries, the new kvp to be inserted still
493 // potentially exceeds max_file_size.
494 // Therefore, we should use entry_storage_->num_elements() instead of # of
495 // "active" entries
496 // (i.e. entry_storage_->num_elements() - info_ptr->num_deleted_entries) to
497 // check. This feature avoids storages being grown extremely large when there
498 // are many Delete() and Put() operations.
499 if (persistent_hash_map->entry_storage_->num_elements() >
500 persistent_hash_map->options_.max_num_entries) {
501 return absl_ports::FailedPreconditionError(
502 "Current # of entries exceeds max num entries");
503 }
504
505 // Magic should be the same.
506 if (persistent_hash_map->info().magic != Info::kMagic) {
507 return absl_ports::FailedPreconditionError(
508 "PersistentHashMap header magic mismatch");
509 }
510
511 // Value type size should be consistent.
512 if (persistent_hash_map->options_.value_type_size !=
513 persistent_hash_map->info().value_type_size) {
514 return absl_ports::FailedPreconditionError("Incorrect value type size");
515 }
516
517 // Allow max_load_factor_percent_ change.
518 if (persistent_hash_map->options_.max_load_factor_percent !=
519 persistent_hash_map->info().max_load_factor_percent) {
520 ICING_VLOG(2) << "Changing max_load_factor_percent from "
521 << persistent_hash_map->info().max_load_factor_percent
522 << " to "
523 << persistent_hash_map->options_.max_load_factor_percent;
524
525 persistent_hash_map->SetInfoDirty();
526 persistent_hash_map->info().max_load_factor_percent =
527 persistent_hash_map->options_.max_load_factor_percent;
528 ICING_RETURN_IF_ERROR(
529 persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false));
530
531 ICING_RETURN_IF_ERROR(persistent_hash_map->PersistToDisk());
532 }
533
534 return persistent_hash_map;
535 }
536
PersistStoragesToDisk()537 libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk() {
538 if (is_initialized_ && !is_storage_dirty()) {
539 return libtextclassifier3::Status::OK;
540 }
541
542 ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
543 ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
544 ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
545 is_storage_dirty_ = false;
546 return libtextclassifier3::Status::OK;
547 }
548
PersistMetadataToDisk()549 libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk() {
550 // We can skip persisting metadata to disk only if both info and storage are
551 // clean.
552 if (is_initialized_ && !is_info_dirty() && !is_storage_dirty()) {
553 return libtextclassifier3::Status::OK;
554 }
555
556 // Changes should have been applied to the underlying file when using
557 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
558 // extra safety step to ensure they are written out.
559 ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
560 is_info_dirty_ = false;
561 return libtextclassifier3::Status::OK;
562 }
563
564 libtextclassifier3::StatusOr<Crc32>
UpdateStoragesChecksum()565 PersistentHashMap::UpdateStoragesChecksum() {
566 if (is_initialized_ && !is_storage_dirty()) {
567 return Crc32(crcs().component_crcs.storages_crc);
568 }
569
570 // Compute crcs
571 ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
572 bucket_storage_->UpdateChecksum());
573 ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
574 entry_storage_->UpdateChecksum());
575 ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage_->UpdateChecksum());
576 return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
577 kv_storage_crc.Get());
578 }
579
GetInfoChecksum() const580 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::GetInfoChecksum() const {
581 if (is_initialized_ && !is_info_dirty()) {
582 return Crc32(crcs().component_crcs.info_crc);
583 }
584 return info().GetChecksum();
585 }
586
GetStoragesChecksum() const587 libtextclassifier3::StatusOr<Crc32> PersistentHashMap::GetStoragesChecksum()
588 const {
589 if (is_initialized_ && !is_storage_dirty()) {
590 return Crc32(crcs().component_crcs.storages_crc);
591 }
592
593 // Compute crcs
594 Crc32 bucket_storage_crc = bucket_storage_->GetChecksum();
595 Crc32 entry_storage_crc = entry_storage_->GetChecksum();
596 Crc32 kv_storage_crc = kv_storage_->GetChecksum();
597 return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
598 kv_storage_crc.Get());
599 }
600
601 libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair>
FindEntryIndexByKey(int32_t bucket_idx,std::string_view key) const602 PersistentHashMap::FindEntryIndexByKey(int32_t bucket_idx,
603 std::string_view key) const {
604 // Iterate all entries in the bucket, compare with key, and return the entry
605 // index if exists.
606 ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
607 bucket_storage_->Get(bucket_idx));
608
609 int32_t prev_entry_idx = Entry::kInvalidIndex;
610 int32_t curr_entry_idx = bucket->head_entry_index();
611 while (curr_entry_idx != Entry::kInvalidIndex) {
612 ICING_ASSIGN_OR_RETURN(const Entry* entry,
613 entry_storage_->Get(curr_entry_idx));
614 if (entry->key_value_index() == kInvalidKVIndex) {
615 ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
616 "hash map bucket. This shouldn't happen";
617 return absl_ports::InternalError("Unexpected invalid key value index");
618 }
619 ICING_ASSIGN_OR_RETURN(const char* kv_arr,
620 kv_storage_->Get(entry->key_value_index()));
621 if (key.compare(kv_arr) == 0) {
622 return EntryIndexPair(curr_entry_idx, prev_entry_idx);
623 }
624
625 prev_entry_idx = curr_entry_idx;
626 curr_entry_idx = entry->next_entry_index();
627 }
628
629 return EntryIndexPair(curr_entry_idx, prev_entry_idx);
630 }
631
CopyEntryValue(int32_t entry_idx,void * value) const632 libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
633 int32_t entry_idx, void* value) const {
634 ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
635
636 ICING_ASSIGN_OR_RETURN(const char* kv_arr,
637 kv_storage_->Get(entry->key_value_index()));
638 int32_t value_offset = strlen(kv_arr) + 1;
639 memcpy(value, kv_arr + value_offset, info().value_type_size);
640
641 return libtextclassifier3::Status::OK;
642 }
643
Insert(int32_t bucket_idx,std::string_view key,const void * value)644 libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
645 std::string_view key,
646 const void* value) {
647 SetDirty();
648
649 // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries,
650 // then return error.
651 // We compute max_file_size of 3 storages by options_.max_num_entries. Since
652 // we won't recycle space of deleted entries (and key-value bytes), they're
653 // still occupying space in storages. Even if # of "active" entries (i.e.
654 // size()) doesn't exceed options_.max_num_entries, the new kvp to be inserted
655 // still potentially exceeds max_file_size.
656 // Therefore, we should use entry_storage_->num_elements() instead of size()
657 // to check. This feature avoids storages being grown extremely large when
658 // there are many Delete() and Put() operations.
659 if (entry_storage_->num_elements() > options_.max_num_entries - 1) {
660 return absl_ports::ResourceExhaustedError("Cannot insert new entry");
661 }
662
663 ICING_ASSIGN_OR_RETURN(
664 typename FileBackedVector<Bucket>::MutableView mutable_bucket,
665 bucket_storage_->GetMutable(bucket_idx));
666
667 // Append new key value.
668 int32_t new_kv_idx = kv_storage_->num_elements();
669 int32_t kv_len = key.size() + 1 + info().value_type_size;
670 int32_t value_offset = key.size() + 1;
671 ICING_ASSIGN_OR_RETURN(
672 typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
673 kv_storage_->Allocate(kv_len));
674 mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
675 mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
676 mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
677 reinterpret_cast<const char*>(value),
678 info().value_type_size);
679
680 // Append new entry.
681 int32_t new_entry_idx = entry_storage_->num_elements();
682 ICING_RETURN_IF_ERROR(entry_storage_->Append(
683 Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
684 mutable_bucket.Get().set_head_entry_index(new_entry_idx);
685
686 return RehashIfNecessary(/*force_rehash=*/false);
687 }
688
RehashIfNecessary(bool force_rehash)689 libtextclassifier3::Status PersistentHashMap::RehashIfNecessary(
690 bool force_rehash) {
691 int32_t new_num_bucket = bucket_storage_->num_elements();
692 while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 &&
693 size() > static_cast<int64_t>(new_num_bucket) *
694 info().max_load_factor_percent / 100) {
695 new_num_bucket *= 2;
696 }
697
698 if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) {
699 return libtextclassifier3::Status::OK;
700 }
701
702 SetDirty();
703
704 // Resize and reset buckets.
705 ICING_RETURN_IF_ERROR(
706 bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex)));
707
708 // Iterate all key value pairs in kv_storage, rehash and insert.
709 Iterator iter = GetIterator();
710 int32_t entry_idx = 0;
711 while (iter.Advance()) {
712 ICING_ASSIGN_OR_RETURN(int32_t bucket_idx,
713 HashKeyToBucketIndex(iter.GetKey(), new_num_bucket));
714 ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
715 bucket_storage_->GetMutable(bucket_idx));
716
717 // Update entry and bucket.
718 ICING_RETURN_IF_ERROR(entry_storage_->Set(
719 entry_idx,
720 Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index())));
721 mutable_bucket.Get().set_head_entry_index(entry_idx);
722
723 ++entry_idx;
724 }
725
726 // Since there will be some deleted entries, after rehashing entry_storage_
727 // # of vector elements may be greater than the actual # of entries.
728 // Therefore, we have to truncate entry_storage_ to the correct size.
729 if (entry_idx < entry_storage_->num_elements()) {
730 ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx));
731 }
732
733 info().num_deleted_entries = 0;
734
735 return libtextclassifier3::Status::OK;
736 }
737
Advance()738 bool PersistentHashMap::Iterator::Advance() {
739 // Jump over the current key value pair before advancing to the next valid
740 // key value pair. In the first round (after construction), curr_key_len_
741 // is 0, so don't jump over anything.
742 if (curr_key_len_ != 0) {
743 curr_kv_idx_ += curr_key_len_ + 1 + map_->info().value_type_size;
744 curr_key_len_ = 0;
745 }
746
747 // By skipping null chars, we will be automatically handling deleted entries
748 // (which are zeroed out during deletion).
749 for (const char* curr_kv_ptr = map_->kv_storage_->array() + curr_kv_idx_;
750 curr_kv_idx_ < map_->kv_storage_->num_elements();
751 ++curr_kv_ptr, ++curr_kv_idx_) {
752 if (*curr_kv_ptr != '\0') {
753 curr_key_len_ = strlen(curr_kv_ptr);
754 return true;
755 }
756 }
757 return false;
758 }
759
760 } // namespace lib
761 } // namespace icing
762