1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/join/qualified-id-join-index-impl-v3.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/join/document-join-id-pair.h"
36 #include "icing/join/qualified-id-join-index.h"
37 #include "icing/store/document-id.h"
38 #include "icing/store/namespace-id.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/math-util.h"
42 #include "icing/util/status-macros.h"
43
44 namespace icing {
45 namespace lib {
46
47 static constexpr DocumentJoinIdPair kInvalidDocumentJoinIdPair;
48 static constexpr QualifiedIdJoinIndexImplV3::ArrayInfo kInvalidArrayInfo;
49
50 namespace {
51
MakeMetadataFilePath(std::string_view working_path)52 std::string MakeMetadataFilePath(std::string_view working_path) {
53 return absl_ports::StrCat(working_path, "/metadata");
54 }
55
MakeParentDocumentIdToChildArrayInfoFilePath(std::string_view working_path)56 std::string MakeParentDocumentIdToChildArrayInfoFilePath(
57 std::string_view working_path) {
58 return absl_ports::StrCat(working_path,
59 "/parent_document_id_to_child_array_info");
60 }
61
MakeChildDocumentJoinIdPairArrayFilePath(std::string_view working_path)62 std::string MakeChildDocumentJoinIdPairArrayFilePath(
63 std::string_view working_path) {
64 return absl_ports::StrCat(working_path, "/child_document_join_id_pair_array");
65 }
66
67 } // namespace
68
69 /* static */ libtextclassifier3::StatusOr<
70 std::unique_ptr<QualifiedIdJoinIndexImplV3>>
Create(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)71 QualifiedIdJoinIndexImplV3::Create(const Filesystem& filesystem,
72 std::string working_path,
73 const FeatureFlags& feature_flags) {
74 bool metadata_file_exists =
75 filesystem.FileExists(MakeMetadataFilePath(working_path).c_str());
76 bool parent_document_id_to_child_array_info_file_exists =
77 filesystem.FileExists(
78 MakeParentDocumentIdToChildArrayInfoFilePath(working_path).c_str());
79 bool child_document_join_id_pair_array_file_exists = filesystem.FileExists(
80 MakeChildDocumentJoinIdPairArrayFilePath(working_path).c_str());
81
82 // If all files exist, initialize from existing files.
83 if (metadata_file_exists &&
84 parent_document_id_to_child_array_info_file_exists &&
85 child_document_join_id_pair_array_file_exists) {
86 return InitializeExistingFiles(filesystem, std::move(working_path),
87 feature_flags);
88 }
89
90 // If all files don't exist, initialize new files.
91 if (!metadata_file_exists &&
92 !parent_document_id_to_child_array_info_file_exists &&
93 !child_document_join_id_pair_array_file_exists) {
94 return InitializeNewFiles(filesystem, std::move(working_path),
95 feature_flags);
96 }
97
98 // Otherwise, some files are missing, and the join index is somehow corrupted.
99 // Return error to let the caller discard and rebuild.
100 return absl_ports::FailedPreconditionError(
101 "Inconsistent state of qualified id join index (v3)");
102 }
103
~QualifiedIdJoinIndexImplV3()104 QualifiedIdJoinIndexImplV3::~QualifiedIdJoinIndexImplV3() {
105 if (!PersistToDisk().ok()) {
106 ICING_LOG(WARNING) << "Failed to persist qualified id join index (v3) to "
107 "disk while destructing "
108 << working_path_;
109 }
110 }
111
Put(const DocumentJoinIdPair & child_document_join_id_pair,std::vector<DocumentId> && parent_document_ids)112 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Put(
113 const DocumentJoinIdPair& child_document_join_id_pair,
114 std::vector<DocumentId>&& parent_document_ids) {
115 if (!child_document_join_id_pair.is_valid()) {
116 return absl_ports::InvalidArgumentError("Invalid child DocumentJoinIdPair");
117 }
118
119 if (parent_document_ids.empty()) {
120 return libtextclassifier3::Status::OK;
121 }
122
123 SetDirty();
124
125 // Sort and dedupe parent document ids. It will be more efficient to access
126 // parent_document_id_to_child_array_info_ in the order of ids.
127 std::sort(parent_document_ids.begin(), parent_document_ids.end());
128 auto last =
129 std::unique(parent_document_ids.begin(), parent_document_ids.end());
130 parent_document_ids.erase(last, parent_document_ids.end());
131
132 // Append child_document_join_id_pair to each parent's DocumentJoinIdPair
133 // array.
134 for (DocumentId parent_document_id : parent_document_ids) {
135 if (parent_document_id < 0 || parent_document_id == kInvalidDocumentId) {
136 // Skip invalid parent document id.
137 continue;
138 }
139
140 // Insert child_document_join_id_pair into the parent's DocumentJoinIdPair
141 // array.
142 ICING_RETURN_IF_ERROR(AppendChildDocumentJoinIdPairsForParent(
143 parent_document_id, {child_document_join_id_pair}));
144 }
145 return libtextclassifier3::Status::OK;
146 }
147
148 libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>>
Get(DocumentId parent_document_id) const149 QualifiedIdJoinIndexImplV3::Get(DocumentId parent_document_id) const {
150 if (parent_document_id < 0 || parent_document_id == kInvalidDocumentId) {
151 return absl_ports::InvalidArgumentError("Invalid parent document id");
152 }
153
154 if (parent_document_id >=
155 parent_document_id_to_child_array_info_->num_elements()) {
156 return std::vector<DocumentJoinIdPair>();
157 }
158
159 // Get the child array info for the parent.
160 ICING_ASSIGN_OR_RETURN(
161 const ArrayInfo* array_info,
162 parent_document_id_to_child_array_info_->Get(parent_document_id));
163 if (!array_info->IsValid()) {
164 return std::vector<DocumentJoinIdPair>();
165 }
166
167 // Safe check to avoid out-of-bound access. This should never happen unless
168 // the internal data structure is corrupted.
169 if (array_info->index + array_info->length >
170 child_document_join_id_pair_array_->num_elements()) {
171 return absl_ports::InternalError(absl_ports::StrCat(
172 "Invalid array info: ", std::to_string(array_info->index), " + ",
173 std::to_string(array_info->length), " > ",
174 std::to_string(child_document_join_id_pair_array_->num_elements())));
175 }
176
177 // Get the DocumentJoinIdPair array and return the child DocumentJoinIdPairs.
178 ICING_ASSIGN_OR_RETURN(
179 const DocumentJoinIdPair* ptr,
180 child_document_join_id_pair_array_->Get(array_info->index));
181 return std::vector<DocumentJoinIdPair>(ptr, ptr + array_info->used_length);
182 }
183
MigrateParent(DocumentId old_document_id,DocumentId new_document_id)184 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::MigrateParent(
185 DocumentId old_document_id, DocumentId new_document_id) {
186 if (!IsDocumentIdValid(old_document_id) ||
187 !IsDocumentIdValid(new_document_id)) {
188 return absl_ports::InvalidArgumentError(
189 "Invalid parent document id to migrate");
190 }
191
192 if (old_document_id >=
193 parent_document_id_to_child_array_info_->num_elements()) {
194 // It means the parent document doesn't have any children at this moment. No
195 // need to migrate.
196 return libtextclassifier3::Status::OK;
197 }
198
199 ICING_ASSIGN_OR_RETURN(
200 FileBackedVector<ArrayInfo>::MutableView mutable_old_array_info,
201 parent_document_id_to_child_array_info_->GetMutable(old_document_id));
202 if (!mutable_old_array_info.Get().IsValid()) {
203 // It means the parent document doesn't have any children at this moment. No
204 // need to migrate.
205 return libtextclassifier3::Status::OK;
206 }
207
208 ICING_RETURN_IF_ERROR(
209 ExtendParentDocumentIdToChildArrayInfoIfNecessary(new_document_id));
210 ICING_RETURN_IF_ERROR(parent_document_id_to_child_array_info_->Set(
211 new_document_id, mutable_old_array_info.Get()));
212 mutable_old_array_info.Get() = kInvalidArrayInfo;
213
214 return libtextclassifier3::Status::OK;
215 }
216
Optimize(const std::vector<DocumentId> & document_id_old_to_new,const std::vector<NamespaceId> & namespace_id_old_to_new,DocumentId new_last_added_document_id)217 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Optimize(
218 const std::vector<DocumentId>& document_id_old_to_new,
219 const std::vector<NamespaceId>& namespace_id_old_to_new,
220 DocumentId new_last_added_document_id) {
221 std::string temp_working_path = working_path_ + "_temp";
222 ICING_RETURN_IF_ERROR(
223 QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path));
224
225 DestructibleDirectory temp_working_path_ddir(&filesystem_,
226 std::move(temp_working_path));
227 if (!temp_working_path_ddir.is_valid()) {
228 return absl_ports::InternalError(
229 "Unable to create temp directory to build new qualified id join index "
230 "(v3)");
231 }
232
233 {
234 // Transfer all data from the current to new qualified id join index. Also
235 // PersistToDisk and destruct the instance after finishing, so we can safely
236 // swap directories later.
237 ICING_ASSIGN_OR_RETURN(
238 std::unique_ptr<QualifiedIdJoinIndexImplV3> new_index,
239 Create(filesystem_, temp_working_path_ddir.dir(), feature_flags_));
240 ICING_RETURN_IF_ERROR(
241 TransferIndex(document_id_old_to_new, new_index.get()));
242 new_index->set_last_added_document_id(new_last_added_document_id);
243 new_index->SetDirty();
244
245 ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
246 }
247
248 // Destruct current index's storage instances to safely swap directories.
249 child_document_join_id_pair_array_.reset();
250 parent_document_id_to_child_array_info_.reset();
251 metadata_mmapped_file_.reset();
252 if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
253 working_path_.c_str())) {
254 return absl_ports::InternalError(
255 "Unable to apply new qualified id join index (v3) due to failed swap");
256 }
257
258 // Reinitialize qualified id join index.
259 ICING_ASSIGN_OR_RETURN(
260 MemoryMappedFile metadata_mmapped_file,
261 MemoryMappedFile::Create(filesystem_, MakeMetadataFilePath(working_path_),
262 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
263 /*max_file_size=*/kMetadataFileSize,
264 /*pre_mapping_file_offset=*/0,
265 /*pre_mapping_mmap_size=*/kMetadataFileSize));
266 metadata_mmapped_file_ =
267 std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
268
269 ICING_ASSIGN_OR_RETURN(
270 parent_document_id_to_child_array_info_,
271 FileBackedVector<ArrayInfo>::Create(
272 filesystem_,
273 MakeParentDocumentIdToChildArrayInfoFilePath(working_path_),
274 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
275 FileBackedVector<ArrayInfo>::kMaxFileSize,
276 /*pre_mapping_mmap_size=*/0));
277
278 ICING_ASSIGN_OR_RETURN(
279 child_document_join_id_pair_array_,
280 FileBackedVector<DocumentJoinIdPair>::Create(
281 filesystem_, MakeChildDocumentJoinIdPairArrayFilePath(working_path_),
282 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
283 FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
284 /*pre_mapping_mmap_size=*/0));
285
286 return libtextclassifier3::Status::OK;
287 }
288
Clear()289 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Clear() {
290 SetDirty();
291
292 parent_document_id_to_child_array_info_.reset();
293 // Discard and reinitialize parent_document_id_to_child_array_info.
294 std::string parent_document_id_to_child_array_info_file_path =
295 MakeParentDocumentIdToChildArrayInfoFilePath(working_path_);
296 if (!filesystem_.DeleteFile(
297 parent_document_id_to_child_array_info_file_path.c_str())) {
298 return absl_ports::InternalError(absl_ports::StrCat(
299 "Failed to clear parent document id to child array info: ",
300 parent_document_id_to_child_array_info_file_path));
301 }
302 ICING_ASSIGN_OR_RETURN(
303 parent_document_id_to_child_array_info_,
304 FileBackedVector<ArrayInfo>::Create(
305 filesystem_, parent_document_id_to_child_array_info_file_path,
306 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
307 FileBackedVector<ArrayInfo>::kMaxFileSize,
308 /*pre_mapping_mmap_size=*/0));
309
310 child_document_join_id_pair_array_.reset();
311 // Discard and reinitialize child_document_join_id_pair_array.
312 std::string child_document_join_id_pair_array_file_path =
313 MakeChildDocumentJoinIdPairArrayFilePath(working_path_);
314 if (!filesystem_.DeleteFile(
315 child_document_join_id_pair_array_file_path.c_str())) {
316 return absl_ports::InternalError(absl_ports::StrCat(
317 "Failed to clear child document join id pair array: ",
318 child_document_join_id_pair_array_file_path));
319 }
320 ICING_ASSIGN_OR_RETURN(
321 child_document_join_id_pair_array_,
322 FileBackedVector<DocumentJoinIdPair>::Create(
323 filesystem_, child_document_join_id_pair_array_file_path,
324 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
325 FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
326 /*pre_mapping_mmap_size=*/0));
327
328 info().num_data = 0;
329 info().last_added_document_id = kInvalidDocumentId;
330 return libtextclassifier3::Status::OK;
331 }
332
333 /* static */ libtextclassifier3::StatusOr<
334 std::unique_ptr<QualifiedIdJoinIndexImplV3>>
InitializeNewFiles(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)335 QualifiedIdJoinIndexImplV3::InitializeNewFiles(
336 const Filesystem& filesystem, std::string working_path,
337 const FeatureFlags& feature_flags) {
338 // Create working directory.
339 if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
340 return absl_ports::InternalError(
341 absl_ports::StrCat("Failed to create directory: ", working_path));
342 }
343
344 // Initialize metadata file.
345 // - Create MemoryMappedFile with pre-mapping and call GrowAndRemapIfNecessary
346 // to grow the underlying file.
347 // - Initialize metadata content by writing 0 bytes to the memory region
348 // directly.
349 ICING_ASSIGN_OR_RETURN(
350 MemoryMappedFile metadata_mmapped_file,
351 MemoryMappedFile::Create(filesystem, MakeMetadataFilePath(working_path),
352 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
353 /*max_file_size=*/kMetadataFileSize,
354 /*pre_mapping_file_offset=*/0,
355 /*pre_mapping_mmap_size=*/kMetadataFileSize));
356 ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
357 /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
358 memset(metadata_mmapped_file.mutable_region(), 0, kMetadataFileSize);
359
360 // Initialize parent_document_id_to_child_array_info.
361 ICING_ASSIGN_OR_RETURN(
362 std::unique_ptr<FileBackedVector<ArrayInfo>>
363 parent_document_id_to_child_array_info,
364 FileBackedVector<ArrayInfo>::Create(
365 filesystem,
366 MakeParentDocumentIdToChildArrayInfoFilePath(working_path),
367 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
368 FileBackedVector<ArrayInfo>::kMaxFileSize,
369 /*pre_mapping_mmap_size=*/0));
370
371 // Initialize child_document_join_id_pair_array.
372 ICING_ASSIGN_OR_RETURN(
373 std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
374 child_document_join_id_pair_array,
375 FileBackedVector<DocumentJoinIdPair>::Create(
376 filesystem, MakeChildDocumentJoinIdPairArrayFilePath(working_path),
377 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
378 FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
379 /*pre_mapping_mmap_size=*/0));
380
381 // Create instance.
382 auto new_join_index = std::unique_ptr<QualifiedIdJoinIndexImplV3>(
383 new QualifiedIdJoinIndexImplV3(
384 filesystem, std::move(working_path),
385 std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
386 std::move(parent_document_id_to_child_array_info),
387 std::move(child_document_join_id_pair_array), feature_flags));
388 // Initialize info content.
389 new_join_index->info().magic = Info::kMagic;
390 new_join_index->info().num_data = 0;
391 new_join_index->info().last_added_document_id = kInvalidDocumentId;
392
393 // Initialize new PersistentStorage. The initial checksums will be computed
394 // and set via InitializeNewStorage.
395 ICING_RETURN_IF_ERROR(new_join_index->InitializeNewStorage());
396
397 return new_join_index;
398 }
399
400 /* static */ libtextclassifier3::StatusOr<
401 std::unique_ptr<QualifiedIdJoinIndexImplV3>>
InitializeExistingFiles(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)402 QualifiedIdJoinIndexImplV3::InitializeExistingFiles(
403 const Filesystem& filesystem, std::string working_path,
404 const FeatureFlags& feature_flags) {
405 // Initialize metadata file.
406 ICING_ASSIGN_OR_RETURN(
407 MemoryMappedFile metadata_mmapped_file,
408 MemoryMappedFile::Create(filesystem, MakeMetadataFilePath(working_path),
409 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
410 /*max_file_size=*/kMetadataFileSize,
411 /*pre_mapping_file_offset=*/0,
412 /*pre_mapping_mmap_size=*/kMetadataFileSize));
413 if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
414 return absl_ports::FailedPreconditionError("Incorrect metadata file size");
415 }
416
417 // Initialize parent_document_id_to_child_array_info. Set mmap pre-mapping
418 // size to 0, but MemoryMappedFile will still mmap to the file size.
419 ICING_ASSIGN_OR_RETURN(
420 std::unique_ptr<FileBackedVector<ArrayInfo>>
421 parent_document_id_to_child_array_info,
422 FileBackedVector<ArrayInfo>::Create(
423 filesystem,
424 MakeParentDocumentIdToChildArrayInfoFilePath(working_path),
425 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
426 FileBackedVector<ArrayInfo>::kMaxFileSize,
427 /*pre_mapping_mmap_size=*/0));
428
429 // Initialize child_document_join_id_pair_array. Set mmap pre-mapping size to
430 // 0, but MemoryMappedFile will still mmap to the file size.
431 ICING_ASSIGN_OR_RETURN(
432 std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
433 child_document_join_id_pair_array,
434 FileBackedVector<DocumentJoinIdPair>::Create(
435 filesystem, MakeChildDocumentJoinIdPairArrayFilePath(working_path),
436 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
437 FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
438 /*pre_mapping_mmap_size=*/0));
439
440 // Create instance.
441 auto join_index = std::unique_ptr<QualifiedIdJoinIndexImplV3>(
442 new QualifiedIdJoinIndexImplV3(
443 filesystem, std::move(working_path),
444 std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
445 std::move(parent_document_id_to_child_array_info),
446 std::move(child_document_join_id_pair_array), feature_flags));
447
448 // Initialize existing PersistentStorage. Checksums will be validated.
449 ICING_RETURN_IF_ERROR(join_index->InitializeExistingStorage());
450
451 // Validate magic.
452 if (join_index->info().magic != Info::kMagic) {
453 return absl_ports::FailedPreconditionError("Incorrect magic value");
454 }
455
456 return join_index;
457 }
458
459 libtextclassifier3::Status
AppendChildDocumentJoinIdPairsForParent(DocumentId parent_document_id,std::vector<DocumentJoinIdPair> && child_document_join_id_pairs)460 QualifiedIdJoinIndexImplV3::AppendChildDocumentJoinIdPairsForParent(
461 DocumentId parent_document_id,
462 std::vector<DocumentJoinIdPair>&& child_document_join_id_pairs) {
463 if (child_document_join_id_pairs.empty()) {
464 return libtextclassifier3::Status::OK;
465 }
466
467 // Step 1: extend parent_document_id_to_child_array_info_ if necessary.
468 ICING_RETURN_IF_ERROR(
469 ExtendParentDocumentIdToChildArrayInfoIfNecessary(parent_document_id));
470
471 // Step 2: get the parent's child DocumentJoinIdPair mutable array (extend if
472 // necessary), append new child_document_join_id_pairs to the array,
473 // and assign the new ArrayInfo to
474 // parent_document_id_to_child_array_info_.
475 ICING_ASSIGN_OR_RETURN(
476 const ArrayInfo* array_info,
477 parent_document_id_to_child_array_info_->Get(parent_document_id));
478 ICING_ASSIGN_OR_RETURN(
479 GetMutableAndExtendResult result,
480 GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(
481 *array_info, /*num_to_add=*/child_document_join_id_pairs.size()));
482 // - [0, result.array_info.used_length) contain valid elements.
483 // - We will write new elements starting from index
484 // result.array_info.used_length, and update the used_length.
485 result.mutable_arr.SetArray(/*idx=*/result.array_info.used_length,
486 /*arr=*/child_document_join_id_pairs.data(),
487 /*len=*/child_document_join_id_pairs.size());
488 result.array_info.used_length += child_document_join_id_pairs.size();
489
490 // Set ArrayInfo back to parent_document_id_to_child_array_info_.
491 ICING_RETURN_IF_ERROR(parent_document_id_to_child_array_info_->Set(
492 parent_document_id, result.array_info));
493
494 // Update header.
495 info().num_data += child_document_join_id_pairs.size();
496
497 return libtextclassifier3::Status::OK;
498 }
499
500 libtextclassifier3::Status
ExtendParentDocumentIdToChildArrayInfoIfNecessary(DocumentId parent_document_id)501 QualifiedIdJoinIndexImplV3::ExtendParentDocumentIdToChildArrayInfoIfNecessary(
502 DocumentId parent_document_id) {
503 if (parent_document_id >=
504 parent_document_id_to_child_array_info_->num_elements()) {
505 int32_t num_to_extend =
506 (parent_document_id + 1) -
507 parent_document_id_to_child_array_info_->num_elements();
508 ICING_ASSIGN_OR_RETURN(
509 FileBackedVector<ArrayInfo>::MutableArrayView mutable_arr,
510 parent_document_id_to_child_array_info_->Allocate(num_to_extend));
511 mutable_arr.Fill(/*idx=*/0, /*len=*/num_to_extend, kInvalidArrayInfo);
512 }
513 return libtextclassifier3::Status::OK;
514 }
515
516 libtextclassifier3::StatusOr<
517 QualifiedIdJoinIndexImplV3::GetMutableAndExtendResult>
518 QualifiedIdJoinIndexImplV3::
GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(const ArrayInfo & array_info,int32_t num_to_add)519 GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(
520 const ArrayInfo& array_info, int32_t num_to_add) {
521 if (num_to_add > kMaxNumChildrenPerParent) {
522 return absl_ports::ResourceExhaustedError(
523 absl_ports::StrCat("Too many children to add for a single parent: ",
524 std::to_string(num_to_add)));
525 }
526
527 if (!array_info.IsValid()) {
528 // If the array info is invalid, then it means the array is not allocated
529 // yet. Allocate a new array and fill it with invalid values.
530 ArrayInfo new_array_info(
531 /*index_in=*/child_document_join_id_pair_array_->num_elements(),
532 /*length_in=*/math_util::NextPowerOf2(num_to_add),
533 /*used_length_in=*/0);
534 ICING_ASSIGN_OR_RETURN(
535 FileBackedVector<DocumentJoinIdPair>::MutableArrayView new_mutable_arr,
536 child_document_join_id_pair_array_->Allocate(new_array_info.length));
537 new_mutable_arr.Fill(/*idx=*/0, /*len=*/new_array_info.length,
538 kInvalidDocumentJoinIdPair);
539
540 return GetMutableAndExtendResult{.array_info = new_array_info,
541 .mutable_arr = std::move(new_mutable_arr)};
542 }
543
544 // The original array is valid. Get it and check if it needs to be extended.
545 ICING_ASSIGN_OR_RETURN(FileBackedVector<DocumentJoinIdPair>::MutableArrayView
546 original_mutable_arr,
547 child_document_join_id_pair_array_->GetMutable(
548 array_info.index, array_info.length));
549 // - [0, array_info.used_length) contain valid elements.
550 // - [array_info.used_length, array_info.length) are invalid elements and
551 // available for new elements.
552 if (array_info.length - array_info.used_length >= num_to_add) {
553 // Available space is enough to fit num_to_add. Return the original array.
554 return GetMutableAndExtendResult{
555 .array_info = array_info,
556 .mutable_arr = std::move(original_mutable_arr)};
557 }
558
559 // Need to extend. The new array length should be able to fit all existing
560 // elements and the newly added elements.
561 //
562 // Check the new # of elements should not exceed the max limit. Use
563 // substraction to avoid overflow.
564 if (num_to_add > kMaxNumChildrenPerParent - array_info.used_length) {
565 return absl_ports::ResourceExhaustedError(absl_ports::StrCat(
566 "Too many children to add for an existing parent: ",
567 std::to_string(num_to_add),
568 "; existing: ", std::to_string(array_info.used_length)));
569 }
570
571 if (array_info.index + array_info.length ==
572 child_document_join_id_pair_array_->num_elements()) {
573 // The original array is at the end of the FBV. We can extend and reshape it
574 // directly without copying elements.
575 //
576 // (Note: # means data out of the array_info we're dealing with.)
577 // Original FBV and array_info layout:
578 // +-------------------------+---------------.------+
579 // |#########################| used |unused|
580 // +-------------------------+---------------.------+
581 // ^ ^
582 // index FBV_END
583 // |<-used_length->|
584 // |<- length ->|
585 //
586 // New FBV and new_array_info layout after extension:
587 // +-------------------------+---------------.------.------------------+
588 // |#########################| used |unused| extended slice |
589 // +-------------------------+---------------.------.------------------+
590 // ^ ^
591 // index FBV_END
592 // |<-used_length->|
593 // |<- new length ->|
594 ArrayInfo new_array_info = array_info;
595 new_array_info.length =
596 math_util::NextPowerOf2(array_info.used_length + num_to_add);
597
598 // Extend the original array by allocating a new slice.
599 ICING_ASSIGN_OR_RETURN(
600 FileBackedVector<DocumentJoinIdPair>::MutableArrayView extended_slice,
601 child_document_join_id_pair_array_->Allocate(new_array_info.length -
602 array_info.length));
603 // Fill the extended slice with invalid values.
604 extended_slice.Fill(/*idx=*/0,
605 /*len=*/new_array_info.length - array_info.length,
606 kInvalidDocumentJoinIdPair);
607
608 // Construct the mutable array view for new_array_info and return.
609 ICING_ASSIGN_OR_RETURN(
610 FileBackedVector<DocumentJoinIdPair>::MutableArrayView mutable_arr,
611 child_document_join_id_pair_array_->GetMutable(new_array_info.index,
612 new_array_info.length));
613
614 return GetMutableAndExtendResult{.array_info = std::move(new_array_info),
615 .mutable_arr = std::move(mutable_arr)};
616 }
617
618 // The original array is not at the end of the file. We allocate a new array
619 // starting at the end of the FBV, copy all existing elements to the new
620 // array, and fill the remaining with invalid values.
621 //
622 // Original FBV and array_info layout:
623 // +---+---------------.------+------+
624 // |###| used |unused|######|
625 // +---+---------------.------+------+
626 // ^ ^
627 // index FBV_END
628 // |<-used_length->|
629 // |<- length ->|
630 //
631 // New FBV and new_array_info layout after extension and moving:
632 // +---+----------------------+------+---------------.-----------------------+
633 // |###| <INVALID> |######| used | |
634 // +---+----------------------+------+---------------.-----------------------+
635 // ^
636 // new_index
637 // |<-used_length->|
638 // |<- new_length ->|
639 ArrayInfo new_array_info(
640 /*index_in=*/child_document_join_id_pair_array_->num_elements(),
641 /*length_in=*/
642 math_util::NextPowerOf2(array_info.used_length + num_to_add),
643 /*used_length_in=*/array_info.used_length);
644 ICING_ASSIGN_OR_RETURN(
645 FileBackedVector<DocumentJoinIdPair>::MutableArrayView new_mutable_arr,
646 child_document_join_id_pair_array_->Allocate(new_array_info.length));
647
648 // It is possible that Allocate() causes FBV to grow and remap. In this case,
649 // original_mutable_arr will point to an invalid memory region, so we need to
650 // refresh it.
651 //
652 // Note: original_mutable_arr has length = array_info.length and only contains
653 // valid elements with length = array_info.used_length. We could've
654 // constructed original_mutable_arr with length = array_info.used_length
655 // here, but let's make it consistent with all other cases (i.e.
656 // constructing the array view object for the entire extensible array).
657 ICING_ASSIGN_OR_RETURN(original_mutable_arr,
658 child_document_join_id_pair_array_->GetMutable(
659 array_info.index, array_info.length));
660
661 // Move all existing elements to the new array.
662 new_mutable_arr.SetArray(/*idx=*/0, original_mutable_arr.data(),
663 array_info.used_length);
664 // Fill the remaining with invalid values.
665 new_mutable_arr.Fill(/*idx=*/array_info.used_length,
666 /*len=*/new_array_info.length - array_info.used_length,
667 kInvalidDocumentJoinIdPair);
668 // Invalidate the original array.
669 original_mutable_arr.Fill(
670 /*idx=*/0, /*len=*/array_info.length, kInvalidDocumentJoinIdPair);
671 return GetMutableAndExtendResult{.array_info = new_array_info,
672 .mutable_arr = std::move(new_mutable_arr)};
673 }
674
TransferIndex(const std::vector<DocumentId> & document_id_old_to_new,QualifiedIdJoinIndexImplV3 * new_index) const675 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::TransferIndex(
676 const std::vector<DocumentId>& document_id_old_to_new,
677 QualifiedIdJoinIndexImplV3* new_index) const {
678 for (DocumentId old_parent_doc_id = 0;
679 old_parent_doc_id <
680 parent_document_id_to_child_array_info_->num_elements();
681 ++old_parent_doc_id) {
682 if (old_parent_doc_id >= document_id_old_to_new.size() ||
683 document_id_old_to_new[old_parent_doc_id] == kInvalidDocumentId) {
684 // Skip if the old parent document id is invalid after optimization.
685 continue;
686 }
687
688 ICING_ASSIGN_OR_RETURN(
689 const ArrayInfo* array_info,
690 parent_document_id_to_child_array_info_->Get(old_parent_doc_id));
691 if (!array_info->IsValid()) {
692 continue;
693 }
694 ICING_ASSIGN_OR_RETURN(
695 const DocumentJoinIdPair* ptr,
696 child_document_join_id_pair_array_->Get(array_info->index));
697
698 // Get all child DocumentJoinIdPairs and assign new child document ids.
699 std::vector<DocumentJoinIdPair> new_child_doc_join_id_pairs;
700 new_child_doc_join_id_pairs.reserve(array_info->length);
701 for (int i = 0; i < array_info->used_length; ++i) {
702 DocumentId old_child_doc_id = ptr[i].document_id();
703 DocumentId new_child_doc_id =
704 old_child_doc_id >= 0 &&
705 old_child_doc_id < document_id_old_to_new.size()
706 ? document_id_old_to_new[old_child_doc_id]
707 : kInvalidDocumentId;
708 if (new_child_doc_id == kInvalidDocumentId) {
709 continue;
710 }
711
712 new_child_doc_join_id_pairs.push_back(
713 DocumentJoinIdPair(new_child_doc_id, ptr[i].joinable_property_id()));
714 }
715
716 ICING_RETURN_IF_ERROR(new_index->AppendChildDocumentJoinIdPairsForParent(
717 document_id_old_to_new[old_parent_doc_id],
718 std::move(new_child_doc_join_id_pairs)));
719 }
720 return libtextclassifier3::Status::OK;
721 }
722
PersistMetadataToDisk()723 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::PersistMetadataToDisk() {
724 // We can skip persisting metadata to disk only if both info and storage are
725 // clean.
726 if (is_initialized_ && !is_info_dirty() && !is_storage_dirty()) {
727 return libtextclassifier3::Status::OK;
728 }
729
730 // Changes should have been applied to the underlying file when using
731 // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
732 // extra safety step to ensure they are written out.
733 ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
734 is_info_dirty_ = false;
735 return libtextclassifier3::Status::OK;
736 }
737
PersistStoragesToDisk()738 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::PersistStoragesToDisk() {
739 if (is_initialized_ && !is_storage_dirty()) {
740 return libtextclassifier3::Status::OK;
741 }
742
743 ICING_RETURN_IF_ERROR(
744 parent_document_id_to_child_array_info_->PersistToDisk());
745 ICING_RETURN_IF_ERROR(child_document_join_id_pair_array_->PersistToDisk());
746
747 is_storage_dirty_ = false;
748 return libtextclassifier3::Status::OK;
749 }
750
751 libtextclassifier3::StatusOr<Crc32>
UpdateStoragesChecksum()752 QualifiedIdJoinIndexImplV3::UpdateStoragesChecksum() {
753 if (is_initialized_ && !is_storage_dirty()) {
754 return Crc32(crcs().component_crcs.storages_crc);
755 }
756
757 // Compute crcs
758 ICING_ASSIGN_OR_RETURN(
759 Crc32 parent_document_id_to_child_array_info_crc,
760 parent_document_id_to_child_array_info_->UpdateChecksum());
761 ICING_ASSIGN_OR_RETURN(Crc32 child_document_join_id_pair_array_crc,
762 child_document_join_id_pair_array_->UpdateChecksum());
763
764 return Crc32(parent_document_id_to_child_array_info_crc.Get() ^
765 child_document_join_id_pair_array_crc.Get());
766 }
767
768 libtextclassifier3::StatusOr<Crc32>
GetInfoChecksum() const769 QualifiedIdJoinIndexImplV3::GetInfoChecksum() const {
770 if (is_initialized_ && !is_info_dirty()) {
771 return Crc32(crcs().component_crcs.info_crc);
772 }
773
774 return info().GetChecksum();
775 }
776
777 libtextclassifier3::StatusOr<Crc32>
GetStoragesChecksum() const778 QualifiedIdJoinIndexImplV3::GetStoragesChecksum() const {
779 if (is_initialized_ && !is_storage_dirty()) {
780 return Crc32(crcs().component_crcs.storages_crc);
781 }
782
783 // Get checksums for all components.
784 Crc32 parent_document_id_to_child_array_info_crc =
785 parent_document_id_to_child_array_info_->GetChecksum();
786 Crc32 child_document_join_id_pair_array_crc =
787 child_document_join_id_pair_array_->GetChecksum();
788
789 return Crc32(parent_document_id_to_child_array_info_crc.Get() ^
790 child_document_join_id_pair_array_crc.Get());
791 }
792
793 } // namespace lib
794 } // namespace icing
795