xref: /aosp_15_r20/external/icing/icing/join/qualified-id-join-index-impl-v3.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/join/qualified-id-join-index-impl-v3.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/join/document-join-id-pair.h"
36 #include "icing/join/qualified-id-join-index.h"
37 #include "icing/store/document-id.h"
38 #include "icing/store/namespace-id.h"
39 #include "icing/util/crc32.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/math-util.h"
42 #include "icing/util/status-macros.h"
43 
44 namespace icing {
45 namespace lib {
46 
47 static constexpr DocumentJoinIdPair kInvalidDocumentJoinIdPair;
48 static constexpr QualifiedIdJoinIndexImplV3::ArrayInfo kInvalidArrayInfo;
49 
50 namespace {
51 
MakeMetadataFilePath(std::string_view working_path)52 std::string MakeMetadataFilePath(std::string_view working_path) {
53   return absl_ports::StrCat(working_path, "/metadata");
54 }
55 
MakeParentDocumentIdToChildArrayInfoFilePath(std::string_view working_path)56 std::string MakeParentDocumentIdToChildArrayInfoFilePath(
57     std::string_view working_path) {
58   return absl_ports::StrCat(working_path,
59                             "/parent_document_id_to_child_array_info");
60 }
61 
MakeChildDocumentJoinIdPairArrayFilePath(std::string_view working_path)62 std::string MakeChildDocumentJoinIdPairArrayFilePath(
63     std::string_view working_path) {
64   return absl_ports::StrCat(working_path, "/child_document_join_id_pair_array");
65 }
66 
67 }  // namespace
68 
69 /* static */ libtextclassifier3::StatusOr<
70     std::unique_ptr<QualifiedIdJoinIndexImplV3>>
Create(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)71 QualifiedIdJoinIndexImplV3::Create(const Filesystem& filesystem,
72                                    std::string working_path,
73                                    const FeatureFlags& feature_flags) {
74   bool metadata_file_exists =
75       filesystem.FileExists(MakeMetadataFilePath(working_path).c_str());
76   bool parent_document_id_to_child_array_info_file_exists =
77       filesystem.FileExists(
78           MakeParentDocumentIdToChildArrayInfoFilePath(working_path).c_str());
79   bool child_document_join_id_pair_array_file_exists = filesystem.FileExists(
80       MakeChildDocumentJoinIdPairArrayFilePath(working_path).c_str());
81 
82   // If all files exist, initialize from existing files.
83   if (metadata_file_exists &&
84       parent_document_id_to_child_array_info_file_exists &&
85       child_document_join_id_pair_array_file_exists) {
86     return InitializeExistingFiles(filesystem, std::move(working_path),
87                                    feature_flags);
88   }
89 
90   // If all files don't exist, initialize new files.
91   if (!metadata_file_exists &&
92       !parent_document_id_to_child_array_info_file_exists &&
93       !child_document_join_id_pair_array_file_exists) {
94     return InitializeNewFiles(filesystem, std::move(working_path),
95                               feature_flags);
96   }
97 
98   // Otherwise, some files are missing, and the join index is somehow corrupted.
99   // Return error to let the caller discard and rebuild.
100   return absl_ports::FailedPreconditionError(
101       "Inconsistent state of qualified id join index (v3)");
102 }
103 
~QualifiedIdJoinIndexImplV3()104 QualifiedIdJoinIndexImplV3::~QualifiedIdJoinIndexImplV3() {
105   if (!PersistToDisk().ok()) {
106     ICING_LOG(WARNING) << "Failed to persist qualified id join index (v3) to "
107                           "disk while destructing "
108                        << working_path_;
109   }
110 }
111 
Put(const DocumentJoinIdPair & child_document_join_id_pair,std::vector<DocumentId> && parent_document_ids)112 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Put(
113     const DocumentJoinIdPair& child_document_join_id_pair,
114     std::vector<DocumentId>&& parent_document_ids) {
115   if (!child_document_join_id_pair.is_valid()) {
116     return absl_ports::InvalidArgumentError("Invalid child DocumentJoinIdPair");
117   }
118 
119   if (parent_document_ids.empty()) {
120     return libtextclassifier3::Status::OK;
121   }
122 
123   SetDirty();
124 
125   // Sort and dedupe parent document ids. It will be more efficient to access
126   // parent_document_id_to_child_array_info_ in the order of ids.
127   std::sort(parent_document_ids.begin(), parent_document_ids.end());
128   auto last =
129       std::unique(parent_document_ids.begin(), parent_document_ids.end());
130   parent_document_ids.erase(last, parent_document_ids.end());
131 
132   // Append child_document_join_id_pair to each parent's DocumentJoinIdPair
133   // array.
134   for (DocumentId parent_document_id : parent_document_ids) {
135     if (parent_document_id < 0 || parent_document_id == kInvalidDocumentId) {
136       // Skip invalid parent document id.
137       continue;
138     }
139 
140     // Insert child_document_join_id_pair into the parent's DocumentJoinIdPair
141     // array.
142     ICING_RETURN_IF_ERROR(AppendChildDocumentJoinIdPairsForParent(
143         parent_document_id, {child_document_join_id_pair}));
144   }
145   return libtextclassifier3::Status::OK;
146 }
147 
148 libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>>
Get(DocumentId parent_document_id) const149 QualifiedIdJoinIndexImplV3::Get(DocumentId parent_document_id) const {
150   if (parent_document_id < 0 || parent_document_id == kInvalidDocumentId) {
151     return absl_ports::InvalidArgumentError("Invalid parent document id");
152   }
153 
154   if (parent_document_id >=
155       parent_document_id_to_child_array_info_->num_elements()) {
156     return std::vector<DocumentJoinIdPair>();
157   }
158 
159   // Get the child array info for the parent.
160   ICING_ASSIGN_OR_RETURN(
161       const ArrayInfo* array_info,
162       parent_document_id_to_child_array_info_->Get(parent_document_id));
163   if (!array_info->IsValid()) {
164     return std::vector<DocumentJoinIdPair>();
165   }
166 
167   // Safe check to avoid out-of-bound access. This should never happen unless
168   // the internal data structure is corrupted.
169   if (array_info->index + array_info->length >
170       child_document_join_id_pair_array_->num_elements()) {
171     return absl_ports::InternalError(absl_ports::StrCat(
172         "Invalid array info: ", std::to_string(array_info->index), " + ",
173         std::to_string(array_info->length), " > ",
174         std::to_string(child_document_join_id_pair_array_->num_elements())));
175   }
176 
177   // Get the DocumentJoinIdPair array and return the child DocumentJoinIdPairs.
178   ICING_ASSIGN_OR_RETURN(
179       const DocumentJoinIdPair* ptr,
180       child_document_join_id_pair_array_->Get(array_info->index));
181   return std::vector<DocumentJoinIdPair>(ptr, ptr + array_info->used_length);
182 }
183 
MigrateParent(DocumentId old_document_id,DocumentId new_document_id)184 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::MigrateParent(
185     DocumentId old_document_id, DocumentId new_document_id) {
186   if (!IsDocumentIdValid(old_document_id) ||
187       !IsDocumentIdValid(new_document_id)) {
188     return absl_ports::InvalidArgumentError(
189         "Invalid parent document id to migrate");
190   }
191 
192   if (old_document_id >=
193       parent_document_id_to_child_array_info_->num_elements()) {
194     // It means the parent document doesn't have any children at this moment. No
195     // need to migrate.
196     return libtextclassifier3::Status::OK;
197   }
198 
199   ICING_ASSIGN_OR_RETURN(
200       FileBackedVector<ArrayInfo>::MutableView mutable_old_array_info,
201       parent_document_id_to_child_array_info_->GetMutable(old_document_id));
202   if (!mutable_old_array_info.Get().IsValid()) {
203     // It means the parent document doesn't have any children at this moment. No
204     // need to migrate.
205     return libtextclassifier3::Status::OK;
206   }
207 
208   ICING_RETURN_IF_ERROR(
209       ExtendParentDocumentIdToChildArrayInfoIfNecessary(new_document_id));
210   ICING_RETURN_IF_ERROR(parent_document_id_to_child_array_info_->Set(
211       new_document_id, mutable_old_array_info.Get()));
212   mutable_old_array_info.Get() = kInvalidArrayInfo;
213 
214   return libtextclassifier3::Status::OK;
215 }
216 
Optimize(const std::vector<DocumentId> & document_id_old_to_new,const std::vector<NamespaceId> & namespace_id_old_to_new,DocumentId new_last_added_document_id)217 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Optimize(
218     const std::vector<DocumentId>& document_id_old_to_new,
219     const std::vector<NamespaceId>& namespace_id_old_to_new,
220     DocumentId new_last_added_document_id) {
221   std::string temp_working_path = working_path_ + "_temp";
222   ICING_RETURN_IF_ERROR(
223       QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path));
224 
225   DestructibleDirectory temp_working_path_ddir(&filesystem_,
226                                                std::move(temp_working_path));
227   if (!temp_working_path_ddir.is_valid()) {
228     return absl_ports::InternalError(
229         "Unable to create temp directory to build new qualified id join index "
230         "(v3)");
231   }
232 
233   {
234     // Transfer all data from the current to new qualified id join index. Also
235     // PersistToDisk and destruct the instance after finishing, so we can safely
236     // swap directories later.
237     ICING_ASSIGN_OR_RETURN(
238         std::unique_ptr<QualifiedIdJoinIndexImplV3> new_index,
239         Create(filesystem_, temp_working_path_ddir.dir(), feature_flags_));
240     ICING_RETURN_IF_ERROR(
241         TransferIndex(document_id_old_to_new, new_index.get()));
242     new_index->set_last_added_document_id(new_last_added_document_id);
243     new_index->SetDirty();
244 
245     ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
246   }
247 
248   // Destruct current index's storage instances to safely swap directories.
249   child_document_join_id_pair_array_.reset();
250   parent_document_id_to_child_array_info_.reset();
251   metadata_mmapped_file_.reset();
252   if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
253                              working_path_.c_str())) {
254     return absl_ports::InternalError(
255         "Unable to apply new qualified id join index (v3) due to failed swap");
256   }
257 
258   // Reinitialize qualified id join index.
259   ICING_ASSIGN_OR_RETURN(
260       MemoryMappedFile metadata_mmapped_file,
261       MemoryMappedFile::Create(filesystem_, MakeMetadataFilePath(working_path_),
262                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
263                                /*max_file_size=*/kMetadataFileSize,
264                                /*pre_mapping_file_offset=*/0,
265                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
266   metadata_mmapped_file_ =
267       std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
268 
269   ICING_ASSIGN_OR_RETURN(
270       parent_document_id_to_child_array_info_,
271       FileBackedVector<ArrayInfo>::Create(
272           filesystem_,
273           MakeParentDocumentIdToChildArrayInfoFilePath(working_path_),
274           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
275           FileBackedVector<ArrayInfo>::kMaxFileSize,
276           /*pre_mapping_mmap_size=*/0));
277 
278   ICING_ASSIGN_OR_RETURN(
279       child_document_join_id_pair_array_,
280       FileBackedVector<DocumentJoinIdPair>::Create(
281           filesystem_, MakeChildDocumentJoinIdPairArrayFilePath(working_path_),
282           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
283           FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
284           /*pre_mapping_mmap_size=*/0));
285 
286   return libtextclassifier3::Status::OK;
287 }
288 
Clear()289 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::Clear() {
290   SetDirty();
291 
292   parent_document_id_to_child_array_info_.reset();
293   // Discard and reinitialize parent_document_id_to_child_array_info.
294   std::string parent_document_id_to_child_array_info_file_path =
295       MakeParentDocumentIdToChildArrayInfoFilePath(working_path_);
296   if (!filesystem_.DeleteFile(
297           parent_document_id_to_child_array_info_file_path.c_str())) {
298     return absl_ports::InternalError(absl_ports::StrCat(
299         "Failed to clear parent document id to child array info: ",
300         parent_document_id_to_child_array_info_file_path));
301   }
302   ICING_ASSIGN_OR_RETURN(
303       parent_document_id_to_child_array_info_,
304       FileBackedVector<ArrayInfo>::Create(
305           filesystem_, parent_document_id_to_child_array_info_file_path,
306           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
307           FileBackedVector<ArrayInfo>::kMaxFileSize,
308           /*pre_mapping_mmap_size=*/0));
309 
310   child_document_join_id_pair_array_.reset();
311   // Discard and reinitialize child_document_join_id_pair_array.
312   std::string child_document_join_id_pair_array_file_path =
313       MakeChildDocumentJoinIdPairArrayFilePath(working_path_);
314   if (!filesystem_.DeleteFile(
315           child_document_join_id_pair_array_file_path.c_str())) {
316     return absl_ports::InternalError(absl_ports::StrCat(
317         "Failed to clear child document join id pair array: ",
318         child_document_join_id_pair_array_file_path));
319   }
320   ICING_ASSIGN_OR_RETURN(
321       child_document_join_id_pair_array_,
322       FileBackedVector<DocumentJoinIdPair>::Create(
323           filesystem_, child_document_join_id_pair_array_file_path,
324           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
325           FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
326           /*pre_mapping_mmap_size=*/0));
327 
328   info().num_data = 0;
329   info().last_added_document_id = kInvalidDocumentId;
330   return libtextclassifier3::Status::OK;
331 }
332 
333 /* static */ libtextclassifier3::StatusOr<
334     std::unique_ptr<QualifiedIdJoinIndexImplV3>>
InitializeNewFiles(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)335 QualifiedIdJoinIndexImplV3::InitializeNewFiles(
336     const Filesystem& filesystem, std::string working_path,
337     const FeatureFlags& feature_flags) {
338   // Create working directory.
339   if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
340     return absl_ports::InternalError(
341         absl_ports::StrCat("Failed to create directory: ", working_path));
342   }
343 
344   // Initialize metadata file.
345   // - Create MemoryMappedFile with pre-mapping and call GrowAndRemapIfNecessary
346   //   to grow the underlying file.
347   // - Initialize metadata content by writing 0 bytes to the memory region
348   //   directly.
349   ICING_ASSIGN_OR_RETURN(
350       MemoryMappedFile metadata_mmapped_file,
351       MemoryMappedFile::Create(filesystem, MakeMetadataFilePath(working_path),
352                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
353                                /*max_file_size=*/kMetadataFileSize,
354                                /*pre_mapping_file_offset=*/0,
355                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
356   ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
357       /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
358   memset(metadata_mmapped_file.mutable_region(), 0, kMetadataFileSize);
359 
360   // Initialize parent_document_id_to_child_array_info.
361   ICING_ASSIGN_OR_RETURN(
362       std::unique_ptr<FileBackedVector<ArrayInfo>>
363           parent_document_id_to_child_array_info,
364       FileBackedVector<ArrayInfo>::Create(
365           filesystem,
366           MakeParentDocumentIdToChildArrayInfoFilePath(working_path),
367           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
368           FileBackedVector<ArrayInfo>::kMaxFileSize,
369           /*pre_mapping_mmap_size=*/0));
370 
371   // Initialize child_document_join_id_pair_array.
372   ICING_ASSIGN_OR_RETURN(
373       std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
374           child_document_join_id_pair_array,
375       FileBackedVector<DocumentJoinIdPair>::Create(
376           filesystem, MakeChildDocumentJoinIdPairArrayFilePath(working_path),
377           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
378           FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
379           /*pre_mapping_mmap_size=*/0));
380 
381   // Create instance.
382   auto new_join_index = std::unique_ptr<QualifiedIdJoinIndexImplV3>(
383       new QualifiedIdJoinIndexImplV3(
384           filesystem, std::move(working_path),
385           std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
386           std::move(parent_document_id_to_child_array_info),
387           std::move(child_document_join_id_pair_array), feature_flags));
388   // Initialize info content.
389   new_join_index->info().magic = Info::kMagic;
390   new_join_index->info().num_data = 0;
391   new_join_index->info().last_added_document_id = kInvalidDocumentId;
392 
393   // Initialize new PersistentStorage. The initial checksums will be computed
394   // and set via InitializeNewStorage.
395   ICING_RETURN_IF_ERROR(new_join_index->InitializeNewStorage());
396 
397   return new_join_index;
398 }
399 
400 /* static */ libtextclassifier3::StatusOr<
401     std::unique_ptr<QualifiedIdJoinIndexImplV3>>
InitializeExistingFiles(const Filesystem & filesystem,std::string working_path,const FeatureFlags & feature_flags)402 QualifiedIdJoinIndexImplV3::InitializeExistingFiles(
403     const Filesystem& filesystem, std::string working_path,
404     const FeatureFlags& feature_flags) {
405   // Initialize metadata file.
406   ICING_ASSIGN_OR_RETURN(
407       MemoryMappedFile metadata_mmapped_file,
408       MemoryMappedFile::Create(filesystem, MakeMetadataFilePath(working_path),
409                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
410                                /*max_file_size=*/kMetadataFileSize,
411                                /*pre_mapping_file_offset=*/0,
412                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
413   if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
414     return absl_ports::FailedPreconditionError("Incorrect metadata file size");
415   }
416 
417   // Initialize parent_document_id_to_child_array_info. Set mmap pre-mapping
418   // size to 0, but MemoryMappedFile will still mmap to the file size.
419   ICING_ASSIGN_OR_RETURN(
420       std::unique_ptr<FileBackedVector<ArrayInfo>>
421           parent_document_id_to_child_array_info,
422       FileBackedVector<ArrayInfo>::Create(
423           filesystem,
424           MakeParentDocumentIdToChildArrayInfoFilePath(working_path),
425           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
426           FileBackedVector<ArrayInfo>::kMaxFileSize,
427           /*pre_mapping_mmap_size=*/0));
428 
429   // Initialize child_document_join_id_pair_array. Set mmap pre-mapping size to
430   // 0, but MemoryMappedFile will still mmap to the file size.
431   ICING_ASSIGN_OR_RETURN(
432       std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
433           child_document_join_id_pair_array,
434       FileBackedVector<DocumentJoinIdPair>::Create(
435           filesystem, MakeChildDocumentJoinIdPairArrayFilePath(working_path),
436           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
437           FileBackedVector<DocumentJoinIdPair>::kMaxFileSize,
438           /*pre_mapping_mmap_size=*/0));
439 
440   // Create instance.
441   auto join_index = std::unique_ptr<QualifiedIdJoinIndexImplV3>(
442       new QualifiedIdJoinIndexImplV3(
443           filesystem, std::move(working_path),
444           std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
445           std::move(parent_document_id_to_child_array_info),
446           std::move(child_document_join_id_pair_array), feature_flags));
447 
448   // Initialize existing PersistentStorage. Checksums will be validated.
449   ICING_RETURN_IF_ERROR(join_index->InitializeExistingStorage());
450 
451   // Validate magic.
452   if (join_index->info().magic != Info::kMagic) {
453     return absl_ports::FailedPreconditionError("Incorrect magic value");
454   }
455 
456   return join_index;
457 }
458 
459 libtextclassifier3::Status
AppendChildDocumentJoinIdPairsForParent(DocumentId parent_document_id,std::vector<DocumentJoinIdPair> && child_document_join_id_pairs)460 QualifiedIdJoinIndexImplV3::AppendChildDocumentJoinIdPairsForParent(
461     DocumentId parent_document_id,
462     std::vector<DocumentJoinIdPair>&& child_document_join_id_pairs) {
463   if (child_document_join_id_pairs.empty()) {
464     return libtextclassifier3::Status::OK;
465   }
466 
467   // Step 1: extend parent_document_id_to_child_array_info_ if necessary.
468   ICING_RETURN_IF_ERROR(
469       ExtendParentDocumentIdToChildArrayInfoIfNecessary(parent_document_id));
470 
471   // Step 2: get the parent's child DocumentJoinIdPair mutable array (extend if
472   //         necessary), append new child_document_join_id_pairs to the array,
473   //         and assign the new ArrayInfo to
474   //         parent_document_id_to_child_array_info_.
475   ICING_ASSIGN_OR_RETURN(
476       const ArrayInfo* array_info,
477       parent_document_id_to_child_array_info_->Get(parent_document_id));
478   ICING_ASSIGN_OR_RETURN(
479       GetMutableAndExtendResult result,
480       GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(
481           *array_info, /*num_to_add=*/child_document_join_id_pairs.size()));
482   // - [0, result.array_info.used_length) contain valid elements.
483   // - We will write new elements starting from index
484   //   result.array_info.used_length, and update the used_length.
485   result.mutable_arr.SetArray(/*idx=*/result.array_info.used_length,
486                               /*arr=*/child_document_join_id_pairs.data(),
487                               /*len=*/child_document_join_id_pairs.size());
488   result.array_info.used_length += child_document_join_id_pairs.size();
489 
490   // Set ArrayInfo back to parent_document_id_to_child_array_info_.
491   ICING_RETURN_IF_ERROR(parent_document_id_to_child_array_info_->Set(
492       parent_document_id, result.array_info));
493 
494   // Update header.
495   info().num_data += child_document_join_id_pairs.size();
496 
497   return libtextclassifier3::Status::OK;
498 }
499 
500 libtextclassifier3::Status
ExtendParentDocumentIdToChildArrayInfoIfNecessary(DocumentId parent_document_id)501 QualifiedIdJoinIndexImplV3::ExtendParentDocumentIdToChildArrayInfoIfNecessary(
502     DocumentId parent_document_id) {
503   if (parent_document_id >=
504       parent_document_id_to_child_array_info_->num_elements()) {
505     int32_t num_to_extend =
506         (parent_document_id + 1) -
507         parent_document_id_to_child_array_info_->num_elements();
508     ICING_ASSIGN_OR_RETURN(
509         FileBackedVector<ArrayInfo>::MutableArrayView mutable_arr,
510         parent_document_id_to_child_array_info_->Allocate(num_to_extend));
511     mutable_arr.Fill(/*idx=*/0, /*len=*/num_to_extend, kInvalidArrayInfo);
512   }
513   return libtextclassifier3::Status::OK;
514 }
515 
516 libtextclassifier3::StatusOr<
517     QualifiedIdJoinIndexImplV3::GetMutableAndExtendResult>
518 QualifiedIdJoinIndexImplV3::
GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(const ArrayInfo & array_info,int32_t num_to_add)519     GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(
520         const ArrayInfo& array_info, int32_t num_to_add) {
521   if (num_to_add > kMaxNumChildrenPerParent) {
522     return absl_ports::ResourceExhaustedError(
523         absl_ports::StrCat("Too many children to add for a single parent: ",
524                            std::to_string(num_to_add)));
525   }
526 
527   if (!array_info.IsValid()) {
528     // If the array info is invalid, then it means the array is not allocated
529     // yet. Allocate a new array and fill it with invalid values.
530     ArrayInfo new_array_info(
531         /*index_in=*/child_document_join_id_pair_array_->num_elements(),
532         /*length_in=*/math_util::NextPowerOf2(num_to_add),
533         /*used_length_in=*/0);
534     ICING_ASSIGN_OR_RETURN(
535         FileBackedVector<DocumentJoinIdPair>::MutableArrayView new_mutable_arr,
536         child_document_join_id_pair_array_->Allocate(new_array_info.length));
537     new_mutable_arr.Fill(/*idx=*/0, /*len=*/new_array_info.length,
538                          kInvalidDocumentJoinIdPair);
539 
540     return GetMutableAndExtendResult{.array_info = new_array_info,
541                                      .mutable_arr = std::move(new_mutable_arr)};
542   }
543 
544   // The original array is valid. Get it and check if it needs to be extended.
545   ICING_ASSIGN_OR_RETURN(FileBackedVector<DocumentJoinIdPair>::MutableArrayView
546                              original_mutable_arr,
547                          child_document_join_id_pair_array_->GetMutable(
548                              array_info.index, array_info.length));
549   // - [0, array_info.used_length) contain valid elements.
550   // - [array_info.used_length, array_info.length) are invalid elements and
551   //   available for new elements.
552   if (array_info.length - array_info.used_length >= num_to_add) {
553     // Available space is enough to fit num_to_add. Return the original array.
554     return GetMutableAndExtendResult{
555         .array_info = array_info,
556         .mutable_arr = std::move(original_mutable_arr)};
557   }
558 
559   // Need to extend. The new array length should be able to fit all existing
560   // elements and the newly added elements.
561   //
562   // Check the new # of elements should not exceed the max limit. Use
563   // substraction to avoid overflow.
564   if (num_to_add > kMaxNumChildrenPerParent - array_info.used_length) {
565     return absl_ports::ResourceExhaustedError(absl_ports::StrCat(
566         "Too many children to add for an existing parent: ",
567         std::to_string(num_to_add),
568         "; existing: ", std::to_string(array_info.used_length)));
569   }
570 
571   if (array_info.index + array_info.length ==
572       child_document_join_id_pair_array_->num_elements()) {
573     // The original array is at the end of the FBV. We can extend and reshape it
574     // directly without copying elements.
575     //
576     // (Note: # means data out of the array_info we're dealing with.)
577     // Original FBV and array_info layout:
578     // +-------------------------+---------------.------+
579     // |#########################|     used      |unused|
580     // +-------------------------+---------------.------+
581     //                           ^                      ^
582     //                         index                 FBV_END
583     //                           |<-used_length->|
584     //                           |<-      length      ->|
585     //
586     // New FBV and new_array_info layout after extension:
587     // +-------------------------+---------------.------.------------------+
588     // |#########################|     used      |unused|  extended slice  |
589     // +-------------------------+---------------.------.------------------+
590     //                           ^                                         ^
591     //                         index                                    FBV_END
592     //                           |<-used_length->|
593     //                           |<-              new length             ->|
594     ArrayInfo new_array_info = array_info;
595     new_array_info.length =
596         math_util::NextPowerOf2(array_info.used_length + num_to_add);
597 
598     // Extend the original array by allocating a new slice.
599     ICING_ASSIGN_OR_RETURN(
600         FileBackedVector<DocumentJoinIdPair>::MutableArrayView extended_slice,
601         child_document_join_id_pair_array_->Allocate(new_array_info.length -
602                                                      array_info.length));
603     // Fill the extended slice with invalid values.
604     extended_slice.Fill(/*idx=*/0,
605                         /*len=*/new_array_info.length - array_info.length,
606                         kInvalidDocumentJoinIdPair);
607 
608     // Construct the mutable array view for new_array_info and return.
609     ICING_ASSIGN_OR_RETURN(
610         FileBackedVector<DocumentJoinIdPair>::MutableArrayView mutable_arr,
611         child_document_join_id_pair_array_->GetMutable(new_array_info.index,
612                                                        new_array_info.length));
613 
614     return GetMutableAndExtendResult{.array_info = std::move(new_array_info),
615                                      .mutable_arr = std::move(mutable_arr)};
616   }
617 
618   // The original array is not at the end of the file. We allocate a new array
619   // starting at the end of the FBV, copy all existing elements to the new
620   // array, and fill the remaining with invalid values.
621   //
622   // Original FBV and array_info layout:
623   // +---+---------------.------+------+
624   // |###|     used      |unused|######|
625   // +---+---------------.------+------+
626   //     ^                             ^
627   //   index                        FBV_END
628   //     |<-used_length->|
629   //     |<-      length      ->|
630   //
631   // New FBV and new_array_info layout after extension and moving:
632   // +---+----------------------+------+---------------.-----------------------+
633   // |###|      <INVALID>       |######|     used      |                       |
634   // +---+----------------------+------+---------------.-----------------------+
635   //                                   ^
636   //                               new_index
637   //                                   |<-used_length->|
638   //                                   |<-            new_length             ->|
639   ArrayInfo new_array_info(
640       /*index_in=*/child_document_join_id_pair_array_->num_elements(),
641       /*length_in=*/
642       math_util::NextPowerOf2(array_info.used_length + num_to_add),
643       /*used_length_in=*/array_info.used_length);
644   ICING_ASSIGN_OR_RETURN(
645       FileBackedVector<DocumentJoinIdPair>::MutableArrayView new_mutable_arr,
646       child_document_join_id_pair_array_->Allocate(new_array_info.length));
647 
648   // It is possible that Allocate() causes FBV to grow and remap. In this case,
649   // original_mutable_arr will point to an invalid memory region, so we need to
650   // refresh it.
651   //
652   // Note: original_mutable_arr has length = array_info.length and only contains
653   //   valid elements with length = array_info.used_length. We could've
654   //   constructed original_mutable_arr with length = array_info.used_length
655   //   here, but let's make it consistent with all other cases (i.e.
656   //   constructing the array view object for the entire extensible array).
657   ICING_ASSIGN_OR_RETURN(original_mutable_arr,
658                          child_document_join_id_pair_array_->GetMutable(
659                              array_info.index, array_info.length));
660 
661   // Move all existing elements to the new array.
662   new_mutable_arr.SetArray(/*idx=*/0, original_mutable_arr.data(),
663                            array_info.used_length);
664   // Fill the remaining with invalid values.
665   new_mutable_arr.Fill(/*idx=*/array_info.used_length,
666                        /*len=*/new_array_info.length - array_info.used_length,
667                        kInvalidDocumentJoinIdPair);
668   // Invalidate the original array.
669   original_mutable_arr.Fill(
670       /*idx=*/0, /*len=*/array_info.length, kInvalidDocumentJoinIdPair);
671   return GetMutableAndExtendResult{.array_info = new_array_info,
672                                    .mutable_arr = std::move(new_mutable_arr)};
673 }
674 
TransferIndex(const std::vector<DocumentId> & document_id_old_to_new,QualifiedIdJoinIndexImplV3 * new_index) const675 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::TransferIndex(
676     const std::vector<DocumentId>& document_id_old_to_new,
677     QualifiedIdJoinIndexImplV3* new_index) const {
678   for (DocumentId old_parent_doc_id = 0;
679        old_parent_doc_id <
680        parent_document_id_to_child_array_info_->num_elements();
681        ++old_parent_doc_id) {
682     if (old_parent_doc_id >= document_id_old_to_new.size() ||
683         document_id_old_to_new[old_parent_doc_id] == kInvalidDocumentId) {
684       // Skip if the old parent document id is invalid after optimization.
685       continue;
686     }
687 
688     ICING_ASSIGN_OR_RETURN(
689         const ArrayInfo* array_info,
690         parent_document_id_to_child_array_info_->Get(old_parent_doc_id));
691     if (!array_info->IsValid()) {
692       continue;
693     }
694     ICING_ASSIGN_OR_RETURN(
695         const DocumentJoinIdPair* ptr,
696         child_document_join_id_pair_array_->Get(array_info->index));
697 
698     // Get all child DocumentJoinIdPairs and assign new child document ids.
699     std::vector<DocumentJoinIdPair> new_child_doc_join_id_pairs;
700     new_child_doc_join_id_pairs.reserve(array_info->length);
701     for (int i = 0; i < array_info->used_length; ++i) {
702       DocumentId old_child_doc_id = ptr[i].document_id();
703       DocumentId new_child_doc_id =
704           old_child_doc_id >= 0 &&
705                   old_child_doc_id < document_id_old_to_new.size()
706               ? document_id_old_to_new[old_child_doc_id]
707               : kInvalidDocumentId;
708       if (new_child_doc_id == kInvalidDocumentId) {
709         continue;
710       }
711 
712       new_child_doc_join_id_pairs.push_back(
713           DocumentJoinIdPair(new_child_doc_id, ptr[i].joinable_property_id()));
714     }
715 
716     ICING_RETURN_IF_ERROR(new_index->AppendChildDocumentJoinIdPairsForParent(
717         document_id_old_to_new[old_parent_doc_id],
718         std::move(new_child_doc_join_id_pairs)));
719   }
720   return libtextclassifier3::Status::OK;
721 }
722 
PersistMetadataToDisk()723 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::PersistMetadataToDisk() {
724   // We can skip persisting metadata to disk only if both info and storage are
725   // clean.
726   if (is_initialized_ && !is_info_dirty() && !is_storage_dirty()) {
727     return libtextclassifier3::Status::OK;
728   }
729 
730   // Changes should have been applied to the underlying file when using
731   // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
732   // extra safety step to ensure they are written out.
733   ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
734   is_info_dirty_ = false;
735   return libtextclassifier3::Status::OK;
736 }
737 
PersistStoragesToDisk()738 libtextclassifier3::Status QualifiedIdJoinIndexImplV3::PersistStoragesToDisk() {
739   if (is_initialized_ && !is_storage_dirty()) {
740     return libtextclassifier3::Status::OK;
741   }
742 
743   ICING_RETURN_IF_ERROR(
744       parent_document_id_to_child_array_info_->PersistToDisk());
745   ICING_RETURN_IF_ERROR(child_document_join_id_pair_array_->PersistToDisk());
746 
747   is_storage_dirty_ = false;
748   return libtextclassifier3::Status::OK;
749 }
750 
751 libtextclassifier3::StatusOr<Crc32>
UpdateStoragesChecksum()752 QualifiedIdJoinIndexImplV3::UpdateStoragesChecksum() {
753   if (is_initialized_ && !is_storage_dirty()) {
754     return Crc32(crcs().component_crcs.storages_crc);
755   }
756 
757   // Compute crcs
758   ICING_ASSIGN_OR_RETURN(
759       Crc32 parent_document_id_to_child_array_info_crc,
760       parent_document_id_to_child_array_info_->UpdateChecksum());
761   ICING_ASSIGN_OR_RETURN(Crc32 child_document_join_id_pair_array_crc,
762                          child_document_join_id_pair_array_->UpdateChecksum());
763 
764   return Crc32(parent_document_id_to_child_array_info_crc.Get() ^
765                child_document_join_id_pair_array_crc.Get());
766 }
767 
768 libtextclassifier3::StatusOr<Crc32>
GetInfoChecksum() const769 QualifiedIdJoinIndexImplV3::GetInfoChecksum() const {
770   if (is_initialized_ && !is_info_dirty()) {
771     return Crc32(crcs().component_crcs.info_crc);
772   }
773 
774   return info().GetChecksum();
775 }
776 
777 libtextclassifier3::StatusOr<Crc32>
GetStoragesChecksum() const778 QualifiedIdJoinIndexImplV3::GetStoragesChecksum() const {
779   if (is_initialized_ && !is_storage_dirty()) {
780     return Crc32(crcs().component_crcs.storages_crc);
781   }
782 
783   // Get checksums for all components.
784   Crc32 parent_document_id_to_child_array_info_crc =
785       parent_document_id_to_child_array_info_->GetChecksum();
786   Crc32 child_document_join_id_pair_array_crc =
787       child_document_join_id_pair_array_->GetChecksum();
788 
789   return Crc32(parent_document_id_to_child_array_info_crc.Get() ^
790                child_document_join_id_pair_array_crc.Get());
791 }
792 
793 }  // namespace lib
794 }  // namespace icing
795