xref: /aosp_15_r20/external/icing/icing/join/qualified-id-join-index-impl-v3.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_
16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/feature-flags.h"
29 #include "icing/file/file-backed-vector.h"
30 #include "icing/file/filesystem.h"
31 #include "icing/file/memory-mapped-file.h"
32 #include "icing/file/persistent-storage.h"
33 #include "icing/join/document-join-id-pair.h"
34 #include "icing/join/qualified-id-join-index.h"
35 #include "icing/schema/joinable-property.h"
36 #include "icing/store/document-filter-data.h"
37 #include "icing/store/document-id.h"
38 #include "icing/store/namespace-id-fingerprint.h"
39 #include "icing/store/namespace-id.h"
40 #include "icing/util/crc32.h"
41 
42 namespace icing {
43 namespace lib {
44 
45 // QualifiedIdJoinIndexImplV3: a class to maintain join data. It uses 2
46 // FileBackedVectors to maintain the mapping between parent document id and a
47 // list of its joinable child infos (document id, joinable property id).
48 class QualifiedIdJoinIndexImplV3 : public QualifiedIdJoinIndex {
49  public:
50   struct Info {
51     static constexpr int32_t kMagic = 0x1529a926;
52 
53     int32_t magic;
54     int32_t num_data;
55     DocumentId last_added_document_id;
56 
57     // Padding exists just to reserve space for additional values and make the
58     // size of the metadata file 1024 bytes.
59     static constexpr int kPaddingSize = 1000;
60     uint8_t padding[kPaddingSize];
61 
GetChecksumInfo62     Crc32 GetChecksum() const {
63       return Crc32(
64           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
65     }
66   } __attribute__((packed));
67   static_assert(sizeof(Info) == 1012, "");
68 
69   // Metadata file layout: <Crcs><Info<data><padding>>
70   static constexpr int32_t kCrcsMetadataFileOffset = 0;
71   static constexpr int32_t kInfoMetadataFileOffset =
72       static_cast<int32_t>(sizeof(Crcs));
73   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
74   static_assert(kMetadataFileSize == 1024, "");
75 
76   static constexpr WorkingPathType kWorkingPathType =
77       WorkingPathType::kDirectory;
78 
79   // An internal struct to store the start index and length of a
80   // DocumentJoinIdPair array in child_document_join_id_pair_array_.
81   struct ArrayInfo {
82     int32_t index;
83     int32_t length;
84     int32_t used_length;
85 
ArrayInfoArrayInfo86     constexpr ArrayInfo() : index(-1), length(-1), used_length(-1) {}
87 
ArrayInfoArrayInfo88     explicit ArrayInfo(int32_t index_in, int32_t length_in,
89                        int32_t used_length_in)
90         : index(index_in), length(length_in), used_length(used_length_in) {}
91 
92     bool operator==(const ArrayInfo& other) const {
93       return index == other.index && length == other.length &&
94              used_length == other.used_length;
95     }
96 
IsValidArrayInfo97     bool IsValid() const {
98       return index >= 0 && length > 0 && used_length >= 0 &&
99              used_length <= length;
100     }
101   } __attribute__((packed));
102   static_assert(sizeof(ArrayInfo) == 12, "Invalid ArrayInfo size");
103 
104   // Creates a QualifiedIdJoinIndexImplV3 instance to store parent document to a
105   // list of its joinable children for future joining search. If any of the
106   // underlying file is missing, then delete the whole working_path and
107   // (re)initialize with new ones. Otherwise initialize and create the instance
108   // by existing files.
109   //
110   // filesystem: Object to make system level calls
111   // working_path: Specifies the working path for PersistentStorage.
112   //               QualifiedIdJoinIndexImplV3 uses working path as working
113   //               directory and all related files will be stored under this
114   //               directory. It takes full ownership and of working_path_,
115   //               including creation/deletion. It is the caller's
116   //               responsibility to specify correct working path and avoid
117   //               mixing different persistent storages together under the same
118   //               path. Also the caller has the ownership for the parent
119   //               directory of working_path_, and it is responsible for parent
120   //               directory creation/deletion. See PersistentStorage for more
121   //               details about the concept of working_path.
122   // feature_flags: a reference to (global) icing search engine feature flags.
123   //
124   // Returns:
125   //   - FAILED_PRECONDITION_ERROR if the state of the join index is
126   //                               inconsistent (e.g. checksum or the magic
127   //                               doesn't match the stored ones, missing some
128   //                               files, etc).
129   //   - INTERNAL_ERROR on I/O errors
130   //   - Any MemoryMappedFile, FileBackedVector errors
131   static libtextclassifier3::StatusOr<
132       std::unique_ptr<QualifiedIdJoinIndexImplV3>>
133   Create(const Filesystem& filesystem, std::string working_path,
134          const FeatureFlags& feature_flags);
135 
136   // Delete copy and move constructor/assignment operator.
137   QualifiedIdJoinIndexImplV3(const QualifiedIdJoinIndexImplV3&) = delete;
138   QualifiedIdJoinIndexImplV3& operator=(const QualifiedIdJoinIndexImplV3&) =
139       delete;
140 
141   QualifiedIdJoinIndexImplV3(QualifiedIdJoinIndexImplV3&&) = delete;
142   QualifiedIdJoinIndexImplV3& operator=(QualifiedIdJoinIndexImplV3&&) = delete;
143 
144   ~QualifiedIdJoinIndexImplV3() override;
145 
146   // Puts new join data into the index: adds a new child document and its
147   // referenced parent documents into the join index.
148   //
149   // Returns:
150   //   - OK on success
151   //   - INVALID_ARGUMENT_ERROR if child_document_join_id_pair is invalid
152   //   - Any FileBackedVector errors
153   libtextclassifier3::Status Put(
154       const DocumentJoinIdPair& child_document_join_id_pair,
155       std::vector<DocumentId>&& parent_document_ids) override;
156 
157   // Gets the list of joinable children for the given parent document id.
158   //
159   // Returns:
160   //   - A list of children's DocumentJoinIdPair on success
161   //   - Any FileBackedVector errors
162   libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>> Get(
163       DocumentId parent_document_id) const override;
164 
165   // Migrates existing join data for a parent document from old_document_id to
166   // new_document_id.
167   //
168   // Note: when updating a document, we have to migrate the document's join data
169   // if it is used as a parent document. For its child join data, it will be
170   // tokenized and indexed separately, so no migration is needed.
171   //
172   // Returns:
173   //   - OK on success
174   //   - INVALID_ARGUMENT_ERROR if any document id is invalid
175   //   - Any FileBackedVector errors
176   libtextclassifier3::Status MigrateParent(DocumentId old_document_id,
177                                            DocumentId new_document_id) override;
178 
179   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Put(const DocumentJoinIdPair & document_join_id_pair,std::string_view ref_qualified_id_str)180   libtextclassifier3::Status Put(
181       const DocumentJoinIdPair& document_join_id_pair,
182       std::string_view ref_qualified_id_str) override {
183     return absl_ports::UnimplementedError("This API is not supported in V3");
184   }
185 
186   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Get(const DocumentJoinIdPair & document_join_id_pair)187   libtextclassifier3::StatusOr<std::string_view> Get(
188       const DocumentJoinIdPair& document_join_id_pair) const override {
189     return absl_ports::UnimplementedError("This API is not supported in V3");
190   }
191 
192   // v2 only API. Returns UNIMPLEMENTED_ERROR.
Put(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id,DocumentId document_id,std::vector<NamespaceIdFingerprint> && ref_namespace_id_uri_fingerprints)193   libtextclassifier3::Status Put(
194       SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id,
195       DocumentId document_id,
196       std::vector<NamespaceIdFingerprint>&& ref_namespace_id_uri_fingerprints)
197       override {
198     return absl_ports::UnimplementedError("This API is not supported in V3");
199   }
200 
201   // v2 only API. Returns UNIMPLEMENTED_ERROR.
202   libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
GetIterator(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id)203   GetIterator(SchemaTypeId schema_type_id,
204               JoinablePropertyId joinable_property_id) const override {
205     return absl_ports::UnimplementedError("This API is not supported in V3");
206   }
207 
208   libtextclassifier3::Status Optimize(
209       const std::vector<DocumentId>& document_id_old_to_new,
210       const std::vector<NamespaceId>& namespace_id_old_to_new,
211       DocumentId new_last_added_document_id) override;
212 
213   libtextclassifier3::Status Clear() override;
214 
version()215   QualifiedIdJoinIndex::Version version() const override {
216     return QualifiedIdJoinIndex::Version::kV3;
217   }
218 
size()219   int32_t size() const override { return info().num_data; }
220 
empty()221   bool empty() const override { return size() == 0; }
222 
last_added_document_id()223   DocumentId last_added_document_id() const override {
224     return info().last_added_document_id;
225   }
226 
set_last_added_document_id(DocumentId document_id)227   void set_last_added_document_id(DocumentId document_id) override {
228     SetInfoDirty();
229 
230     Info& info_ref = info();
231     if (info_ref.last_added_document_id == kInvalidDocumentId ||
232         document_id > info_ref.last_added_document_id) {
233       info_ref.last_added_document_id = document_id;
234     }
235   }
236 
237  private:
238   // Set max # of child DocumentJoinIdPair per parent to 2^16.
239   // - It is unlikely that a single parent document will have extremely large
240   //   # of children.
241   // - Also prevent over extending the array.
242   static constexpr int32_t kMaxNumChildrenPerParent = INT32_C(1) << 16;
243   static_assert(kMaxNumChildrenPerParent <= (1u << 31),
244                 "Required for math_util::NextPowerOf2");
245 
QualifiedIdJoinIndexImplV3(const Filesystem & filesystem,std::string working_path,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,std::unique_ptr<FileBackedVector<ArrayInfo>> parent_document_id_to_child_array_info,std::unique_ptr<FileBackedVector<DocumentJoinIdPair>> child_document_join_id_pair_array,const FeatureFlags & feature_flags)246   explicit QualifiedIdJoinIndexImplV3(
247       const Filesystem& filesystem, std::string working_path,
248       std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
249       std::unique_ptr<FileBackedVector<ArrayInfo>>
250           parent_document_id_to_child_array_info,
251       std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
252           child_document_join_id_pair_array,
253       const FeatureFlags& feature_flags)
254       : QualifiedIdJoinIndex(filesystem, std::move(working_path)),
255         metadata_mmapped_file_(std::move(metadata_mmapped_file)),
256         parent_document_id_to_child_array_info_(
257             std::move(parent_document_id_to_child_array_info)),
258         child_document_join_id_pair_array_(
259             std::move(child_document_join_id_pair_array)),
260         feature_flags_(feature_flags),
261         is_info_dirty_(false),
262         is_storage_dirty_(false) {}
263 
264   static libtextclassifier3::StatusOr<
265       std::unique_ptr<QualifiedIdJoinIndexImplV3>>
266   InitializeNewFiles(const Filesystem& filesystem, std::string working_path,
267                      const FeatureFlags& feature_flags);
268 
269   static libtextclassifier3::StatusOr<
270       std::unique_ptr<QualifiedIdJoinIndexImplV3>>
271   InitializeExistingFiles(const Filesystem& filesystem,
272                           std::string working_path,
273                           const FeatureFlags& feature_flags);
274 
275   // Appends a list of new child DocumentJoinIdPair to the parent's
276   // DocumentJoinIdPair (extensible) array. If the array is invalid or doesn't
277   // have enough space for new elements, then extend it and set the new array
278   // info.
279   //
280   // Returns:
281   //   - OK on success
282   //   - RESOURCE_EXHAUSTED_ERROR if the new # of elements exceed
283   //     kMaxNumChildrenPerParent
284   //   - Any FileBackedVector errors
285   libtextclassifier3::Status AppendChildDocumentJoinIdPairsForParent(
286       DocumentId parent_document_id,
287       std::vector<DocumentJoinIdPair>&& child_document_join_id_pairs);
288 
289   // Extends the parent document id to child array info if necessary, according
290   // to the new parent document id.
291   //
292   // Returns:
293   //   - OK on success
294   //   - Any FileBackedVector errors
295   libtextclassifier3::Status ExtendParentDocumentIdToChildArrayInfoIfNecessary(
296       DocumentId parent_document_id);
297 
298   // Gets the DocumentJoinIdPair mutable array and extends it if necessary to
299   // fit num_to_add new elements in the future for the caller. If extended:
300   // - Allocate a new array with size rounded up to the next 2's power to fit
301   //   all existing elements and num_to_add new elements.
302   // - Old elements will be migrated to the new array.
303   // - The old array will be invalidated.
304   //
305   // Note: this function only prepares the array to fit "future" num_to_add
306   // elements for the caller. It potentially does the extension but without
307   // adding any new elements into the array, so the returned ArrayInfo's
308   // used_length will remain unchanged.
309   //
310   // Returns:
311   //   - GetOrExtendResult on success
312   //   - RESOURCE_EXHAUSTED_ERROR if the new # of elements exceed
313   //     kMaxNumChildrenPerParent
314   //   - Any FileBackedVector errors
315   struct GetMutableAndExtendResult {
316     // ArrayInfo of the DocumentJoinIdPair array.
317     // - If extended (allocated a new array), then the new array's ArrayInfo
318     //   will be returned.
319     // - Otherwise, the input (original) ArrayInfo will be returned.
320     ArrayInfo array_info;
321 
322     // MutableArrayView object of the DocumentJoinIdPair array corresponding to
323     // array_info. It will be either the new array after extension or the
324     // original array, depending on whether it is extended or not.
325     FileBackedVector<DocumentJoinIdPair>::MutableArrayView mutable_arr;
326   };
327   libtextclassifier3::StatusOr<GetMutableAndExtendResult>
328   GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary(
329       const ArrayInfo& array_info, int32_t num_to_add);
330 
331   // Transfers qualified id join index data from the current to new_index and
332   // convert to new document id according to document_id_old_to_new. It is a
333   // helper function for Optimize.
334   //
335   // Returns:
336   //   - OK on success
337   //   - INTERNAL_ERROR on I/O error
338   libtextclassifier3::Status TransferIndex(
339       const std::vector<DocumentId>& document_id_old_to_new,
340       QualifiedIdJoinIndexImplV3* new_index) const;
341 
342   libtextclassifier3::Status PersistMetadataToDisk() override;
343 
344   libtextclassifier3::Status PersistStoragesToDisk() override;
345 
WriteMetadata()346   libtextclassifier3::Status WriteMetadata() override {
347     // QualifiedIdJoinIndexImplV3::Header is mmapped. Therefore, writes occur
348     // when the metadata is modified. So just return OK.
349     return libtextclassifier3::Status::OK;
350   }
351 
352   libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override;
353 
354   libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override;
355 
356   libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override;
357 
crcs()358   Crcs& crcs() override {
359     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
360                                     kCrcsMetadataFileOffset);
361   }
362 
crcs()363   const Crcs& crcs() const override {
364     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
365                                           kCrcsMetadataFileOffset);
366   }
367 
info()368   Info& info() {
369     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
370                                     kInfoMetadataFileOffset);
371   }
372 
info()373   const Info& info() const {
374     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
375                                           kInfoMetadataFileOffset);
376   }
377 
SetInfoDirty()378   void SetInfoDirty() { is_info_dirty_ = true; }
379   // When storage is dirty, we have to set info dirty as well. So just expose
380   // SetDirty to set both.
SetDirty()381   void SetDirty() {
382     is_info_dirty_ = true;
383     is_storage_dirty_ = true;
384   }
385 
is_info_dirty()386   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()387   bool is_storage_dirty() const { return is_storage_dirty_; }
388 
389   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
390 
391   // Storage for mapping parent document id to the ArrayInfo, which points to an
392   // extensible array stored in child_document_join_id_pair_array_, and the
393   // extensible array contains the parent's joinable children information.
394   std::unique_ptr<FileBackedVector<ArrayInfo>>
395       parent_document_id_to_child_array_info_;
396 
397   // Storage for DocumentJoinIdPair.
398   // - It is a collection of multiple extensible arrays for parents.
399   // - Each extensible array contains a list of child DocumentJoinIdPair.
400   // - The extensible array information (starting index of the FileBackedVector,
401   //   length) is stored in parent_document_id_to_child_array_info_.
402   std::unique_ptr<FileBackedVector<DocumentJoinIdPair>>
403       child_document_join_id_pair_array_;
404 
405   const FeatureFlags& feature_flags_;  // Does not own.
406 
407   bool is_info_dirty_;
408   bool is_storage_dirty_;
409 };
410 
411 }  // namespace lib
412 }  // namespace icing
413 
414 #endif  // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_
415