xref: /aosp_15_r20/external/icing/icing/join/qualified-id-join-index-impl-v2.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/file/persistent-storage.h"
30 #include "icing/file/posting_list/flash-index-storage.h"
31 #include "icing/file/posting_list/posting-list-identifier.h"
32 #include "icing/join/document-id-to-join-info.h"
33 #include "icing/join/document-join-id-pair.h"
34 #include "icing/join/posting-list-join-data-accessor.h"
35 #include "icing/join/posting-list-join-data-serializer.h"
36 #include "icing/join/qualified-id-join-index.h"
37 #include "icing/schema/joinable-property.h"
38 #include "icing/store/document-filter-data.h"
39 #include "icing/store/document-id.h"
40 #include "icing/store/key-mapper.h"
41 #include "icing/store/namespace-id-fingerprint.h"
42 #include "icing/store/namespace-id.h"
43 #include "icing/util/crc32.h"
44 
45 namespace icing {
46 namespace lib {
47 
48 // QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to
49 // referenced NamespaceIdFingerprint). It stores join data in posting lists
50 // and bucketizes them by (schema_type_id, joinable_property_id).
51 class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex {
52  public:
53   using JoinDataType = DocumentIdToJoinInfo<NamespaceIdFingerprint>;
54 
55   class JoinDataIterator : public JoinDataIteratorBase {
56    public:
JoinDataIterator(std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)57     explicit JoinDataIterator(
58         std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)
59         : pl_accessor_(std::move(pl_accessor)),
60           should_retrieve_next_batch_(true) {}
61 
62     ~JoinDataIterator() override = default;
63 
64     // Advances to the next data.
65     //
66     // Returns:
67     //   - OK on success
68     //   - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant
69     //     data)
70     //   - Any other PostingListJoinDataAccessor errors
71     libtextclassifier3::Status Advance() override;
72 
GetCurrent()73     const JoinDataType& GetCurrent() const override { return *curr_; }
74 
75    private:
76     // Gets next batch of data from the posting list chain, caches in
77     // cached_batch_integer_index_data_, and sets curr_ to the begin of the
78     // cache.
79     libtextclassifier3::Status GetNextDataBatch();
80 
81     std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_;
82     std::vector<JoinDataType> cached_batch_join_data_;
83     std::vector<JoinDataType>::const_iterator curr_;
84     bool should_retrieve_next_batch_;
85   };
86 
87   struct Info {
88     static constexpr int32_t kMagic = 0x32e374a7;
89 
90     int32_t magic;
91     int32_t num_data;
92     DocumentId last_added_document_id;
93 
GetChecksumInfo94     Crc32 GetChecksum() const {
95       return Crc32(
96           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
97     }
98   } __attribute__((packed));
99   static_assert(sizeof(Info) == 12, "");
100 
101   // Metadata file layout: <Crcs><Info>
102   static constexpr int32_t kCrcsMetadataBufferOffset = 0;
103   static constexpr int32_t kInfoMetadataBufferOffset =
104       static_cast<int32_t>(sizeof(Crcs));
105   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
106   static_assert(kMetadataFileSize == 24, "");
107 
108   static constexpr WorkingPathType kWorkingPathType =
109       WorkingPathType::kDirectory;
110 
111   // Creates a QualifiedIdJoinIndexImplV2 instance to store join data
112   // (DocumentId to referenced NamespaceIdFingerprint) for future joining
113   // search. If any of the underlying file is missing, then delete the whole
114   // working_path and (re)initialize with new ones. Otherwise initialize and
115   // create the instance by existing files.
116   //
117   // filesystem: Object to make system level calls
118   // working_path: Specifies the working path for PersistentStorage.
119   //               QualifiedIdJoinIndexImplV2 uses working path as working
120   //               directory and all related files will be stored under this
121   //               directory. It takes full ownership and of working_path_,
122   //               including creation/deletion. It is the caller's
123   //               responsibility to specify correct working path and avoid
124   //               mixing different persistent storages together under the same
125   //               path. Also the caller has the ownership for the parent
126   //               directory of working_path_, and it is responsible for parent
127   //               directory creation/deletion. See PersistentStorage for more
128   //               details about the concept of working_path.
129   // pre_mapping_fbv: flag indicating whether memory map max possible file size
130   //                  for underlying FileBackedVector before growing the actual
131   //                  file size.
132   //
133   // Returns:
134   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
135   //                               checksum
136   //   - INTERNAL_ERROR on I/O errors
137   //   - Any KeyMapper errors
138   static libtextclassifier3::StatusOr<
139       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
140   Create(const Filesystem& filesystem, std::string working_path,
141          bool pre_mapping_fbv);
142 
143   // Delete copy and move constructor/assignment operator.
144   QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete;
145   QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) =
146       delete;
147 
148   QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete;
149   QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete;
150 
151   ~QualifiedIdJoinIndexImplV2() override;
152 
153   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Put(const DocumentJoinIdPair & document_join_id_pair,std::string_view ref_qualified_id_str)154   libtextclassifier3::Status Put(
155       const DocumentJoinIdPair& document_join_id_pair,
156       std::string_view ref_qualified_id_str) override {
157     return absl_ports::UnimplementedError("This API is not supported in V2");
158   }
159 
160   // v1 only API. Returns UNIMPLEMENTED_ERROR.
Get(const DocumentJoinIdPair & document_join_id_pair)161   libtextclassifier3::StatusOr<std::string_view> Get(
162       const DocumentJoinIdPair& document_join_id_pair) const override {
163     return absl_ports::UnimplementedError("This API is not supported in V2");
164   }
165 
166   // v3 only API. Returns UNIMPLEMENTED_ERROR.
Put(const DocumentJoinIdPair & child_document_join_id_pair,std::vector<DocumentId> && parent_document_ids)167   libtextclassifier3::Status Put(
168       const DocumentJoinIdPair& child_document_join_id_pair,
169       std::vector<DocumentId>&& parent_document_ids) override {
170     return absl_ports::UnimplementedError("This API is not supported in V2");
171   }
172 
173   // v3 only API. Returns UNIMPLEMENTED_ERROR.
Get(DocumentId parent_document_id)174   libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>> Get(
175       DocumentId parent_document_id) const override {
176     return absl_ports::UnimplementedError("This API is not supported in V2");
177   }
178 
179   libtextclassifier3::Status Put(
180       SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id,
181       DocumentId document_id,
182       std::vector<NamespaceIdFingerprint>&& ref_namespace_id_uri_fingerprints)
183       override;
184 
185   libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
186   GetIterator(SchemaTypeId schema_type_id,
187               JoinablePropertyId joinable_property_id) const override;
188 
189   // No-op since v2 stores parent information in (namespace_id,
190   // fingerprint(uri)) format and does not require parent migration.
MigrateParent(DocumentId old_document_id,DocumentId new_document_id)191   libtextclassifier3::Status MigrateParent(
192       DocumentId old_document_id, DocumentId new_document_id) override {
193     return libtextclassifier3::Status::OK;
194   }
195 
196   libtextclassifier3::Status Optimize(
197       const std::vector<DocumentId>& document_id_old_to_new,
198       const std::vector<NamespaceId>& namespace_id_old_to_new,
199       DocumentId new_last_added_document_id) override;
200 
201   libtextclassifier3::Status Clear() override;
202 
version()203   QualifiedIdJoinIndex::Version version() const override {
204     return QualifiedIdJoinIndex::Version::kV2;
205   }
206 
size()207   int32_t size() const override { return info().num_data; }
208 
empty()209   bool empty() const override { return size() == 0; }
210 
last_added_document_id()211   DocumentId last_added_document_id() const override {
212     return info().last_added_document_id;
213   }
214 
set_last_added_document_id(DocumentId document_id)215   void set_last_added_document_id(DocumentId document_id) override {
216     SetInfoDirty();
217 
218     Info& info_ref = info();
219     if (info_ref.last_added_document_id == kInvalidDocumentId ||
220         document_id > info_ref.last_added_document_id) {
221       info_ref.last_added_document_id = document_id;
222     }
223   }
224 
225  private:
QualifiedIdJoinIndexImplV2(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<uint8_t[]> metadata_buffer,std::unique_ptr<KeyMapper<PostingListIdentifier>> schema_joinable_id_to_posting_list_mapper,std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> posting_list_serializer,std::unique_ptr<FlashIndexStorage> flash_index_storage,bool pre_mapping_fbv)226   explicit QualifiedIdJoinIndexImplV2(
227       const Filesystem& filesystem, std::string&& working_path,
228       std::unique_ptr<uint8_t[]> metadata_buffer,
229       std::unique_ptr<KeyMapper<PostingListIdentifier>>
230           schema_joinable_id_to_posting_list_mapper,
231       std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
232           posting_list_serializer,
233       std::unique_ptr<FlashIndexStorage> flash_index_storage,
234       bool pre_mapping_fbv)
235       : QualifiedIdJoinIndex(filesystem, std::move(working_path)),
236         metadata_buffer_(std::move(metadata_buffer)),
237         schema_joinable_id_to_posting_list_mapper_(
238             std::move(schema_joinable_id_to_posting_list_mapper)),
239         posting_list_serializer_(std::move(posting_list_serializer)),
240         flash_index_storage_(std::move(flash_index_storage)),
241         pre_mapping_fbv_(pre_mapping_fbv),
242         is_info_dirty_(false),
243         is_storage_dirty_(false) {}
244 
245   static libtextclassifier3::StatusOr<
246       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
247   InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
248                      bool pre_mapping_fbv);
249 
250   static libtextclassifier3::StatusOr<
251       std::unique_ptr<QualifiedIdJoinIndexImplV2>>
252   InitializeExistingFiles(const Filesystem& filesystem,
253                           std::string&& working_path, bool pre_mapping_fbv);
254 
255   // Transfers qualified id join index data from the current to new_index and
256   // convert to new document id according to document_id_old_to_new and
257   // namespace_id_old_to_new. It is a helper function for Optimize.
258   //
259   // Returns:
260   //   - OK on success
261   //   - INTERNAL_ERROR on I/O error
262   libtextclassifier3::Status TransferIndex(
263       const std::vector<DocumentId>& document_id_old_to_new,
264       const std::vector<NamespaceId>& namespace_id_old_to_new,
265       QualifiedIdJoinIndexImplV2* new_index) const;
266 
267   libtextclassifier3::Status PersistMetadataToDisk() override;
268 
269   libtextclassifier3::Status PersistStoragesToDisk() override;
270 
271   libtextclassifier3::Status WriteMetadata() override;
272 
273   libtextclassifier3::Status InternalWriteMetadata(const ScopedFd& sfd);
274 
275   libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override;
276 
277   libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override;
278 
279   libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override;
280 
crcs()281   Crcs& crcs() override {
282     return *reinterpret_cast<Crcs*>(metadata_buffer_.get() +
283                                     kCrcsMetadataBufferOffset);
284   }
285 
crcs()286   const Crcs& crcs() const override {
287     return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() +
288                                           kCrcsMetadataBufferOffset);
289   }
290 
info()291   Info& info() {
292     return *reinterpret_cast<Info*>(metadata_buffer_.get() +
293                                     kInfoMetadataBufferOffset);
294   }
295 
info()296   const Info& info() const {
297     return *reinterpret_cast<const Info*>(metadata_buffer_.get() +
298                                           kInfoMetadataBufferOffset);
299   }
300 
SetInfoDirty()301   void SetInfoDirty() { is_info_dirty_ = true; }
302   // When storage is dirty, we have to set info dirty as well. So just expose
303   // SetDirty to set both.
SetDirty()304   void SetDirty() {
305     is_info_dirty_ = true;
306     is_storage_dirty_ = true;
307   }
308 
is_info_dirty()309   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()310   bool is_storage_dirty() const { return is_storage_dirty_; }
311 
312   // Metadata buffer
313   std::unique_ptr<uint8_t[]> metadata_buffer_;
314 
315   // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to
316   // PostingListIdentifier.
317   std::unique_ptr<KeyMapper<PostingListIdentifier>>
318       schema_joinable_id_to_posting_list_mapper_;
319 
320   // Posting list related members. Use posting list to store join data
321   // (document id to referenced NamespaceIdFingerprint).
322   std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
323       posting_list_serializer_;
324   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
325 
326   // TODO(b/268521214): add delete propagation storage
327 
328   // Flag indicating whether memory map max possible file size for underlying
329   // FileBackedVector before growing the actual file size.
330   bool pre_mapping_fbv_;
331 
332   bool is_info_dirty_;
333   bool is_storage_dirty_;
334 };
335 
336 }  // namespace lib
337 }  // namespace icing
338 
339 #endif  // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
340