1 // Copyright (C) 2024 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_ 16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/absl_ports/canonical_errors.h" 28 #include "icing/feature-flags.h" 29 #include "icing/file/file-backed-vector.h" 30 #include "icing/file/filesystem.h" 31 #include "icing/file/memory-mapped-file.h" 32 #include "icing/file/persistent-storage.h" 33 #include "icing/join/document-join-id-pair.h" 34 #include "icing/join/qualified-id-join-index.h" 35 #include "icing/schema/joinable-property.h" 36 #include "icing/store/document-filter-data.h" 37 #include "icing/store/document-id.h" 38 #include "icing/store/namespace-id-fingerprint.h" 39 #include "icing/store/namespace-id.h" 40 #include "icing/util/crc32.h" 41 42 namespace icing { 43 namespace lib { 44 45 // QualifiedIdJoinIndexImplV3: a class to maintain join data. It uses 2 46 // FileBackedVectors to maintain the mapping between parent document id and a 47 // list of its joinable child infos (document id, joinable property id). 48 class QualifiedIdJoinIndexImplV3 : public QualifiedIdJoinIndex { 49 public: 50 struct Info { 51 static constexpr int32_t kMagic = 0x1529a926; 52 53 int32_t magic; 54 int32_t num_data; 55 DocumentId last_added_document_id; 56 57 // Padding exists just to reserve space for additional values and make the 58 // size of the metadata file 1024 bytes. 59 static constexpr int kPaddingSize = 1000; 60 uint8_t padding[kPaddingSize]; 61 GetChecksumInfo62 Crc32 GetChecksum() const { 63 return Crc32( 64 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 65 } 66 } __attribute__((packed)); 67 static_assert(sizeof(Info) == 1012, ""); 68 69 // Metadata file layout: <Crcs><Info<data><padding>> 70 static constexpr int32_t kCrcsMetadataFileOffset = 0; 71 static constexpr int32_t kInfoMetadataFileOffset = 72 static_cast<int32_t>(sizeof(Crcs)); 73 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 74 static_assert(kMetadataFileSize == 1024, ""); 75 76 static constexpr WorkingPathType kWorkingPathType = 77 WorkingPathType::kDirectory; 78 79 // An internal struct to store the start index and length of a 80 // DocumentJoinIdPair array in child_document_join_id_pair_array_. 81 struct ArrayInfo { 82 int32_t index; 83 int32_t length; 84 int32_t used_length; 85 ArrayInfoArrayInfo86 constexpr ArrayInfo() : index(-1), length(-1), used_length(-1) {} 87 ArrayInfoArrayInfo88 explicit ArrayInfo(int32_t index_in, int32_t length_in, 89 int32_t used_length_in) 90 : index(index_in), length(length_in), used_length(used_length_in) {} 91 92 bool operator==(const ArrayInfo& other) const { 93 return index == other.index && length == other.length && 94 used_length == other.used_length; 95 } 96 IsValidArrayInfo97 bool IsValid() const { 98 return index >= 0 && length > 0 && used_length >= 0 && 99 used_length <= length; 100 } 101 } __attribute__((packed)); 102 static_assert(sizeof(ArrayInfo) == 12, "Invalid ArrayInfo size"); 103 104 // Creates a QualifiedIdJoinIndexImplV3 instance to store parent document to a 105 // list of its joinable children for future joining search. If any of the 106 // underlying file is missing, then delete the whole working_path and 107 // (re)initialize with new ones. Otherwise initialize and create the instance 108 // by existing files. 109 // 110 // filesystem: Object to make system level calls 111 // working_path: Specifies the working path for PersistentStorage. 112 // QualifiedIdJoinIndexImplV3 uses working path as working 113 // directory and all related files will be stored under this 114 // directory. It takes full ownership and of working_path_, 115 // including creation/deletion. It is the caller's 116 // responsibility to specify correct working path and avoid 117 // mixing different persistent storages together under the same 118 // path. Also the caller has the ownership for the parent 119 // directory of working_path_, and it is responsible for parent 120 // directory creation/deletion. See PersistentStorage for more 121 // details about the concept of working_path. 122 // feature_flags: a reference to (global) icing search engine feature flags. 123 // 124 // Returns: 125 // - FAILED_PRECONDITION_ERROR if the state of the join index is 126 // inconsistent (e.g. checksum or the magic 127 // doesn't match the stored ones, missing some 128 // files, etc). 129 // - INTERNAL_ERROR on I/O errors 130 // - Any MemoryMappedFile, FileBackedVector errors 131 static libtextclassifier3::StatusOr< 132 std::unique_ptr<QualifiedIdJoinIndexImplV3>> 133 Create(const Filesystem& filesystem, std::string working_path, 134 const FeatureFlags& feature_flags); 135 136 // Delete copy and move constructor/assignment operator. 137 QualifiedIdJoinIndexImplV3(const QualifiedIdJoinIndexImplV3&) = delete; 138 QualifiedIdJoinIndexImplV3& operator=(const QualifiedIdJoinIndexImplV3&) = 139 delete; 140 141 QualifiedIdJoinIndexImplV3(QualifiedIdJoinIndexImplV3&&) = delete; 142 QualifiedIdJoinIndexImplV3& operator=(QualifiedIdJoinIndexImplV3&&) = delete; 143 144 ~QualifiedIdJoinIndexImplV3() override; 145 146 // Puts new join data into the index: adds a new child document and its 147 // referenced parent documents into the join index. 148 // 149 // Returns: 150 // - OK on success 151 // - INVALID_ARGUMENT_ERROR if child_document_join_id_pair is invalid 152 // - Any FileBackedVector errors 153 libtextclassifier3::Status Put( 154 const DocumentJoinIdPair& child_document_join_id_pair, 155 std::vector<DocumentId>&& parent_document_ids) override; 156 157 // Gets the list of joinable children for the given parent document id. 158 // 159 // Returns: 160 // - A list of children's DocumentJoinIdPair on success 161 // - Any FileBackedVector errors 162 libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>> Get( 163 DocumentId parent_document_id) const override; 164 165 // Migrates existing join data for a parent document from old_document_id to 166 // new_document_id. 167 // 168 // Note: when updating a document, we have to migrate the document's join data 169 // if it is used as a parent document. For its child join data, it will be 170 // tokenized and indexed separately, so no migration is needed. 171 // 172 // Returns: 173 // - OK on success 174 // - INVALID_ARGUMENT_ERROR if any document id is invalid 175 // - Any FileBackedVector errors 176 libtextclassifier3::Status MigrateParent(DocumentId old_document_id, 177 DocumentId new_document_id) override; 178 179 // v1 only API. Returns UNIMPLEMENTED_ERROR. Put(const DocumentJoinIdPair & document_join_id_pair,std::string_view ref_qualified_id_str)180 libtextclassifier3::Status Put( 181 const DocumentJoinIdPair& document_join_id_pair, 182 std::string_view ref_qualified_id_str) override { 183 return absl_ports::UnimplementedError("This API is not supported in V3"); 184 } 185 186 // v1 only API. Returns UNIMPLEMENTED_ERROR. Get(const DocumentJoinIdPair & document_join_id_pair)187 libtextclassifier3::StatusOr<std::string_view> Get( 188 const DocumentJoinIdPair& document_join_id_pair) const override { 189 return absl_ports::UnimplementedError("This API is not supported in V3"); 190 } 191 192 // v2 only API. Returns UNIMPLEMENTED_ERROR. Put(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id,DocumentId document_id,std::vector<NamespaceIdFingerprint> && ref_namespace_id_uri_fingerprints)193 libtextclassifier3::Status Put( 194 SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id, 195 DocumentId document_id, 196 std::vector<NamespaceIdFingerprint>&& ref_namespace_id_uri_fingerprints) 197 override { 198 return absl_ports::UnimplementedError("This API is not supported in V3"); 199 } 200 201 // v2 only API. Returns UNIMPLEMENTED_ERROR. 202 libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> GetIterator(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id)203 GetIterator(SchemaTypeId schema_type_id, 204 JoinablePropertyId joinable_property_id) const override { 205 return absl_ports::UnimplementedError("This API is not supported in V3"); 206 } 207 208 libtextclassifier3::Status Optimize( 209 const std::vector<DocumentId>& document_id_old_to_new, 210 const std::vector<NamespaceId>& namespace_id_old_to_new, 211 DocumentId new_last_added_document_id) override; 212 213 libtextclassifier3::Status Clear() override; 214 version()215 QualifiedIdJoinIndex::Version version() const override { 216 return QualifiedIdJoinIndex::Version::kV3; 217 } 218 size()219 int32_t size() const override { return info().num_data; } 220 empty()221 bool empty() const override { return size() == 0; } 222 last_added_document_id()223 DocumentId last_added_document_id() const override { 224 return info().last_added_document_id; 225 } 226 set_last_added_document_id(DocumentId document_id)227 void set_last_added_document_id(DocumentId document_id) override { 228 SetInfoDirty(); 229 230 Info& info_ref = info(); 231 if (info_ref.last_added_document_id == kInvalidDocumentId || 232 document_id > info_ref.last_added_document_id) { 233 info_ref.last_added_document_id = document_id; 234 } 235 } 236 237 private: 238 // Set max # of child DocumentJoinIdPair per parent to 2^16. 239 // - It is unlikely that a single parent document will have extremely large 240 // # of children. 241 // - Also prevent over extending the array. 242 static constexpr int32_t kMaxNumChildrenPerParent = INT32_C(1) << 16; 243 static_assert(kMaxNumChildrenPerParent <= (1u << 31), 244 "Required for math_util::NextPowerOf2"); 245 QualifiedIdJoinIndexImplV3(const Filesystem & filesystem,std::string working_path,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,std::unique_ptr<FileBackedVector<ArrayInfo>> parent_document_id_to_child_array_info,std::unique_ptr<FileBackedVector<DocumentJoinIdPair>> child_document_join_id_pair_array,const FeatureFlags & feature_flags)246 explicit QualifiedIdJoinIndexImplV3( 247 const Filesystem& filesystem, std::string working_path, 248 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, 249 std::unique_ptr<FileBackedVector<ArrayInfo>> 250 parent_document_id_to_child_array_info, 251 std::unique_ptr<FileBackedVector<DocumentJoinIdPair>> 252 child_document_join_id_pair_array, 253 const FeatureFlags& feature_flags) 254 : QualifiedIdJoinIndex(filesystem, std::move(working_path)), 255 metadata_mmapped_file_(std::move(metadata_mmapped_file)), 256 parent_document_id_to_child_array_info_( 257 std::move(parent_document_id_to_child_array_info)), 258 child_document_join_id_pair_array_( 259 std::move(child_document_join_id_pair_array)), 260 feature_flags_(feature_flags), 261 is_info_dirty_(false), 262 is_storage_dirty_(false) {} 263 264 static libtextclassifier3::StatusOr< 265 std::unique_ptr<QualifiedIdJoinIndexImplV3>> 266 InitializeNewFiles(const Filesystem& filesystem, std::string working_path, 267 const FeatureFlags& feature_flags); 268 269 static libtextclassifier3::StatusOr< 270 std::unique_ptr<QualifiedIdJoinIndexImplV3>> 271 InitializeExistingFiles(const Filesystem& filesystem, 272 std::string working_path, 273 const FeatureFlags& feature_flags); 274 275 // Appends a list of new child DocumentJoinIdPair to the parent's 276 // DocumentJoinIdPair (extensible) array. If the array is invalid or doesn't 277 // have enough space for new elements, then extend it and set the new array 278 // info. 279 // 280 // Returns: 281 // - OK on success 282 // - RESOURCE_EXHAUSTED_ERROR if the new # of elements exceed 283 // kMaxNumChildrenPerParent 284 // - Any FileBackedVector errors 285 libtextclassifier3::Status AppendChildDocumentJoinIdPairsForParent( 286 DocumentId parent_document_id, 287 std::vector<DocumentJoinIdPair>&& child_document_join_id_pairs); 288 289 // Extends the parent document id to child array info if necessary, according 290 // to the new parent document id. 291 // 292 // Returns: 293 // - OK on success 294 // - Any FileBackedVector errors 295 libtextclassifier3::Status ExtendParentDocumentIdToChildArrayInfoIfNecessary( 296 DocumentId parent_document_id); 297 298 // Gets the DocumentJoinIdPair mutable array and extends it if necessary to 299 // fit num_to_add new elements in the future for the caller. If extended: 300 // - Allocate a new array with size rounded up to the next 2's power to fit 301 // all existing elements and num_to_add new elements. 302 // - Old elements will be migrated to the new array. 303 // - The old array will be invalidated. 304 // 305 // Note: this function only prepares the array to fit "future" num_to_add 306 // elements for the caller. It potentially does the extension but without 307 // adding any new elements into the array, so the returned ArrayInfo's 308 // used_length will remain unchanged. 309 // 310 // Returns: 311 // - GetOrExtendResult on success 312 // - RESOURCE_EXHAUSTED_ERROR if the new # of elements exceed 313 // kMaxNumChildrenPerParent 314 // - Any FileBackedVector errors 315 struct GetMutableAndExtendResult { 316 // ArrayInfo of the DocumentJoinIdPair array. 317 // - If extended (allocated a new array), then the new array's ArrayInfo 318 // will be returned. 319 // - Otherwise, the input (original) ArrayInfo will be returned. 320 ArrayInfo array_info; 321 322 // MutableArrayView object of the DocumentJoinIdPair array corresponding to 323 // array_info. It will be either the new array after extension or the 324 // original array, depending on whether it is extended or not. 325 FileBackedVector<DocumentJoinIdPair>::MutableArrayView mutable_arr; 326 }; 327 libtextclassifier3::StatusOr<GetMutableAndExtendResult> 328 GetMutableAndExtendChildDocumentJoinIdPairArrayIfNecessary( 329 const ArrayInfo& array_info, int32_t num_to_add); 330 331 // Transfers qualified id join index data from the current to new_index and 332 // convert to new document id according to document_id_old_to_new. It is a 333 // helper function for Optimize. 334 // 335 // Returns: 336 // - OK on success 337 // - INTERNAL_ERROR on I/O error 338 libtextclassifier3::Status TransferIndex( 339 const std::vector<DocumentId>& document_id_old_to_new, 340 QualifiedIdJoinIndexImplV3* new_index) const; 341 342 libtextclassifier3::Status PersistMetadataToDisk() override; 343 344 libtextclassifier3::Status PersistStoragesToDisk() override; 345 WriteMetadata()346 libtextclassifier3::Status WriteMetadata() override { 347 // QualifiedIdJoinIndexImplV3::Header is mmapped. Therefore, writes occur 348 // when the metadata is modified. So just return OK. 349 return libtextclassifier3::Status::OK; 350 } 351 352 libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override; 353 354 libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override; 355 356 libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override; 357 crcs()358 Crcs& crcs() override { 359 return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 360 kCrcsMetadataFileOffset); 361 } 362 crcs()363 const Crcs& crcs() const override { 364 return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 365 kCrcsMetadataFileOffset); 366 } 367 info()368 Info& info() { 369 return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 370 kInfoMetadataFileOffset); 371 } 372 info()373 const Info& info() const { 374 return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 375 kInfoMetadataFileOffset); 376 } 377 SetInfoDirty()378 void SetInfoDirty() { is_info_dirty_ = true; } 379 // When storage is dirty, we have to set info dirty as well. So just expose 380 // SetDirty to set both. SetDirty()381 void SetDirty() { 382 is_info_dirty_ = true; 383 is_storage_dirty_ = true; 384 } 385 is_info_dirty()386 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()387 bool is_storage_dirty() const { return is_storage_dirty_; } 388 389 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 390 391 // Storage for mapping parent document id to the ArrayInfo, which points to an 392 // extensible array stored in child_document_join_id_pair_array_, and the 393 // extensible array contains the parent's joinable children information. 394 std::unique_ptr<FileBackedVector<ArrayInfo>> 395 parent_document_id_to_child_array_info_; 396 397 // Storage for DocumentJoinIdPair. 398 // - It is a collection of multiple extensible arrays for parents. 399 // - Each extensible array contains a list of child DocumentJoinIdPair. 400 // - The extensible array information (starting index of the FileBackedVector, 401 // length) is stored in parent_document_id_to_child_array_info_. 402 std::unique_ptr<FileBackedVector<DocumentJoinIdPair>> 403 child_document_join_id_pair_array_; 404 405 const FeatureFlags& feature_flags_; // Does not own. 406 407 bool is_info_dirty_; 408 bool is_storage_dirty_; 409 }; 410 411 } // namespace lib 412 } // namespace icing 413 414 #endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V3_H_ 415