1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 16 #define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <utility> 23 #include <vector> 24 25 #include "icing/text_classifier/lib3/utils/base/status.h" 26 #include "icing/text_classifier/lib3/utils/base/statusor.h" 27 #include "icing/absl_ports/canonical_errors.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/file/persistent-storage.h" 30 #include "icing/file/posting_list/flash-index-storage.h" 31 #include "icing/file/posting_list/posting-list-identifier.h" 32 #include "icing/join/document-id-to-join-info.h" 33 #include "icing/join/document-join-id-pair.h" 34 #include "icing/join/posting-list-join-data-accessor.h" 35 #include "icing/join/posting-list-join-data-serializer.h" 36 #include "icing/join/qualified-id-join-index.h" 37 #include "icing/schema/joinable-property.h" 38 #include "icing/store/document-filter-data.h" 39 #include "icing/store/document-id.h" 40 #include "icing/store/key-mapper.h" 41 #include "icing/store/namespace-id-fingerprint.h" 42 #include "icing/store/namespace-id.h" 43 #include "icing/util/crc32.h" 44 45 namespace icing { 46 namespace lib { 47 48 // QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to 49 // referenced NamespaceIdFingerprint). It stores join data in posting lists 50 // and bucketizes them by (schema_type_id, joinable_property_id). 51 class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex { 52 public: 53 using JoinDataType = DocumentIdToJoinInfo<NamespaceIdFingerprint>; 54 55 class JoinDataIterator : public JoinDataIteratorBase { 56 public: JoinDataIterator(std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)57 explicit JoinDataIterator( 58 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor) 59 : pl_accessor_(std::move(pl_accessor)), 60 should_retrieve_next_batch_(true) {} 61 62 ~JoinDataIterator() override = default; 63 64 // Advances to the next data. 65 // 66 // Returns: 67 // - OK on success 68 // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant 69 // data) 70 // - Any other PostingListJoinDataAccessor errors 71 libtextclassifier3::Status Advance() override; 72 GetCurrent()73 const JoinDataType& GetCurrent() const override { return *curr_; } 74 75 private: 76 // Gets next batch of data from the posting list chain, caches in 77 // cached_batch_integer_index_data_, and sets curr_ to the begin of the 78 // cache. 79 libtextclassifier3::Status GetNextDataBatch(); 80 81 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_; 82 std::vector<JoinDataType> cached_batch_join_data_; 83 std::vector<JoinDataType>::const_iterator curr_; 84 bool should_retrieve_next_batch_; 85 }; 86 87 struct Info { 88 static constexpr int32_t kMagic = 0x32e374a7; 89 90 int32_t magic; 91 int32_t num_data; 92 DocumentId last_added_document_id; 93 GetChecksumInfo94 Crc32 GetChecksum() const { 95 return Crc32( 96 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 97 } 98 } __attribute__((packed)); 99 static_assert(sizeof(Info) == 12, ""); 100 101 // Metadata file layout: <Crcs><Info> 102 static constexpr int32_t kCrcsMetadataBufferOffset = 0; 103 static constexpr int32_t kInfoMetadataBufferOffset = 104 static_cast<int32_t>(sizeof(Crcs)); 105 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 106 static_assert(kMetadataFileSize == 24, ""); 107 108 static constexpr WorkingPathType kWorkingPathType = 109 WorkingPathType::kDirectory; 110 111 // Creates a QualifiedIdJoinIndexImplV2 instance to store join data 112 // (DocumentId to referenced NamespaceIdFingerprint) for future joining 113 // search. If any of the underlying file is missing, then delete the whole 114 // working_path and (re)initialize with new ones. Otherwise initialize and 115 // create the instance by existing files. 116 // 117 // filesystem: Object to make system level calls 118 // working_path: Specifies the working path for PersistentStorage. 119 // QualifiedIdJoinIndexImplV2 uses working path as working 120 // directory and all related files will be stored under this 121 // directory. It takes full ownership and of working_path_, 122 // including creation/deletion. It is the caller's 123 // responsibility to specify correct working path and avoid 124 // mixing different persistent storages together under the same 125 // path. Also the caller has the ownership for the parent 126 // directory of working_path_, and it is responsible for parent 127 // directory creation/deletion. See PersistentStorage for more 128 // details about the concept of working_path. 129 // pre_mapping_fbv: flag indicating whether memory map max possible file size 130 // for underlying FileBackedVector before growing the actual 131 // file size. 132 // 133 // Returns: 134 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 135 // checksum 136 // - INTERNAL_ERROR on I/O errors 137 // - Any KeyMapper errors 138 static libtextclassifier3::StatusOr< 139 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 140 Create(const Filesystem& filesystem, std::string working_path, 141 bool pre_mapping_fbv); 142 143 // Delete copy and move constructor/assignment operator. 144 QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete; 145 QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) = 146 delete; 147 148 QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete; 149 QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete; 150 151 ~QualifiedIdJoinIndexImplV2() override; 152 153 // v1 only API. Returns UNIMPLEMENTED_ERROR. Put(const DocumentJoinIdPair & document_join_id_pair,std::string_view ref_qualified_id_str)154 libtextclassifier3::Status Put( 155 const DocumentJoinIdPair& document_join_id_pair, 156 std::string_view ref_qualified_id_str) override { 157 return absl_ports::UnimplementedError("This API is not supported in V2"); 158 } 159 160 // v1 only API. Returns UNIMPLEMENTED_ERROR. Get(const DocumentJoinIdPair & document_join_id_pair)161 libtextclassifier3::StatusOr<std::string_view> Get( 162 const DocumentJoinIdPair& document_join_id_pair) const override { 163 return absl_ports::UnimplementedError("This API is not supported in V2"); 164 } 165 166 // v3 only API. Returns UNIMPLEMENTED_ERROR. Put(const DocumentJoinIdPair & child_document_join_id_pair,std::vector<DocumentId> && parent_document_ids)167 libtextclassifier3::Status Put( 168 const DocumentJoinIdPair& child_document_join_id_pair, 169 std::vector<DocumentId>&& parent_document_ids) override { 170 return absl_ports::UnimplementedError("This API is not supported in V2"); 171 } 172 173 // v3 only API. Returns UNIMPLEMENTED_ERROR. Get(DocumentId parent_document_id)174 libtextclassifier3::StatusOr<std::vector<DocumentJoinIdPair>> Get( 175 DocumentId parent_document_id) const override { 176 return absl_ports::UnimplementedError("This API is not supported in V2"); 177 } 178 179 libtextclassifier3::Status Put( 180 SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id, 181 DocumentId document_id, 182 std::vector<NamespaceIdFingerprint>&& ref_namespace_id_uri_fingerprints) 183 override; 184 185 libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>> 186 GetIterator(SchemaTypeId schema_type_id, 187 JoinablePropertyId joinable_property_id) const override; 188 189 // No-op since v2 stores parent information in (namespace_id, 190 // fingerprint(uri)) format and does not require parent migration. MigrateParent(DocumentId old_document_id,DocumentId new_document_id)191 libtextclassifier3::Status MigrateParent( 192 DocumentId old_document_id, DocumentId new_document_id) override { 193 return libtextclassifier3::Status::OK; 194 } 195 196 libtextclassifier3::Status Optimize( 197 const std::vector<DocumentId>& document_id_old_to_new, 198 const std::vector<NamespaceId>& namespace_id_old_to_new, 199 DocumentId new_last_added_document_id) override; 200 201 libtextclassifier3::Status Clear() override; 202 version()203 QualifiedIdJoinIndex::Version version() const override { 204 return QualifiedIdJoinIndex::Version::kV2; 205 } 206 size()207 int32_t size() const override { return info().num_data; } 208 empty()209 bool empty() const override { return size() == 0; } 210 last_added_document_id()211 DocumentId last_added_document_id() const override { 212 return info().last_added_document_id; 213 } 214 set_last_added_document_id(DocumentId document_id)215 void set_last_added_document_id(DocumentId document_id) override { 216 SetInfoDirty(); 217 218 Info& info_ref = info(); 219 if (info_ref.last_added_document_id == kInvalidDocumentId || 220 document_id > info_ref.last_added_document_id) { 221 info_ref.last_added_document_id = document_id; 222 } 223 } 224 225 private: QualifiedIdJoinIndexImplV2(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<uint8_t[]> metadata_buffer,std::unique_ptr<KeyMapper<PostingListIdentifier>> schema_joinable_id_to_posting_list_mapper,std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> posting_list_serializer,std::unique_ptr<FlashIndexStorage> flash_index_storage,bool pre_mapping_fbv)226 explicit QualifiedIdJoinIndexImplV2( 227 const Filesystem& filesystem, std::string&& working_path, 228 std::unique_ptr<uint8_t[]> metadata_buffer, 229 std::unique_ptr<KeyMapper<PostingListIdentifier>> 230 schema_joinable_id_to_posting_list_mapper, 231 std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> 232 posting_list_serializer, 233 std::unique_ptr<FlashIndexStorage> flash_index_storage, 234 bool pre_mapping_fbv) 235 : QualifiedIdJoinIndex(filesystem, std::move(working_path)), 236 metadata_buffer_(std::move(metadata_buffer)), 237 schema_joinable_id_to_posting_list_mapper_( 238 std::move(schema_joinable_id_to_posting_list_mapper)), 239 posting_list_serializer_(std::move(posting_list_serializer)), 240 flash_index_storage_(std::move(flash_index_storage)), 241 pre_mapping_fbv_(pre_mapping_fbv), 242 is_info_dirty_(false), 243 is_storage_dirty_(false) {} 244 245 static libtextclassifier3::StatusOr< 246 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 247 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 248 bool pre_mapping_fbv); 249 250 static libtextclassifier3::StatusOr< 251 std::unique_ptr<QualifiedIdJoinIndexImplV2>> 252 InitializeExistingFiles(const Filesystem& filesystem, 253 std::string&& working_path, bool pre_mapping_fbv); 254 255 // Transfers qualified id join index data from the current to new_index and 256 // convert to new document id according to document_id_old_to_new and 257 // namespace_id_old_to_new. It is a helper function for Optimize. 258 // 259 // Returns: 260 // - OK on success 261 // - INTERNAL_ERROR on I/O error 262 libtextclassifier3::Status TransferIndex( 263 const std::vector<DocumentId>& document_id_old_to_new, 264 const std::vector<NamespaceId>& namespace_id_old_to_new, 265 QualifiedIdJoinIndexImplV2* new_index) const; 266 267 libtextclassifier3::Status PersistMetadataToDisk() override; 268 269 libtextclassifier3::Status PersistStoragesToDisk() override; 270 271 libtextclassifier3::Status WriteMetadata() override; 272 273 libtextclassifier3::Status InternalWriteMetadata(const ScopedFd& sfd); 274 275 libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override; 276 277 libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override; 278 279 libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override; 280 crcs()281 Crcs& crcs() override { 282 return *reinterpret_cast<Crcs*>(metadata_buffer_.get() + 283 kCrcsMetadataBufferOffset); 284 } 285 crcs()286 const Crcs& crcs() const override { 287 return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() + 288 kCrcsMetadataBufferOffset); 289 } 290 info()291 Info& info() { 292 return *reinterpret_cast<Info*>(metadata_buffer_.get() + 293 kInfoMetadataBufferOffset); 294 } 295 info()296 const Info& info() const { 297 return *reinterpret_cast<const Info*>(metadata_buffer_.get() + 298 kInfoMetadataBufferOffset); 299 } 300 SetInfoDirty()301 void SetInfoDirty() { is_info_dirty_ = true; } 302 // When storage is dirty, we have to set info dirty as well. So just expose 303 // SetDirty to set both. SetDirty()304 void SetDirty() { 305 is_info_dirty_ = true; 306 is_storage_dirty_ = true; 307 } 308 is_info_dirty()309 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()310 bool is_storage_dirty() const { return is_storage_dirty_; } 311 312 // Metadata buffer 313 std::unique_ptr<uint8_t[]> metadata_buffer_; 314 315 // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to 316 // PostingListIdentifier. 317 std::unique_ptr<KeyMapper<PostingListIdentifier>> 318 schema_joinable_id_to_posting_list_mapper_; 319 320 // Posting list related members. Use posting list to store join data 321 // (document id to referenced NamespaceIdFingerprint). 322 std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> 323 posting_list_serializer_; 324 std::unique_ptr<FlashIndexStorage> flash_index_storage_; 325 326 // TODO(b/268521214): add delete propagation storage 327 328 // Flag indicating whether memory map max possible file size for underlying 329 // FileBackedVector before growing the actual file size. 330 bool pre_mapping_fbv_; 331 332 bool is_info_dirty_; 333 bool is_storage_dirty_; 334 }; 335 336 } // namespace lib 337 } // namespace icing 338 339 #endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_ 340