1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <unordered_map> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/file/file-backed-proto.h" 27 #include "icing/file/filesystem.h" 28 #include "icing/file/memory-mapped-file.h" 29 #include "icing/index/numeric/integer-index-storage.h" 30 #include "icing/index/numeric/numeric-index.h" 31 #include "icing/index/numeric/posting-list-integer-index-serializer.h" 32 #include "icing/index/numeric/wildcard-property-storage.pb.h" 33 #include "icing/schema/schema-store.h" 34 #include "icing/store/document-id.h" 35 #include "icing/store/document-store.h" 36 #include "icing/util/crc32.h" 37 38 namespace icing { 39 namespace lib { 40 41 // IntegerIndex: a wrapper class for managing IntegerIndexStorage (a lower level 42 // persistent storage class for indexing and searching contents of integer type 43 // sections in documents) instances for different property paths. 44 // We separate indexable integer data from different properties into different 45 // storages, and IntegerIndex manages and handles indexable integer data 46 // appropriately to their corresponding IntegerIndexStorage instance according 47 // to the given property path. 48 class IntegerIndex : public NumericIndex<int64_t> { 49 public: 50 using PropertyToStorageMapType = 51 std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>; 52 53 // Maximum number of individual property storages that this index will allow 54 // before falling back to placing hits for any new properties into the 55 // 'wildcard' storage. 56 static constexpr int kMaxPropertyStorages = 32; 57 58 static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 59 IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit; 60 61 struct Info { 62 static constexpr int32_t kMagic = 0x5d8a1e8a; 63 64 int32_t magic; 65 DocumentId last_added_document_id; 66 int32_t num_data_threshold_for_bucket_split; 67 GetChecksumInfo68 Crc32 GetChecksum() const { 69 return Crc32( 70 std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 71 } 72 } __attribute__((packed)); 73 static_assert(sizeof(Info) == 12, ""); 74 75 // Metadata file layout: <Crcs><Info> 76 static constexpr int32_t kCrcsMetadataFileOffset = 0; 77 static constexpr int32_t kInfoMetadataFileOffset = 78 static_cast<int32_t>(sizeof(Crcs)); 79 static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 80 static_assert(kMetadataFileSize == 24, ""); 81 82 static constexpr WorkingPathType kWorkingPathType = 83 WorkingPathType::kDirectory; 84 static constexpr std::string_view kFilePrefix = "integer_index"; 85 86 // Creates a new IntegerIndex instance to index integers. If any of the 87 // underlying file is missing, then delete the whole working_path and 88 // (re)initialize with new ones. Otherwise initialize and create the instance 89 // by existing files. 90 // 91 // filesystem: Object to make system level calls 92 // working_path: Specifies the working path for PersistentStorage. 93 // IntegerIndex uses working path as working directory and all 94 // related files will be stored under this directory. See 95 // PersistentStorage for more details about the concept of 96 // working_path. 97 // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for 98 // more details. 99 // pre_mapping_fbv: flag indicating whether memory map max possible file size 100 // for underlying FileBackedVector before growing the actual 101 // file size. 102 // 103 // Returns: 104 // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 105 // checksum. 106 // - INTERNAL_ERROR on I/O errors. 107 // - Any FileBackedVector/MemoryMappedFile errors. 108 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create( 109 const Filesystem& filesystem, std::string working_path, 110 int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv); 111 112 // Deletes IntegerIndex under working_path. 113 // 114 // Returns: 115 // - OK on success 116 // - INTERNAL_ERROR on I/O error Discard(const Filesystem & filesystem,const std::string & working_path)117 static libtextclassifier3::Status Discard(const Filesystem& filesystem, 118 const std::string& working_path) { 119 return PersistentStorage::Discard(filesystem, working_path, 120 kWorkingPathType); 121 } 122 123 ~IntegerIndex() override; 124 Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)125 std::unique_ptr<typename NumericIndex<int64_t>::Editor> Edit( 126 std::string_view property_path, DocumentId document_id, 127 SectionId section_id) override { 128 return std::make_unique<Editor>(property_path, document_id, section_id, 129 *this, num_data_threshold_for_bucket_split_, 130 pre_mapping_fbv_); 131 } 132 133 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( 134 std::string_view property_path, int64_t key_lower, int64_t key_upper, 135 const DocumentStore& document_store, const SchemaStore& schema_store, 136 int64_t current_time_ms) const override; 137 138 libtextclassifier3::Status Optimize( 139 const std::vector<DocumentId>& document_id_old_to_new, 140 DocumentId new_last_added_document_id) override; 141 142 libtextclassifier3::Status Clear() override; 143 last_added_document_id()144 DocumentId last_added_document_id() const override { 145 return info().last_added_document_id; 146 } 147 set_last_added_document_id(DocumentId document_id)148 void set_last_added_document_id(DocumentId document_id) override { 149 SetInfoDirty(); 150 151 Info& info_ref = info(); 152 if (info_ref.last_added_document_id == kInvalidDocumentId || 153 document_id > info_ref.last_added_document_id) { 154 info_ref.last_added_document_id = document_id; 155 } 156 } 157 num_property_indices()158 int num_property_indices() const override { 159 return property_to_storage_map_.size() + 160 ((wildcard_index_storage_ == nullptr) ? 0 : 1); 161 } 162 163 private: 164 class Editor : public NumericIndex<int64_t>::Editor { 165 public: Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,IntegerIndex & integer_index,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)166 explicit Editor(std::string_view property_path, DocumentId document_id, 167 SectionId section_id, IntegerIndex& integer_index, 168 int32_t num_data_threshold_for_bucket_split, 169 bool pre_mapping_fbv) 170 : NumericIndex<int64_t>::Editor(property_path, document_id, section_id), 171 integer_index_(integer_index), 172 num_data_threshold_for_bucket_split_( 173 num_data_threshold_for_bucket_split), 174 pre_mapping_fbv_(pre_mapping_fbv) {} 175 176 ~Editor() override = default; 177 BufferKey(int64_t key)178 libtextclassifier3::Status BufferKey(int64_t key) override { 179 seen_keys_.push_back(key); 180 return libtextclassifier3::Status::OK; 181 } 182 183 libtextclassifier3::Status IndexAllBufferedKeys() && override; 184 185 private: 186 // Vector for caching all seen keys. Since IntegerIndexStorage::AddKeys 187 // sorts and dedupes keys, we can just simply use vector here and move it to 188 // AddKeys(). 189 std::vector<int64_t> seen_keys_; 190 191 IntegerIndex& integer_index_; // Does not own. 192 193 int32_t num_data_threshold_for_bucket_split_; 194 195 // Flag indicating whether memory map max possible file size for underlying 196 // FileBackedVector before growing the actual file size. 197 bool pre_mapping_fbv_; 198 }; 199 IntegerIndex(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,PropertyToStorageMapType && property_to_storage_map,std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> wildcard_property_storage,std::unordered_set<std::string> wildcard_properties_set,std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)200 explicit IntegerIndex( 201 const Filesystem& filesystem, std::string&& working_path, 202 std::unique_ptr<PostingListIntegerIndexSerializer> 203 posting_list_serializer, 204 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, 205 PropertyToStorageMapType&& property_to_storage_map, 206 std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> 207 wildcard_property_storage, 208 std::unordered_set<std::string> wildcard_properties_set, 209 std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage, 210 int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) 211 : NumericIndex<int64_t>(filesystem, std::move(working_path), 212 kWorkingPathType), 213 posting_list_serializer_(std::move(posting_list_serializer)), 214 metadata_mmapped_file_(std::move(metadata_mmapped_file)), 215 property_to_storage_map_(std::move(property_to_storage_map)), 216 wildcard_property_storage_(std::move(wildcard_property_storage)), 217 wildcard_properties_set_(std::move(wildcard_properties_set)), 218 wildcard_index_storage_(std::move(wildcard_index_storage)), 219 num_data_threshold_for_bucket_split_( 220 num_data_threshold_for_bucket_split), 221 pre_mapping_fbv_(pre_mapping_fbv), 222 is_info_dirty_(false), 223 is_storage_dirty_(false) {} 224 225 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> 226 InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path, 227 int32_t num_data_threshold_for_bucket_split, 228 bool pre_mapping_fbv); 229 230 static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> 231 InitializeExistingFiles(const Filesystem& filesystem, 232 std::string&& working_path, 233 int32_t num_data_threshold_for_bucket_split, 234 bool pre_mapping_fbv); 235 236 // Adds the property path to the list of properties using wildcard storage. 237 // This will both update the in-memory list (wildcard_properties_set_) and 238 // the persistent list (wilcard_property_storage_). 239 // 240 // RETURNS: 241 // - OK on success 242 // - INTERNAL_ERROR if unable to successfully persist updated properties 243 // list in wildcard_property_storage_. 244 libtextclassifier3::Status AddPropertyToWildcardStorage( 245 const std::string& property_path); 246 247 // Transfers integer index data from the current integer index to 248 // new_integer_index. 249 // 250 // Returns: 251 // - OK on success 252 // - INTERNAL_ERROR on I/O error. This could potentially leave the storages 253 // in an invalid state and the caller should handle it properly (e.g. 254 // discard and rebuild) 255 libtextclassifier3::Status TransferIndex( 256 const std::vector<DocumentId>& document_id_old_to_new, 257 IntegerIndex* new_integer_index) const; 258 259 // Transfers integer index data from old_storage to new_integer_index. 260 // 261 // Returns: 262 // - OK on success 263 // - INTERNAL_ERROR on I/O error. This could potentially leave the storages 264 // in an invalid state and the caller should handle it properly (e.g. 265 // discard and rebuild) 266 libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> 267 TransferIntegerIndexStorage( 268 const std::vector<DocumentId>& document_id_old_to_new, 269 const IntegerIndexStorage* old_storage, const std::string& property_path, 270 IntegerIndex* new_integer_index) const; 271 272 // Transfers the persistent and in-memory list of properties using the 273 // wildcard storage from old_storage to new_integer_index. 274 // 275 // RETURNS: 276 // - OK on success 277 // - INTERNAL_ERROR if unable to successfully persist updated properties 278 // list in new_integer_index. 279 libtextclassifier3::Status TransferWildcardStorage( 280 IntegerIndex* new_integer_index) const; 281 282 libtextclassifier3::Status PersistStoragesToDisk() override; 283 284 libtextclassifier3::Status PersistMetadataToDisk() override; 285 WriteMetadata()286 libtextclassifier3::Status WriteMetadata() override { 287 // IntegerIndex::Header is mmapped. Therefore, writes occur when the 288 // metadata is modified. So just return OK. 289 return libtextclassifier3::Status::OK; 290 } 291 292 libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override; 293 294 libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override; 295 296 libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override; 297 crcs()298 Crcs& crcs() override { 299 return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 300 kCrcsMetadataFileOffset); 301 } 302 crcs()303 const Crcs& crcs() const override { 304 return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 305 kCrcsMetadataFileOffset); 306 } 307 info()308 Info& info() { 309 return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 310 kInfoMetadataFileOffset); 311 } 312 info()313 const Info& info() const { 314 return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 315 kInfoMetadataFileOffset); 316 } 317 SetInfoDirty()318 void SetInfoDirty() { is_info_dirty_ = true; } 319 320 // When the storage is dirty, then the checksum in the info is invalid and 321 // must be recalculated. Therefore, also mark the info as dirty. SetDirty()322 void SetDirty() { 323 SetInfoDirty(); 324 is_storage_dirty_ = true; 325 } 326 is_info_dirty()327 bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()328 bool is_storage_dirty() const { return is_storage_dirty_; } 329 330 std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_; 331 332 std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 333 334 // Property path to integer index storage map. 335 PropertyToStorageMapType property_to_storage_map_; 336 337 // Persistent list of properties that have added content to 338 // wildcard_index_storage_. 339 std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> 340 wildcard_property_storage_; 341 342 // In-memory list of properties that have added content to 343 // wildcard_index_storage_. 344 std::unordered_set<std::string> wildcard_properties_set_; 345 346 // The index storage that is used once we have already created 347 // kMaxPropertyStorages in property_to_storage_map. 348 std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; 349 350 int32_t num_data_threshold_for_bucket_split_; 351 352 // Flag indicating whether memory map max possible file size for underlying 353 // FileBackedVector before growing the actual file size. 354 bool pre_mapping_fbv_; 355 356 bool is_info_dirty_; 357 bool is_storage_dirty_; 358 }; 359 360 } // namespace lib 361 } // namespace icing 362 363 #endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_H_ 364