xref: /aosp_15_r20/external/icing/icing/index/numeric/integer-index.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/file/file-backed-proto.h"
27 #include "icing/file/filesystem.h"
28 #include "icing/file/memory-mapped-file.h"
29 #include "icing/index/numeric/integer-index-storage.h"
30 #include "icing/index/numeric/numeric-index.h"
31 #include "icing/index/numeric/posting-list-integer-index-serializer.h"
32 #include "icing/index/numeric/wildcard-property-storage.pb.h"
33 #include "icing/schema/schema-store.h"
34 #include "icing/store/document-id.h"
35 #include "icing/store/document-store.h"
36 #include "icing/util/crc32.h"
37 
38 namespace icing {
39 namespace lib {
40 
41 // IntegerIndex: a wrapper class for managing IntegerIndexStorage (a lower level
42 // persistent storage class for indexing and searching contents of integer type
43 // sections in documents) instances for different property paths.
44 // We separate indexable integer data from different properties into different
45 // storages, and IntegerIndex manages and handles indexable integer data
46 // appropriately to their corresponding IntegerIndexStorage instance according
47 // to the given property path.
48 class IntegerIndex : public NumericIndex<int64_t> {
49  public:
50   using PropertyToStorageMapType =
51       std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>;
52 
53   // Maximum number of individual property storages that this index will allow
54   // before falling back to placing hits for any new properties into the
55   // 'wildcard' storage.
56   static constexpr int kMaxPropertyStorages = 32;
57 
58   static constexpr int32_t kDefaultNumDataThresholdForBucketSplit =
59       IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit;
60 
61   struct Info {
62     static constexpr int32_t kMagic = 0x5d8a1e8a;
63 
64     int32_t magic;
65     DocumentId last_added_document_id;
66     int32_t num_data_threshold_for_bucket_split;
67 
GetChecksumInfo68     Crc32 GetChecksum() const {
69       return Crc32(
70           std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
71     }
72   } __attribute__((packed));
73   static_assert(sizeof(Info) == 12, "");
74 
75   // Metadata file layout: <Crcs><Info>
76   static constexpr int32_t kCrcsMetadataFileOffset = 0;
77   static constexpr int32_t kInfoMetadataFileOffset =
78       static_cast<int32_t>(sizeof(Crcs));
79   static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
80   static_assert(kMetadataFileSize == 24, "");
81 
82   static constexpr WorkingPathType kWorkingPathType =
83       WorkingPathType::kDirectory;
84   static constexpr std::string_view kFilePrefix = "integer_index";
85 
86   // Creates a new IntegerIndex instance to index integers. If any of the
87   // underlying file is missing, then delete the whole working_path and
88   // (re)initialize with new ones. Otherwise initialize and create the instance
89   // by existing files.
90   //
91   // filesystem: Object to make system level calls
92   // working_path: Specifies the working path for PersistentStorage.
93   //               IntegerIndex uses working path as working directory and all
94   //               related files will be stored under this directory. See
95   //               PersistentStorage for more details about the concept of
96   //               working_path.
97   // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for
98   //                                      more details.
99   // pre_mapping_fbv: flag indicating whether memory map max possible file size
100   //                  for underlying FileBackedVector before growing the actual
101   //                  file size.
102   //
103   // Returns:
104   //   - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
105   //                               checksum.
106   //   - INTERNAL_ERROR on I/O errors.
107   //   - Any FileBackedVector/MemoryMappedFile errors.
108   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create(
109       const Filesystem& filesystem, std::string working_path,
110       int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv);
111 
112   // Deletes IntegerIndex under working_path.
113   //
114   // Returns:
115   //   - OK on success
116   //   - INTERNAL_ERROR on I/O error
Discard(const Filesystem & filesystem,const std::string & working_path)117   static libtextclassifier3::Status Discard(const Filesystem& filesystem,
118                                             const std::string& working_path) {
119     return PersistentStorage::Discard(filesystem, working_path,
120                                       kWorkingPathType);
121   }
122 
123   ~IntegerIndex() override;
124 
Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)125   std::unique_ptr<typename NumericIndex<int64_t>::Editor> Edit(
126       std::string_view property_path, DocumentId document_id,
127       SectionId section_id) override {
128     return std::make_unique<Editor>(property_path, document_id, section_id,
129                                     *this, num_data_threshold_for_bucket_split_,
130                                     pre_mapping_fbv_);
131   }
132 
133   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
134       std::string_view property_path, int64_t key_lower, int64_t key_upper,
135       const DocumentStore& document_store, const SchemaStore& schema_store,
136       int64_t current_time_ms) const override;
137 
138   libtextclassifier3::Status Optimize(
139       const std::vector<DocumentId>& document_id_old_to_new,
140       DocumentId new_last_added_document_id) override;
141 
142   libtextclassifier3::Status Clear() override;
143 
last_added_document_id()144   DocumentId last_added_document_id() const override {
145     return info().last_added_document_id;
146   }
147 
set_last_added_document_id(DocumentId document_id)148   void set_last_added_document_id(DocumentId document_id) override {
149     SetInfoDirty();
150 
151     Info& info_ref = info();
152     if (info_ref.last_added_document_id == kInvalidDocumentId ||
153         document_id > info_ref.last_added_document_id) {
154       info_ref.last_added_document_id = document_id;
155     }
156   }
157 
num_property_indices()158   int num_property_indices() const override {
159     return property_to_storage_map_.size() +
160            ((wildcard_index_storage_ == nullptr) ? 0 : 1);
161   }
162 
163  private:
164   class Editor : public NumericIndex<int64_t>::Editor {
165    public:
Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,IntegerIndex & integer_index,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)166     explicit Editor(std::string_view property_path, DocumentId document_id,
167                     SectionId section_id, IntegerIndex& integer_index,
168                     int32_t num_data_threshold_for_bucket_split,
169                     bool pre_mapping_fbv)
170         : NumericIndex<int64_t>::Editor(property_path, document_id, section_id),
171           integer_index_(integer_index),
172           num_data_threshold_for_bucket_split_(
173               num_data_threshold_for_bucket_split),
174           pre_mapping_fbv_(pre_mapping_fbv) {}
175 
176     ~Editor() override = default;
177 
BufferKey(int64_t key)178     libtextclassifier3::Status BufferKey(int64_t key) override {
179       seen_keys_.push_back(key);
180       return libtextclassifier3::Status::OK;
181     }
182 
183     libtextclassifier3::Status IndexAllBufferedKeys() && override;
184 
185    private:
186     // Vector for caching all seen keys. Since IntegerIndexStorage::AddKeys
187     // sorts and dedupes keys, we can just simply use vector here and move it to
188     // AddKeys().
189     std::vector<int64_t> seen_keys_;
190 
191     IntegerIndex& integer_index_;  // Does not own.
192 
193     int32_t num_data_threshold_for_bucket_split_;
194 
195     // Flag indicating whether memory map max possible file size for underlying
196     // FileBackedVector before growing the actual file size.
197     bool pre_mapping_fbv_;
198   };
199 
IntegerIndex(const Filesystem & filesystem,std::string && working_path,std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,PropertyToStorageMapType && property_to_storage_map,std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> wildcard_property_storage,std::unordered_set<std::string> wildcard_properties_set,std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,int32_t num_data_threshold_for_bucket_split,bool pre_mapping_fbv)200   explicit IntegerIndex(
201       const Filesystem& filesystem, std::string&& working_path,
202       std::unique_ptr<PostingListIntegerIndexSerializer>
203           posting_list_serializer,
204       std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
205       PropertyToStorageMapType&& property_to_storage_map,
206       std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
207           wildcard_property_storage,
208       std::unordered_set<std::string> wildcard_properties_set,
209       std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,
210       int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv)
211       : NumericIndex<int64_t>(filesystem, std::move(working_path),
212                               kWorkingPathType),
213         posting_list_serializer_(std::move(posting_list_serializer)),
214         metadata_mmapped_file_(std::move(metadata_mmapped_file)),
215         property_to_storage_map_(std::move(property_to_storage_map)),
216         wildcard_property_storage_(std::move(wildcard_property_storage)),
217         wildcard_properties_set_(std::move(wildcard_properties_set)),
218         wildcard_index_storage_(std::move(wildcard_index_storage)),
219         num_data_threshold_for_bucket_split_(
220             num_data_threshold_for_bucket_split),
221         pre_mapping_fbv_(pre_mapping_fbv),
222         is_info_dirty_(false),
223         is_storage_dirty_(false) {}
224 
225   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
226   InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
227                      int32_t num_data_threshold_for_bucket_split,
228                      bool pre_mapping_fbv);
229 
230   static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
231   InitializeExistingFiles(const Filesystem& filesystem,
232                           std::string&& working_path,
233                           int32_t num_data_threshold_for_bucket_split,
234                           bool pre_mapping_fbv);
235 
236   // Adds the property path to the list of properties using wildcard storage.
237   // This will both update the in-memory list (wildcard_properties_set_) and
238   // the persistent list (wilcard_property_storage_).
239   //
240   // RETURNS:
241   //   - OK on success
242   //   - INTERNAL_ERROR if unable to successfully persist updated properties
243   //     list in wildcard_property_storage_.
244   libtextclassifier3::Status AddPropertyToWildcardStorage(
245       const std::string& property_path);
246 
247   // Transfers integer index data from the current integer index to
248   // new_integer_index.
249   //
250   // Returns:
251   //   - OK on success
252   //   - INTERNAL_ERROR on I/O error. This could potentially leave the storages
253   //     in an invalid state and the caller should handle it properly (e.g.
254   //     discard and rebuild)
255   libtextclassifier3::Status TransferIndex(
256       const std::vector<DocumentId>& document_id_old_to_new,
257       IntegerIndex* new_integer_index) const;
258 
259   // Transfers integer index data from old_storage to new_integer_index.
260   //
261   // Returns:
262   //   - OK on success
263   //   - INTERNAL_ERROR on I/O error. This could potentially leave the storages
264   //     in an invalid state and the caller should handle it properly (e.g.
265   //     discard and rebuild)
266   libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
267   TransferIntegerIndexStorage(
268       const std::vector<DocumentId>& document_id_old_to_new,
269       const IntegerIndexStorage* old_storage, const std::string& property_path,
270       IntegerIndex* new_integer_index) const;
271 
272   // Transfers the persistent and in-memory list of properties using the
273   // wildcard storage from old_storage to new_integer_index.
274   //
275   // RETURNS:
276   //   - OK on success
277   //   - INTERNAL_ERROR if unable to successfully persist updated properties
278   //     list in new_integer_index.
279   libtextclassifier3::Status TransferWildcardStorage(
280       IntegerIndex* new_integer_index) const;
281 
282   libtextclassifier3::Status PersistStoragesToDisk() override;
283 
284   libtextclassifier3::Status PersistMetadataToDisk() override;
285 
WriteMetadata()286   libtextclassifier3::Status WriteMetadata() override {
287     // IntegerIndex::Header is mmapped. Therefore, writes occur when the
288     // metadata is modified. So just return OK.
289     return libtextclassifier3::Status::OK;
290   }
291 
292   libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override;
293 
294   libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override;
295 
296   libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override;
297 
crcs()298   Crcs& crcs() override {
299     return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
300                                     kCrcsMetadataFileOffset);
301   }
302 
crcs()303   const Crcs& crcs() const override {
304     return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
305                                           kCrcsMetadataFileOffset);
306   }
307 
info()308   Info& info() {
309     return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
310                                     kInfoMetadataFileOffset);
311   }
312 
info()313   const Info& info() const {
314     return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
315                                           kInfoMetadataFileOffset);
316   }
317 
SetInfoDirty()318   void SetInfoDirty() { is_info_dirty_ = true; }
319 
320   // When the storage is dirty, then the checksum in the info is invalid and
321   // must be recalculated. Therefore, also mark the info as dirty.
SetDirty()322   void SetDirty() {
323     SetInfoDirty();
324     is_storage_dirty_ = true;
325   }
326 
is_info_dirty()327   bool is_info_dirty() const { return is_info_dirty_; }
is_storage_dirty()328   bool is_storage_dirty() const { return is_storage_dirty_; }
329 
330   std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_;
331 
332   std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
333 
334   // Property path to integer index storage map.
335   PropertyToStorageMapType property_to_storage_map_;
336 
337   // Persistent list of properties that have added content to
338   // wildcard_index_storage_.
339   std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
340       wildcard_property_storage_;
341 
342   // In-memory list of properties that have added content to
343   // wildcard_index_storage_.
344   std::unordered_set<std::string> wildcard_properties_set_;
345 
346   // The index storage that is used once we have already created
347   // kMaxPropertyStorages in property_to_storage_map.
348   std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_;
349 
350   int32_t num_data_threshold_for_bucket_split_;
351 
352   // Flag indicating whether memory map max possible file size for underlying
353   // FileBackedVector before growing the actual file size.
354   bool pre_mapping_fbv_;
355 
356   bool is_info_dirty_;
357   bool is_storage_dirty_;
358 };
359 
360 }  // namespace lib
361 }  // namespace icing
362 
363 #endif  // ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
364