xref: /aosp_15_r20/external/icing/icing/index/numeric/dummy-numeric-index.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
16 #define ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
17 
18 #include <cstdint>
19 #include <functional>
20 #include <map>
21 #include <memory>
22 #include <queue>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <vector>
28 
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/persistent-storage.h"
35 #include "icing/index/hit/doc-hit-info.h"
36 #include "icing/index/hit/hit.h"
37 #include "icing/index/iterator/doc-hit-info-iterator.h"
38 #include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
39 #include "icing/index/numeric/numeric-index.h"
40 #include "icing/schema/section.h"
41 #include "icing/store/document-id.h"
42 #include "icing/util/crc32.h"
43 #include "icing/util/status-macros.h"
44 
45 namespace icing {
46 namespace lib {
47 
48 // DummyNumericIndex: dummy class to help with testing and unblock e2e
49 // integration for numeric search. It stores all numeric index data (keys and
50 // hits) in memory without actual persistent storages. All PersistentStorage
51 // features do not work as expected, i.e. they don't persist any data into disk
52 // and therefore data are volatile.
53 template <typename T>
54 class DummyNumericIndex : public NumericIndex<T> {
55  public:
56   static libtextclassifier3::StatusOr<std::unique_ptr<DummyNumericIndex<T>>>
Create(const Filesystem & filesystem,std::string working_path)57   Create(const Filesystem& filesystem, std::string working_path) {
58     auto dummy_numeric_index = std::unique_ptr<DummyNumericIndex<T>>(
59         new DummyNumericIndex<T>(filesystem, std::move(working_path)));
60     ICING_RETURN_IF_ERROR(dummy_numeric_index->InitializeNewStorage());
61     return dummy_numeric_index;
62   }
63 
64   ~DummyNumericIndex() override = default;
65 
Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)66   std::unique_ptr<typename NumericIndex<T>::Editor> Edit(
67       std::string_view property_path, DocumentId document_id,
68       SectionId section_id) override {
69     return std::make_unique<Editor>(property_path, document_id, section_id,
70                                     storage_);
71   }
72 
73   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
74       std::string_view property_path, T key_lower, T key_upper,
75       const DocumentStore&, const SchemaStore&, int64_t) const override;
76 
77   libtextclassifier3::Status Optimize(
78       const std::vector<DocumentId>& document_id_old_to_new,
79       DocumentId new_last_added_document_id) override;
80 
Clear()81   libtextclassifier3::Status Clear() override {
82     storage_.clear();
83     last_added_document_id_ = kInvalidDocumentId;
84     return libtextclassifier3::Status::OK;
85   }
86 
last_added_document_id()87   DocumentId last_added_document_id() const override {
88     return last_added_document_id_;
89   }
90 
set_last_added_document_id(DocumentId document_id)91   void set_last_added_document_id(DocumentId document_id) override {
92     if (last_added_document_id_ == kInvalidDocumentId ||
93         document_id > last_added_document_id_) {
94       last_added_document_id_ = document_id;
95     }
96   }
97 
num_property_indices()98   int num_property_indices() const override { return storage_.size(); }
99 
100  private:
101   class Editor : public NumericIndex<T>::Editor {
102    public:
Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,std::unordered_map<std::string,std::map<T,std::vector<BasicHit>>> & storage)103     explicit Editor(
104         std::string_view property_path, DocumentId document_id,
105         SectionId section_id,
106         std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
107             storage)
108         : NumericIndex<T>::Editor(property_path, document_id, section_id),
109           storage_(storage) {}
110 
111     ~Editor() override = default;
112 
BufferKey(T key)113     libtextclassifier3::Status BufferKey(T key) override {
114       seen_keys_.insert(key);
115       return libtextclassifier3::Status::OK;
116     }
117 
118     libtextclassifier3::Status IndexAllBufferedKeys() && override;
119 
120    private:
121     std::unordered_set<T> seen_keys_;
122     std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
123         storage_;  // Does not own.
124   };
125 
126   class Iterator : public NumericIndex<T>::Iterator {
127    public:
128     // We group BasicHits (sorted by document_id) of a key into a Bucket (stored
129     // as std::vector) and store key -> vector in an std::map. When doing range
130     // query, we may access vectors from multiple keys and want to return
131     // BasicHits to callers sorted by document_id. Therefore, this problem is
132     // actually "merge K sorted vectors".
133     // To implement this algorithm via priority_queue, we create this wrapper
134     // class to store iterators of map and vector.
135     class BucketInfo {
136      public:
BucketInfo(typename std::map<T,std::vector<BasicHit>>::const_iterator bucket_iter)137       explicit BucketInfo(
138           typename std::map<T, std::vector<BasicHit>>::const_iterator
139               bucket_iter)
140           : bucket_iter_(bucket_iter),
141             vec_iter_(bucket_iter_->second.rbegin()) {}
142 
Advance()143       bool Advance() { return ++vec_iter_ != bucket_iter_->second.rend(); }
144 
GetCurrentBasicHit()145       const BasicHit& GetCurrentBasicHit() const { return *vec_iter_; }
146 
147       bool operator<(const BucketInfo& other) const {
148         // std::priority_queue is a max heap and we should return BasicHits in
149         // DocumentId descending order.
150         // - BucketInfo::operator< should have the same order as DocumentId.
151         // - BasicHit encodes inverted document id and its operator< compares
152         //   the encoded raw value directly.
153         // - Therefore, BucketInfo::operator< should compare BasicHit reversely.
154         // - This will make priority_queue return buckets in DocumentId
155         //   descending and SectionId ascending order.
156         // - Whatever direction we sort SectionId by (or pop by priority_queue)
157         //   doesn't matter because all hits for the same DocumentId will be
158         //   merged into a single DocHitInfo.
159         return other.GetCurrentBasicHit() < GetCurrentBasicHit();
160       }
161 
162      private:
163       typename std::map<T, std::vector<BasicHit>>::const_iterator bucket_iter_;
164       std::vector<BasicHit>::const_reverse_iterator vec_iter_;
165     };
166 
Iterator(T key_lower,T key_upper,std::vector<BucketInfo> && bucket_info_vec)167     explicit Iterator(T key_lower, T key_upper,
168                       std::vector<BucketInfo>&& bucket_info_vec)
169         : NumericIndex<T>::Iterator(key_lower, key_upper),
170           pq_(std::less<BucketInfo>(), std::move(bucket_info_vec)),
171           num_advance_calls_(0) {}
172 
173     ~Iterator() override = default;
174 
175     libtextclassifier3::Status Advance() override;
176 
GetDocHitInfo()177     DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; }
178 
GetNumAdvanceCalls()179     int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; }
180 
GetNumBlocksInspected()181     int32_t GetNumBlocksInspected() const override { return 0; }
182 
183    private:
184     std::priority_queue<BucketInfo> pq_;
185     DocHitInfo doc_hit_info_;
186 
187     int32_t num_advance_calls_;
188   };
189 
DummyNumericIndex(const Filesystem & filesystem,std::string && working_path)190   explicit DummyNumericIndex(const Filesystem& filesystem,
191                              std::string&& working_path)
192       : NumericIndex<T>(filesystem, std::move(working_path),
193                         PersistentStorage::WorkingPathType::kDummy),
194         dummy_crcs_buffer_(
195             std::make_unique<uint8_t[]>(sizeof(PersistentStorage::Crcs))),
196         last_added_document_id_(kInvalidDocumentId) {
197     memset(dummy_crcs_buffer_.get(), 0, sizeof(PersistentStorage::Crcs));
198   }
199 
PersistStoragesToDisk()200   libtextclassifier3::Status PersistStoragesToDisk() override {
201     return libtextclassifier3::Status::OK;
202   }
203 
PersistMetadataToDisk()204   libtextclassifier3::Status PersistMetadataToDisk() override {
205     return libtextclassifier3::Status::OK;
206   }
207 
WriteMetadata()208   libtextclassifier3::Status WriteMetadata() override {
209     return libtextclassifier3::Status::OK;
210   }
211 
UpdateStoragesChecksum()212   libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override {
213     return Crc32();
214   }
215 
GetInfoChecksum()216   libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override {
217     return Crc32();
218   }
219 
GetStoragesChecksum()220   libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override {
221     return Crc32();
222   }
223 
crcs()224   PersistentStorage::Crcs& crcs() override {
225     return *reinterpret_cast<PersistentStorage::Crcs*>(
226         dummy_crcs_buffer_.get());
227   }
crcs()228   const PersistentStorage::Crcs& crcs() const override {
229     return *reinterpret_cast<const PersistentStorage::Crcs*>(
230         dummy_crcs_buffer_.get());
231   }
232 
233   std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>> storage_;
234   std::unique_ptr<uint8_t[]> dummy_crcs_buffer_;
235   DocumentId last_added_document_id_;
236 };
237 
238 template <typename T>
239 libtextclassifier3::Status
IndexAllBufferedKeys()240 DummyNumericIndex<T>::Editor::IndexAllBufferedKeys() && {
241   auto property_map_iter = storage_.find(this->property_path_);
242   if (property_map_iter == storage_.end()) {
243     const auto& [inserted_iter, insert_result] =
244         storage_.insert({this->property_path_, {}});
245     if (!insert_result) {
246       return absl_ports::InternalError(
247           absl_ports::StrCat("Failed to create a new map for property \"",
248                              this->property_path_, "\""));
249     }
250     property_map_iter = inserted_iter;
251   }
252 
253   for (const T& key : seen_keys_) {
254     auto key_map_iter = property_map_iter->second.find(key);
255     if (key_map_iter == property_map_iter->second.end()) {
256       const auto& [inserted_iter, insert_result] =
257           property_map_iter->second.insert({key, {}});
258       if (!insert_result) {
259         return absl_ports::InternalError("Failed to create a new map for key");
260       }
261       key_map_iter = inserted_iter;
262     }
263     key_map_iter->second.push_back(
264         BasicHit(this->section_id_, this->document_id_));
265   }
266   return libtextclassifier3::Status::OK;
267 }
268 
269 template <typename T>
Advance()270 libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() {
271   if (pq_.empty()) {
272     return absl_ports::ResourceExhaustedError("End of iterator");
273   }
274 
275   DocumentId document_id = pq_.top().GetCurrentBasicHit().document_id();
276   doc_hit_info_ = DocHitInfo(document_id);
277   // Merge sections with same document_id into a single DocHitInfo
278   while (!pq_.empty() &&
279          pq_.top().GetCurrentBasicHit().document_id() == document_id) {
280     ++num_advance_calls_;
281     doc_hit_info_.UpdateSection(pq_.top().GetCurrentBasicHit().section_id());
282 
283     BucketInfo info = pq_.top();
284     pq_.pop();
285 
286     if (info.Advance()) {
287       pq_.push(std::move(info));
288     }
289   }
290 
291   return libtextclassifier3::Status::OK;
292 }
293 
294 template <typename T>
295 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
GetIterator(std::string_view property_path,T key_lower,T key_upper,const DocumentStore &,const SchemaStore &,int64_t)296 DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower,
297                                   T key_upper, const DocumentStore&,
298                                   const SchemaStore&, int64_t) const {
299   if (key_lower > key_upper) {
300     return absl_ports::InvalidArgumentError(
301         "key_lower should not be greater than key_upper");
302   }
303 
304   auto property_map_iter = storage_.find(std::string(property_path));
305   if (property_map_iter == storage_.end()) {
306     // Return an empty iterator.
307     return std::make_unique<DocHitInfoIteratorNumeric<T>>(nullptr);
308   }
309 
310   std::vector<typename Iterator::BucketInfo> bucket_info_vec;
311   for (auto key_map_iter = property_map_iter->second.lower_bound(key_lower);
312        key_map_iter != property_map_iter->second.cend() &&
313        key_map_iter->first <= key_upper;
314        ++key_map_iter) {
315     bucket_info_vec.push_back(typename Iterator::BucketInfo(key_map_iter));
316   }
317 
318   return std::make_unique<DocHitInfoIteratorNumeric<T>>(
319       std::make_unique<Iterator>(key_lower, key_upper,
320                                  std::move(bucket_info_vec)));
321 }
322 
323 template <typename T>
Optimize(const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)324 libtextclassifier3::Status DummyNumericIndex<T>::Optimize(
325     const std::vector<DocumentId>& document_id_old_to_new,
326     DocumentId new_last_added_document_id) {
327   std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>
328       new_storage;
329 
330   for (const auto& [property_path, old_property_map] : storage_) {
331     std::map<T, std::vector<BasicHit>> new_property_map;
332     for (const auto& [key, hits] : old_property_map) {
333       for (const BasicHit& hit : hits) {
334         DocumentId old_doc_id = hit.document_id();
335         if (old_doc_id >= document_id_old_to_new.size() ||
336             document_id_old_to_new[old_doc_id] == kInvalidDocumentId) {
337           continue;
338         }
339 
340         new_property_map[key].push_back(
341             BasicHit(hit.section_id(), document_id_old_to_new[old_doc_id]));
342       }
343     }
344 
345     if (!new_property_map.empty()) {
346       new_storage[property_path] = std::move(new_property_map);
347     }
348   }
349 
350   storage_ = std::move(new_storage);
351   last_added_document_id_ = new_last_added_document_id;
352   return libtextclassifier3::Status::OK;
353 }
354 
355 }  // namespace lib
356 }  // namespace icing
357 
358 #endif  // ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
359