1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
16 #define ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
17
18 #include <cstdint>
19 #include <functional>
20 #include <map>
21 #include <memory>
22 #include <queue>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/canonical_errors.h"
32 #include "icing/absl_ports/str_cat.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/persistent-storage.h"
35 #include "icing/index/hit/doc-hit-info.h"
36 #include "icing/index/hit/hit.h"
37 #include "icing/index/iterator/doc-hit-info-iterator.h"
38 #include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
39 #include "icing/index/numeric/numeric-index.h"
40 #include "icing/schema/section.h"
41 #include "icing/store/document-id.h"
42 #include "icing/util/crc32.h"
43 #include "icing/util/status-macros.h"
44
45 namespace icing {
46 namespace lib {
47
48 // DummyNumericIndex: dummy class to help with testing and unblock e2e
49 // integration for numeric search. It stores all numeric index data (keys and
50 // hits) in memory without actual persistent storages. All PersistentStorage
51 // features do not work as expected, i.e. they don't persist any data into disk
52 // and therefore data are volatile.
53 template <typename T>
54 class DummyNumericIndex : public NumericIndex<T> {
55 public:
56 static libtextclassifier3::StatusOr<std::unique_ptr<DummyNumericIndex<T>>>
Create(const Filesystem & filesystem,std::string working_path)57 Create(const Filesystem& filesystem, std::string working_path) {
58 auto dummy_numeric_index = std::unique_ptr<DummyNumericIndex<T>>(
59 new DummyNumericIndex<T>(filesystem, std::move(working_path)));
60 ICING_RETURN_IF_ERROR(dummy_numeric_index->InitializeNewStorage());
61 return dummy_numeric_index;
62 }
63
64 ~DummyNumericIndex() override = default;
65
Edit(std::string_view property_path,DocumentId document_id,SectionId section_id)66 std::unique_ptr<typename NumericIndex<T>::Editor> Edit(
67 std::string_view property_path, DocumentId document_id,
68 SectionId section_id) override {
69 return std::make_unique<Editor>(property_path, document_id, section_id,
70 storage_);
71 }
72
73 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
74 std::string_view property_path, T key_lower, T key_upper,
75 const DocumentStore&, const SchemaStore&, int64_t) const override;
76
77 libtextclassifier3::Status Optimize(
78 const std::vector<DocumentId>& document_id_old_to_new,
79 DocumentId new_last_added_document_id) override;
80
Clear()81 libtextclassifier3::Status Clear() override {
82 storage_.clear();
83 last_added_document_id_ = kInvalidDocumentId;
84 return libtextclassifier3::Status::OK;
85 }
86
last_added_document_id()87 DocumentId last_added_document_id() const override {
88 return last_added_document_id_;
89 }
90
set_last_added_document_id(DocumentId document_id)91 void set_last_added_document_id(DocumentId document_id) override {
92 if (last_added_document_id_ == kInvalidDocumentId ||
93 document_id > last_added_document_id_) {
94 last_added_document_id_ = document_id;
95 }
96 }
97
num_property_indices()98 int num_property_indices() const override { return storage_.size(); }
99
100 private:
101 class Editor : public NumericIndex<T>::Editor {
102 public:
Editor(std::string_view property_path,DocumentId document_id,SectionId section_id,std::unordered_map<std::string,std::map<T,std::vector<BasicHit>>> & storage)103 explicit Editor(
104 std::string_view property_path, DocumentId document_id,
105 SectionId section_id,
106 std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
107 storage)
108 : NumericIndex<T>::Editor(property_path, document_id, section_id),
109 storage_(storage) {}
110
111 ~Editor() override = default;
112
BufferKey(T key)113 libtextclassifier3::Status BufferKey(T key) override {
114 seen_keys_.insert(key);
115 return libtextclassifier3::Status::OK;
116 }
117
118 libtextclassifier3::Status IndexAllBufferedKeys() && override;
119
120 private:
121 std::unordered_set<T> seen_keys_;
122 std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
123 storage_; // Does not own.
124 };
125
126 class Iterator : public NumericIndex<T>::Iterator {
127 public:
128 // We group BasicHits (sorted by document_id) of a key into a Bucket (stored
129 // as std::vector) and store key -> vector in an std::map. When doing range
130 // query, we may access vectors from multiple keys and want to return
131 // BasicHits to callers sorted by document_id. Therefore, this problem is
132 // actually "merge K sorted vectors".
133 // To implement this algorithm via priority_queue, we create this wrapper
134 // class to store iterators of map and vector.
135 class BucketInfo {
136 public:
BucketInfo(typename std::map<T,std::vector<BasicHit>>::const_iterator bucket_iter)137 explicit BucketInfo(
138 typename std::map<T, std::vector<BasicHit>>::const_iterator
139 bucket_iter)
140 : bucket_iter_(bucket_iter),
141 vec_iter_(bucket_iter_->second.rbegin()) {}
142
Advance()143 bool Advance() { return ++vec_iter_ != bucket_iter_->second.rend(); }
144
GetCurrentBasicHit()145 const BasicHit& GetCurrentBasicHit() const { return *vec_iter_; }
146
147 bool operator<(const BucketInfo& other) const {
148 // std::priority_queue is a max heap and we should return BasicHits in
149 // DocumentId descending order.
150 // - BucketInfo::operator< should have the same order as DocumentId.
151 // - BasicHit encodes inverted document id and its operator< compares
152 // the encoded raw value directly.
153 // - Therefore, BucketInfo::operator< should compare BasicHit reversely.
154 // - This will make priority_queue return buckets in DocumentId
155 // descending and SectionId ascending order.
156 // - Whatever direction we sort SectionId by (or pop by priority_queue)
157 // doesn't matter because all hits for the same DocumentId will be
158 // merged into a single DocHitInfo.
159 return other.GetCurrentBasicHit() < GetCurrentBasicHit();
160 }
161
162 private:
163 typename std::map<T, std::vector<BasicHit>>::const_iterator bucket_iter_;
164 std::vector<BasicHit>::const_reverse_iterator vec_iter_;
165 };
166
Iterator(T key_lower,T key_upper,std::vector<BucketInfo> && bucket_info_vec)167 explicit Iterator(T key_lower, T key_upper,
168 std::vector<BucketInfo>&& bucket_info_vec)
169 : NumericIndex<T>::Iterator(key_lower, key_upper),
170 pq_(std::less<BucketInfo>(), std::move(bucket_info_vec)),
171 num_advance_calls_(0) {}
172
173 ~Iterator() override = default;
174
175 libtextclassifier3::Status Advance() override;
176
GetDocHitInfo()177 DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; }
178
GetNumAdvanceCalls()179 int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; }
180
GetNumBlocksInspected()181 int32_t GetNumBlocksInspected() const override { return 0; }
182
183 private:
184 std::priority_queue<BucketInfo> pq_;
185 DocHitInfo doc_hit_info_;
186
187 int32_t num_advance_calls_;
188 };
189
DummyNumericIndex(const Filesystem & filesystem,std::string && working_path)190 explicit DummyNumericIndex(const Filesystem& filesystem,
191 std::string&& working_path)
192 : NumericIndex<T>(filesystem, std::move(working_path),
193 PersistentStorage::WorkingPathType::kDummy),
194 dummy_crcs_buffer_(
195 std::make_unique<uint8_t[]>(sizeof(PersistentStorage::Crcs))),
196 last_added_document_id_(kInvalidDocumentId) {
197 memset(dummy_crcs_buffer_.get(), 0, sizeof(PersistentStorage::Crcs));
198 }
199
PersistStoragesToDisk()200 libtextclassifier3::Status PersistStoragesToDisk() override {
201 return libtextclassifier3::Status::OK;
202 }
203
PersistMetadataToDisk()204 libtextclassifier3::Status PersistMetadataToDisk() override {
205 return libtextclassifier3::Status::OK;
206 }
207
WriteMetadata()208 libtextclassifier3::Status WriteMetadata() override {
209 return libtextclassifier3::Status::OK;
210 }
211
UpdateStoragesChecksum()212 libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override {
213 return Crc32();
214 }
215
GetInfoChecksum()216 libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override {
217 return Crc32();
218 }
219
GetStoragesChecksum()220 libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override {
221 return Crc32();
222 }
223
crcs()224 PersistentStorage::Crcs& crcs() override {
225 return *reinterpret_cast<PersistentStorage::Crcs*>(
226 dummy_crcs_buffer_.get());
227 }
crcs()228 const PersistentStorage::Crcs& crcs() const override {
229 return *reinterpret_cast<const PersistentStorage::Crcs*>(
230 dummy_crcs_buffer_.get());
231 }
232
233 std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>> storage_;
234 std::unique_ptr<uint8_t[]> dummy_crcs_buffer_;
235 DocumentId last_added_document_id_;
236 };
237
238 template <typename T>
239 libtextclassifier3::Status
IndexAllBufferedKeys()240 DummyNumericIndex<T>::Editor::IndexAllBufferedKeys() && {
241 auto property_map_iter = storage_.find(this->property_path_);
242 if (property_map_iter == storage_.end()) {
243 const auto& [inserted_iter, insert_result] =
244 storage_.insert({this->property_path_, {}});
245 if (!insert_result) {
246 return absl_ports::InternalError(
247 absl_ports::StrCat("Failed to create a new map for property \"",
248 this->property_path_, "\""));
249 }
250 property_map_iter = inserted_iter;
251 }
252
253 for (const T& key : seen_keys_) {
254 auto key_map_iter = property_map_iter->second.find(key);
255 if (key_map_iter == property_map_iter->second.end()) {
256 const auto& [inserted_iter, insert_result] =
257 property_map_iter->second.insert({key, {}});
258 if (!insert_result) {
259 return absl_ports::InternalError("Failed to create a new map for key");
260 }
261 key_map_iter = inserted_iter;
262 }
263 key_map_iter->second.push_back(
264 BasicHit(this->section_id_, this->document_id_));
265 }
266 return libtextclassifier3::Status::OK;
267 }
268
269 template <typename T>
Advance()270 libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() {
271 if (pq_.empty()) {
272 return absl_ports::ResourceExhaustedError("End of iterator");
273 }
274
275 DocumentId document_id = pq_.top().GetCurrentBasicHit().document_id();
276 doc_hit_info_ = DocHitInfo(document_id);
277 // Merge sections with same document_id into a single DocHitInfo
278 while (!pq_.empty() &&
279 pq_.top().GetCurrentBasicHit().document_id() == document_id) {
280 ++num_advance_calls_;
281 doc_hit_info_.UpdateSection(pq_.top().GetCurrentBasicHit().section_id());
282
283 BucketInfo info = pq_.top();
284 pq_.pop();
285
286 if (info.Advance()) {
287 pq_.push(std::move(info));
288 }
289 }
290
291 return libtextclassifier3::Status::OK;
292 }
293
294 template <typename T>
295 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
GetIterator(std::string_view property_path,T key_lower,T key_upper,const DocumentStore &,const SchemaStore &,int64_t)296 DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower,
297 T key_upper, const DocumentStore&,
298 const SchemaStore&, int64_t) const {
299 if (key_lower > key_upper) {
300 return absl_ports::InvalidArgumentError(
301 "key_lower should not be greater than key_upper");
302 }
303
304 auto property_map_iter = storage_.find(std::string(property_path));
305 if (property_map_iter == storage_.end()) {
306 // Return an empty iterator.
307 return std::make_unique<DocHitInfoIteratorNumeric<T>>(nullptr);
308 }
309
310 std::vector<typename Iterator::BucketInfo> bucket_info_vec;
311 for (auto key_map_iter = property_map_iter->second.lower_bound(key_lower);
312 key_map_iter != property_map_iter->second.cend() &&
313 key_map_iter->first <= key_upper;
314 ++key_map_iter) {
315 bucket_info_vec.push_back(typename Iterator::BucketInfo(key_map_iter));
316 }
317
318 return std::make_unique<DocHitInfoIteratorNumeric<T>>(
319 std::make_unique<Iterator>(key_lower, key_upper,
320 std::move(bucket_info_vec)));
321 }
322
323 template <typename T>
Optimize(const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)324 libtextclassifier3::Status DummyNumericIndex<T>::Optimize(
325 const std::vector<DocumentId>& document_id_old_to_new,
326 DocumentId new_last_added_document_id) {
327 std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>
328 new_storage;
329
330 for (const auto& [property_path, old_property_map] : storage_) {
331 std::map<T, std::vector<BasicHit>> new_property_map;
332 for (const auto& [key, hits] : old_property_map) {
333 for (const BasicHit& hit : hits) {
334 DocumentId old_doc_id = hit.document_id();
335 if (old_doc_id >= document_id_old_to_new.size() ||
336 document_id_old_to_new[old_doc_id] == kInvalidDocumentId) {
337 continue;
338 }
339
340 new_property_map[key].push_back(
341 BasicHit(hit.section_id(), document_id_old_to_new[old_doc_id]));
342 }
343 }
344
345 if (!new_property_map.empty()) {
346 new_storage[property_path] = std::move(new_property_map);
347 }
348 }
349
350 storage_ = std::move(new_storage);
351 last_added_document_id_ = new_last_added_document_id;
352 return libtextclassifier3::Status::OK;
353 }
354
355 } // namespace lib
356 } // namespace icing
357
358 #endif // ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
359