xref: /aosp_15_r20/external/icing/icing/index/iterator/doc-hit-info-iterator.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
17 
18 #include <array>
19 #include <cstdint>
20 #include <functional>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/absl_ports/canonical_errors.h"
30 #include "icing/index/hit/doc-hit-info.h"
31 #include "icing/index/hit/hit.h"
32 #include "icing/schema/section.h"
33 #include "icing/store/document-id.h"
34 
35 namespace icing {
36 namespace lib {
37 
38 class SectionRestrictData;
39 
40 // Data structure that maps a single matched query term to its section mask
41 // and the list of term frequencies.
42 // TODO(b/158603837): add stat on whether the matched terms are prefix matched
43 // or not. This information will be used to boost exact match.
44 struct TermMatchInfo {
45   std::string_view term;
46   // SectionIdMask associated to the term.
47   SectionIdMask section_ids_mask;
48   // Array with fixed size kMaxSectionId. For every section id, i.e.
49   // vector index, it stores the term frequency of the term.
50   std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies;
51 
TermMatchInfoTermMatchInfo52   explicit TermMatchInfo(
53       std::string_view term, SectionIdMask section_ids_mask,
54       std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies)
55       : term(term),
56         section_ids_mask(section_ids_mask),
57         term_frequencies(std::move(term_frequencies)) {}
58 };
59 
60 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
61 //
62 // NOTE: You must call Advance() before calling hit_info().
63 //
64 // Example:
65 // DocHitInfoIterator itr = GetIterator(...);
66 // while (itr.Advance()) {
67 //   HandleDocHitInfo(itr.hit_info());
68 // }
69 class DocHitInfoIterator {
70  public:
71   using ChildrenMapper = std::function<std::unique_ptr<DocHitInfoIterator>(
72       std::unique_ptr<DocHitInfoIterator>)>;
73 
74   // CallStats is a wrapper class of all stats to collect among all levels of
75   // the DocHitInfoIterator tree. Mostly the internal nodes will aggregate the
76   // number of all leaf nodes, while the leaf nodes will return the actual
77   // numbers.
78   struct CallStats {
79     // The number of times Advance() was called on the leaf node for term lite
80     // index.
81     // - Leaf nodes:
82     //   - DocHitInfoIteratorTermLite should maintain and set it correctly.
83     //   - Others should set it 0.
84     // - Internal nodes: should aggregate values from all children.
85     int32_t num_leaf_advance_calls_lite_index;
86 
87     // The number of times Advance() was called on the leaf node for term main
88     // index.
89     // - Leaf nodes:
90     //   - DocHitInfoIteratorTermMain should maintain and set it correctly.
91     //   - Others should set it 0.
92     // - Internal nodes: should aggregate values from all children.
93     int32_t num_leaf_advance_calls_main_index;
94 
95     // The number of times Advance() was called on the leaf node for integer
96     // index.
97     // - Leaf nodes:
98     //   - DocHitInfoIteratorNumeric should maintain and set it correctly.
99     //   - Others should set it 0.
100     // - Internal nodes: should aggregate values from all children.
101     int32_t num_leaf_advance_calls_integer_index;
102 
103     // The number of times Advance() was called on the leaf node without reading
104     // any hits from index. Usually it is a special field for
105     // DocHitInfoIteratorAllDocumentId.
106     // - Leaf nodes:
107     //   - DocHitInfoIteratorAllDocumentId should maintain and set it correctly.
108     //   - Others should set it 0.
109     // - Internal nodes: should aggregate values from all children.
110     int32_t num_leaf_advance_calls_no_index;
111 
112     // The number of flash index blocks that have been read as a result of
113     // operations on this object.
114     // - Leaf nodes: should maintain and set it correctly for all child classes
115     //   involving flash index block access.
116     // - Internal nodes: should aggregate values from all children.
117     int32_t num_blocks_inspected;
118 
CallStatsCallStats119     explicit CallStats()
120         : CallStats(/*num_leaf_advance_calls_lite_index_in=*/0,
121                     /*num_leaf_advance_calls_main_index_in=*/0,
122                     /*num_leaf_advance_calls_integer_index_in=*/0,
123                     /*num_leaf_advance_calls_no_index_in=*/0,
124                     /*num_blocks_inspected_in=*/0) {}
125 
CallStatsCallStats126     explicit CallStats(int32_t num_leaf_advance_calls_lite_index_in,
127                        int32_t num_leaf_advance_calls_main_index_in,
128                        int32_t num_leaf_advance_calls_integer_index_in,
129                        int32_t num_leaf_advance_calls_no_index_in,
130                        int32_t num_blocks_inspected_in)
131         : num_leaf_advance_calls_lite_index(
132               num_leaf_advance_calls_lite_index_in),
133           num_leaf_advance_calls_main_index(
134               num_leaf_advance_calls_main_index_in),
135           num_leaf_advance_calls_integer_index(
136               num_leaf_advance_calls_integer_index_in),
137           num_leaf_advance_calls_no_index(num_leaf_advance_calls_no_index_in),
138           num_blocks_inspected(num_blocks_inspected_in) {}
139 
num_leaf_advance_callsCallStats140     int32_t num_leaf_advance_calls() const {
141       return num_leaf_advance_calls_lite_index +
142              num_leaf_advance_calls_main_index +
143              num_leaf_advance_calls_integer_index +
144              num_leaf_advance_calls_no_index;
145     }
146 
147     bool operator==(const CallStats& other) const {
148       return num_leaf_advance_calls_lite_index ==
149                  other.num_leaf_advance_calls_lite_index &&
150              num_leaf_advance_calls_main_index ==
151                  other.num_leaf_advance_calls_main_index &&
152              num_leaf_advance_calls_integer_index ==
153                  other.num_leaf_advance_calls_integer_index &&
154              num_leaf_advance_calls_no_index ==
155                  other.num_leaf_advance_calls_no_index &&
156              num_blocks_inspected == other.num_blocks_inspected;
157     }
158 
159     CallStats operator+(const CallStats& other) const {
160       return CallStats(num_leaf_advance_calls_lite_index +
161                            other.num_leaf_advance_calls_lite_index,
162                        num_leaf_advance_calls_main_index +
163                            other.num_leaf_advance_calls_main_index,
164                        num_leaf_advance_calls_integer_index +
165                            other.num_leaf_advance_calls_integer_index,
166                        num_leaf_advance_calls_no_index +
167                            other.num_leaf_advance_calls_no_index,
168                        num_blocks_inspected + other.num_blocks_inspected);
169     }
170 
171     CallStats& operator+=(const CallStats& other) {
172       *this = *this + other;
173       return *this;
174     }
175   };
176 
177   struct TrimmedNode {
178     // the query results which we should only search for suggestion in these
179     // documents.
180     std::unique_ptr<DocHitInfoIterator> iterator_;
181     // term of the trimmed node which we need to generate suggested strings.
182     std::string term_;
183     // the string in the query which indicates the target section we should
184     // search for suggestions.
185     std::string target_section_;
186     // the start index of the current term in the given search query.
187     int term_start_index_;
188     // The length of the given unnormalized term in the search query
189     int unnormalized_term_length_;
190 
TrimmedNodeTrimmedNode191     TrimmedNode(std::unique_ptr<DocHitInfoIterator> iterator, std::string term,
192                 int term_start_index, int unnormalized_term_length)
193         : iterator_(std::move(iterator)),
194           term_(term),
195           target_section_(""),
196           term_start_index_(term_start_index),
197           unnormalized_term_length_(unnormalized_term_length) {}
198   };
199 
200   // Trim the rightmost iterator of the iterator tree.
201   // This is to support search suggestions for the last term which is the
202   // right-most node of the root iterator tree. Only support trim the right-most
203   // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and the
204   // property-in-schema-check iterator.
205   //
206   // After calling this method, this iterator is no longer usable. Please use
207   // the returned iterator.
208   // Returns:
209   //   the new iterator without the right-most child, if was able to trim the
210   //   right-most node.
211   //   nullptr if the current iterator should be trimmed.
212   //   INVALID_ARGUMENT if the right-most node is not suppose to be trimmed.
213   virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0;
214 
215   // Map all direct children of this iterator according to the passed mapper.
216   virtual void MapChildren(const ChildrenMapper& mapper) = 0;
217 
is_leaf()218   virtual bool is_leaf() { return false; }
219 
220   // Try to internally handle the provided section restriction in the iterator.
221   //
222   // Returns:
223   //   - false if the iterator does not support handling section restriction.
224   //   - true if the iterator supports handling section restriction, and the
225   //     section restriction has been applied.
HandleSectionRestriction(SectionRestrictData * other_data)226   virtual bool HandleSectionRestriction(SectionRestrictData* other_data) {
227     return false;
228   }
229 
230   virtual ~DocHitInfoIterator() = default;
231 
232   // Returns:
233   //   OK if was able to advance to a new document_id.
234   //   INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR
235   //       iterator
236   //   RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over
237   virtual libtextclassifier3::Status Advance() = 0;
238 
239   // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo
240   // will have a kInvalidDocumentId if Advance() was not called after
241   // construction or if Advance returned an error.
doc_hit_info()242   const DocHitInfo& doc_hit_info() const { return doc_hit_info_; }
243 
244   // Returns CallStats of the DocHitInfoIterator tree.
245   virtual CallStats GetCallStats() const = 0;
246 
247   // A string representing the iterator.
248   virtual std::string ToString() const = 0;
249 
250   // For the last hit docid, retrieves all the matched query terms and other
251   // stats, see TermMatchInfo.
252   // filtering_section_mask filters the matching sections and should be set only
253   // by DocHitInfoIteratorSectionRestrict.
254   // If Advance() wasn't called after construction, Advance() returned false or
255   // the concrete HitIterator didn't override this method, the vectors aren't
256   // populated.
257   virtual void PopulateMatchedTermsStats(
258       std::vector<TermMatchInfo>* matched_terms_stats,
259       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {}
260 
261  protected:
262   DocHitInfo doc_hit_info_;
263 
264   // Helper function to advance the given iterator to at most the given
265   // document_id.
AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)266   libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it,
267                                                      DocumentId document_id) {
268     while (it->Advance().ok()) {
269       if (it->doc_hit_info().document_id() <= document_id) {
270         return it->doc_hit_info().document_id();
271       }
272     }
273 
274     // Didn't find anything for the other iterator, reset to invalid values and
275     // return.
276     doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
277     return absl_ports::ResourceExhaustedError(
278         "No more DocHitInfos in iterator");
279   }
280 };
281 
282 // A leaf node is a term node or a chain of section restriction node applied on
283 // a term node.
284 class DocHitInfoLeafIterator : public DocHitInfoIterator {
285  public:
is_leaf()286   bool is_leaf() override { return true; }
287 
288   // Calling MapChildren on leaf node does not make sense, and will do nothing.
MapChildren(const ChildrenMapper & mapper)289   void MapChildren(const ChildrenMapper& mapper) override {}
290 };
291 
292 }  // namespace lib
293 }  // namespace icing
294 
295 #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
296