1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <functional> 21 #include <memory> 22 #include <string> 23 #include <string_view> 24 #include <utility> 25 #include <vector> 26 27 #include "icing/text_classifier/lib3/utils/base/status.h" 28 #include "icing/text_classifier/lib3/utils/base/statusor.h" 29 #include "icing/absl_ports/canonical_errors.h" 30 #include "icing/index/hit/doc-hit-info.h" 31 #include "icing/index/hit/hit.h" 32 #include "icing/schema/section.h" 33 #include "icing/store/document-id.h" 34 35 namespace icing { 36 namespace lib { 37 38 class SectionRestrictData; 39 40 // Data structure that maps a single matched query term to its section mask 41 // and the list of term frequencies. 42 // TODO(b/158603837): add stat on whether the matched terms are prefix matched 43 // or not. This information will be used to boost exact match. 44 struct TermMatchInfo { 45 std::string_view term; 46 // SectionIdMask associated to the term. 47 SectionIdMask section_ids_mask; 48 // Array with fixed size kMaxSectionId. For every section id, i.e. 49 // vector index, it stores the term frequency of the term. 50 std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies; 51 TermMatchInfoTermMatchInfo52 explicit TermMatchInfo( 53 std::string_view term, SectionIdMask section_ids_mask, 54 std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies) 55 : term(term), 56 section_ids_mask(section_ids_mask), 57 term_frequencies(std::move(term_frequencies)) {} 58 }; 59 60 // Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order. 61 // 62 // NOTE: You must call Advance() before calling hit_info(). 63 // 64 // Example: 65 // DocHitInfoIterator itr = GetIterator(...); 66 // while (itr.Advance()) { 67 // HandleDocHitInfo(itr.hit_info()); 68 // } 69 class DocHitInfoIterator { 70 public: 71 using ChildrenMapper = std::function<std::unique_ptr<DocHitInfoIterator>( 72 std::unique_ptr<DocHitInfoIterator>)>; 73 74 // CallStats is a wrapper class of all stats to collect among all levels of 75 // the DocHitInfoIterator tree. Mostly the internal nodes will aggregate the 76 // number of all leaf nodes, while the leaf nodes will return the actual 77 // numbers. 78 struct CallStats { 79 // The number of times Advance() was called on the leaf node for term lite 80 // index. 81 // - Leaf nodes: 82 // - DocHitInfoIteratorTermLite should maintain and set it correctly. 83 // - Others should set it 0. 84 // - Internal nodes: should aggregate values from all children. 85 int32_t num_leaf_advance_calls_lite_index; 86 87 // The number of times Advance() was called on the leaf node for term main 88 // index. 89 // - Leaf nodes: 90 // - DocHitInfoIteratorTermMain should maintain and set it correctly. 91 // - Others should set it 0. 92 // - Internal nodes: should aggregate values from all children. 93 int32_t num_leaf_advance_calls_main_index; 94 95 // The number of times Advance() was called on the leaf node for integer 96 // index. 97 // - Leaf nodes: 98 // - DocHitInfoIteratorNumeric should maintain and set it correctly. 99 // - Others should set it 0. 100 // - Internal nodes: should aggregate values from all children. 101 int32_t num_leaf_advance_calls_integer_index; 102 103 // The number of times Advance() was called on the leaf node without reading 104 // any hits from index. Usually it is a special field for 105 // DocHitInfoIteratorAllDocumentId. 106 // - Leaf nodes: 107 // - DocHitInfoIteratorAllDocumentId should maintain and set it correctly. 108 // - Others should set it 0. 109 // - Internal nodes: should aggregate values from all children. 110 int32_t num_leaf_advance_calls_no_index; 111 112 // The number of flash index blocks that have been read as a result of 113 // operations on this object. 114 // - Leaf nodes: should maintain and set it correctly for all child classes 115 // involving flash index block access. 116 // - Internal nodes: should aggregate values from all children. 117 int32_t num_blocks_inspected; 118 CallStatsCallStats119 explicit CallStats() 120 : CallStats(/*num_leaf_advance_calls_lite_index_in=*/0, 121 /*num_leaf_advance_calls_main_index_in=*/0, 122 /*num_leaf_advance_calls_integer_index_in=*/0, 123 /*num_leaf_advance_calls_no_index_in=*/0, 124 /*num_blocks_inspected_in=*/0) {} 125 CallStatsCallStats126 explicit CallStats(int32_t num_leaf_advance_calls_lite_index_in, 127 int32_t num_leaf_advance_calls_main_index_in, 128 int32_t num_leaf_advance_calls_integer_index_in, 129 int32_t num_leaf_advance_calls_no_index_in, 130 int32_t num_blocks_inspected_in) 131 : num_leaf_advance_calls_lite_index( 132 num_leaf_advance_calls_lite_index_in), 133 num_leaf_advance_calls_main_index( 134 num_leaf_advance_calls_main_index_in), 135 num_leaf_advance_calls_integer_index( 136 num_leaf_advance_calls_integer_index_in), 137 num_leaf_advance_calls_no_index(num_leaf_advance_calls_no_index_in), 138 num_blocks_inspected(num_blocks_inspected_in) {} 139 num_leaf_advance_callsCallStats140 int32_t num_leaf_advance_calls() const { 141 return num_leaf_advance_calls_lite_index + 142 num_leaf_advance_calls_main_index + 143 num_leaf_advance_calls_integer_index + 144 num_leaf_advance_calls_no_index; 145 } 146 147 bool operator==(const CallStats& other) const { 148 return num_leaf_advance_calls_lite_index == 149 other.num_leaf_advance_calls_lite_index && 150 num_leaf_advance_calls_main_index == 151 other.num_leaf_advance_calls_main_index && 152 num_leaf_advance_calls_integer_index == 153 other.num_leaf_advance_calls_integer_index && 154 num_leaf_advance_calls_no_index == 155 other.num_leaf_advance_calls_no_index && 156 num_blocks_inspected == other.num_blocks_inspected; 157 } 158 159 CallStats operator+(const CallStats& other) const { 160 return CallStats(num_leaf_advance_calls_lite_index + 161 other.num_leaf_advance_calls_lite_index, 162 num_leaf_advance_calls_main_index + 163 other.num_leaf_advance_calls_main_index, 164 num_leaf_advance_calls_integer_index + 165 other.num_leaf_advance_calls_integer_index, 166 num_leaf_advance_calls_no_index + 167 other.num_leaf_advance_calls_no_index, 168 num_blocks_inspected + other.num_blocks_inspected); 169 } 170 171 CallStats& operator+=(const CallStats& other) { 172 *this = *this + other; 173 return *this; 174 } 175 }; 176 177 struct TrimmedNode { 178 // the query results which we should only search for suggestion in these 179 // documents. 180 std::unique_ptr<DocHitInfoIterator> iterator_; 181 // term of the trimmed node which we need to generate suggested strings. 182 std::string term_; 183 // the string in the query which indicates the target section we should 184 // search for suggestions. 185 std::string target_section_; 186 // the start index of the current term in the given search query. 187 int term_start_index_; 188 // The length of the given unnormalized term in the search query 189 int unnormalized_term_length_; 190 TrimmedNodeTrimmedNode191 TrimmedNode(std::unique_ptr<DocHitInfoIterator> iterator, std::string term, 192 int term_start_index, int unnormalized_term_length) 193 : iterator_(std::move(iterator)), 194 term_(term), 195 target_section_(""), 196 term_start_index_(term_start_index), 197 unnormalized_term_length_(unnormalized_term_length) {} 198 }; 199 200 // Trim the rightmost iterator of the iterator tree. 201 // This is to support search suggestions for the last term which is the 202 // right-most node of the root iterator tree. Only support trim the right-most 203 // node on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and the 204 // property-in-schema-check iterator. 205 // 206 // After calling this method, this iterator is no longer usable. Please use 207 // the returned iterator. 208 // Returns: 209 // the new iterator without the right-most child, if was able to trim the 210 // right-most node. 211 // nullptr if the current iterator should be trimmed. 212 // INVALID_ARGUMENT if the right-most node is not suppose to be trimmed. 213 virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0; 214 215 // Map all direct children of this iterator according to the passed mapper. 216 virtual void MapChildren(const ChildrenMapper& mapper) = 0; 217 is_leaf()218 virtual bool is_leaf() { return false; } 219 220 // Try to internally handle the provided section restriction in the iterator. 221 // 222 // Returns: 223 // - false if the iterator does not support handling section restriction. 224 // - true if the iterator supports handling section restriction, and the 225 // section restriction has been applied. HandleSectionRestriction(SectionRestrictData * other_data)226 virtual bool HandleSectionRestriction(SectionRestrictData* other_data) { 227 return false; 228 } 229 230 virtual ~DocHitInfoIterator() = default; 231 232 // Returns: 233 // OK if was able to advance to a new document_id. 234 // INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR 235 // iterator 236 // RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over 237 virtual libtextclassifier3::Status Advance() = 0; 238 239 // Returns the DocHitInfo that the iterator is currently at. The DocHitInfo 240 // will have a kInvalidDocumentId if Advance() was not called after 241 // construction or if Advance returned an error. doc_hit_info()242 const DocHitInfo& doc_hit_info() const { return doc_hit_info_; } 243 244 // Returns CallStats of the DocHitInfoIterator tree. 245 virtual CallStats GetCallStats() const = 0; 246 247 // A string representing the iterator. 248 virtual std::string ToString() const = 0; 249 250 // For the last hit docid, retrieves all the matched query terms and other 251 // stats, see TermMatchInfo. 252 // filtering_section_mask filters the matching sections and should be set only 253 // by DocHitInfoIteratorSectionRestrict. 254 // If Advance() wasn't called after construction, Advance() returned false or 255 // the concrete HitIterator didn't override this method, the vectors aren't 256 // populated. 257 virtual void PopulateMatchedTermsStats( 258 std::vector<TermMatchInfo>* matched_terms_stats, 259 SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {} 260 261 protected: 262 DocHitInfo doc_hit_info_; 263 264 // Helper function to advance the given iterator to at most the given 265 // document_id. AdvanceTo(DocHitInfoIterator * it,DocumentId document_id)266 libtextclassifier3::StatusOr<DocumentId> AdvanceTo(DocHitInfoIterator* it, 267 DocumentId document_id) { 268 while (it->Advance().ok()) { 269 if (it->doc_hit_info().document_id() <= document_id) { 270 return it->doc_hit_info().document_id(); 271 } 272 } 273 274 // Didn't find anything for the other iterator, reset to invalid values and 275 // return. 276 doc_hit_info_ = DocHitInfo(kInvalidDocumentId); 277 return absl_ports::ResourceExhaustedError( 278 "No more DocHitInfos in iterator"); 279 } 280 }; 281 282 // A leaf node is a term node or a chain of section restriction node applied on 283 // a term node. 284 class DocHitInfoLeafIterator : public DocHitInfoIterator { 285 public: is_leaf()286 bool is_leaf() override { return true; } 287 288 // Calling MapChildren on leaf node does not make sense, and will do nothing. MapChildren(const ChildrenMapper & mapper)289 void MapChildren(const ChildrenMapper& mapper) override {} 290 }; 291 292 } // namespace lib 293 } // namespace icing 294 295 #endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_ 296