xref: /aosp_15_r20/external/icing/icing/index/lite/doc-hit-info-iterator-term-lite.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <numeric>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/index/hit/doc-hit-info.h"
30 #include "icing/index/hit/hit.h"
31 #include "icing/index/iterator/doc-hit-info-iterator.h"
32 #include "icing/index/lite/lite-index.h"
33 #include "icing/index/term-id-codec.h"
34 #include "icing/schema/section.h"
35 #include "icing/util/logging.h"
36 #include "icing/util/status-macros.h"
37 
38 namespace icing {
39 namespace lib {
40 
41 namespace {
42 
SectionIdMaskToString(SectionIdMask section_id_mask)43 std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
44   std::string mask(kTotalNumSections, '0');
45   for (SectionId i = kMaxSectionId; i >= 0; --i) {
46     if (section_id_mask & (UINT64_C(1) << i)) {
47       mask[kMaxSectionId - i] = '1';
48     }
49   }
50   return mask;
51 }
52 
53 }  // namespace
54 
Advance()55 libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
56   if (cached_hits_idx_ == -1) {
57     libtextclassifier3::Status status = RetrieveMoreHits();
58     if (!status.ok()) {
59       if (!absl_ports::IsNotFound(status)) {
60         // NOT_FOUND is expected to happen (not every term will be in the main
61         // index!). Other errors are worth logging.
62         ICING_LOG(ERROR)
63             << "Encountered unexpected failure while retrieving  hits "
64             << status.error_message();
65       }
66       return absl_ports::ResourceExhaustedError(
67           "No more DocHitInfos in iterator");
68     }
69   } else {
70     ++cached_hits_idx_;
71   }
72   if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
73     // Nothing more for the iterator to return. Set these members to invalid
74     // values.
75     doc_hit_info_ = DocHitInfo();
76     return absl_ports::ResourceExhaustedError(
77         "No more DocHitInfos in iterator");
78   }
79   ++num_advance_calls_;
80   doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
81   return libtextclassifier3::Status::OK;
82 }
83 
84 libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
TrimRightMostNode()85 DocHitInfoIteratorTermLite::TrimRightMostNode() && {
86   // Leaf iterator should trim itself.
87   DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
88                                           unnormalized_term_length_};
89   return node;
90 }
91 
RetrieveMoreHits()92 libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
93   // Exact match only. All hits in lite lexicon are exact.
94   ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
95   ICING_ASSIGN_OR_RETURN(uint32_t term_id,
96                          term_id_codec_->EncodeTvi(tvi, TviType::LITE));
97   lite_index_->FetchHits(
98       term_id, section_restrict_mask_,
99       /*only_from_prefix_sections=*/false,
100       /*score_by=*/
101       SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
102       /*namespace_checker=*/nullptr, &cached_hits_,
103       need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
104   cached_hits_idx_ = 0;
105   return libtextclassifier3::Status::OK;
106 }
107 
ToString() const108 std::string DocHitInfoIteratorTermLiteExact::ToString() const {
109   return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
110                             term_);
111 }
112 
113 libtextclassifier3::Status
RetrieveMoreHits()114 DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
115   // Take union of lite terms.
116   int term_len = term_.length();
117   int terms_matched = 0;
118   for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
119        it.IsValid(); it.Advance()) {
120     bool exact_match = it.GetKey().size() == term_len;
121     ICING_ASSIGN_OR_RETURN(
122         uint32_t term_id,
123         term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
124     lite_index_->FetchHits(
125         term_id, section_restrict_mask_,
126         /*only_from_prefix_sections=*/!exact_match,
127         /*score_by=*/
128         SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
129         /*namespace_checker=*/nullptr, &cached_hits_,
130         need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
131     ++terms_matched;
132   }
133   if (terms_matched > 1) {
134     SortAndDedupeDocumentIds();
135   }
136   cached_hits_idx_ = 0;
137   return libtextclassifier3::Status::OK;
138 }
139 
SortDocumentIds()140 void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
141   // Re-sort cached document_ids and merge sections.
142   if (!need_hit_term_frequency_) {
143     // If we don't need to also sort cached_hit_term_frequency_ along with
144     // cached_hits_, then just simply sort cached_hits_.
145     sort(cached_hits_.begin(), cached_hits_.end());
146   } else {
147     // Sort cached_hit_term_frequency_ along with cached_hits_.
148     std::vector<int> indices(cached_hits_.size());
149     std::iota(indices.begin(), indices.end(), 0);
150     std::sort(indices.begin(), indices.end(), [this](int i, int j) {
151       return cached_hits_[i] < cached_hits_[j];
152     });
153     // Now indices is a map from sorted index to current index. In other words,
154     // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
155     // for every valid i.
156     std::vector<bool> done(indices.size());
157     // Apply permutation
158     for (int i = 0; i < indices.size(); ++i) {
159       if (done[i]) {
160         continue;
161       }
162       done[i] = true;
163       int curr = i;
164       int next = indices[i];
165       // Since every finite permutation is formed by disjoint cycles, we can
166       // start with the current element, at index i, and swap the element at
167       // this position with whatever element that *should* be here. Then,
168       // continue to swap the original element, at its updated positions, with
169       // the element that should be occupying that position until the original
170       // element has reached *its* correct position. This completes applying the
171       // single cycle in the permutation.
172       while (next != i) {
173         std::swap(cached_hits_[curr], cached_hits_[next]);
174         std::swap(cached_hit_term_frequency_[curr],
175                   cached_hit_term_frequency_[next]);
176         done[next] = true;
177         curr = next;
178         next = indices[next];
179       }
180     }
181   }
182 }
183 
SortAndDedupeDocumentIds()184 void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
185   SortDocumentIds();
186   int idx = 0;
187   for (int i = 1; i < cached_hits_.size(); ++i) {
188     const DocHitInfo& hit_info = cached_hits_[i];
189     DocHitInfo& collapsed_hit_info = cached_hits_[idx];
190     if (collapsed_hit_info.document_id() == hit_info.document_id()) {
191       SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
192       collapsed_hit_info.MergeSectionsFrom(curr_mask);
193       if (need_hit_term_frequency_) {
194         Hit::TermFrequencyArray& collapsed_term_frequency =
195             cached_hit_term_frequency_[idx];
196         while (curr_mask) {
197           SectionId section_id = __builtin_ctzll(curr_mask);
198           // We add the new term-frequency to the existing term-frequency we've
199           // recorded for the current section-id in case there are multiple hits
200           // matching the query for this section.
201           // - This is the case for a prefix query where there are multiple
202           //   terms matching the prefix from the same sectionId + docId.
203           collapsed_term_frequency[section_id] +=
204               cached_hit_term_frequency_[i][section_id];
205           curr_mask &= ~(UINT64_C(1) << section_id);
206         }
207       }
208     } else {
209       // New document_id.
210       ++idx;
211       cached_hits_[idx] = hit_info;
212       if (need_hit_term_frequency_) {
213         cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
214       }
215     }
216   }
217   // idx points to last doc hit info.
218   cached_hits_.resize(idx + 1);
219   if (need_hit_term_frequency_) {
220     cached_hit_term_frequency_.resize(idx + 1);
221   }
222 }
223 
ToString() const224 std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
225   return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
226                             term_, "*");
227 }
228 
229 }  // namespace lib
230 }  // namespace icing
231