1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <numeric>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/index/hit/doc-hit-info.h"
30 #include "icing/index/hit/hit.h"
31 #include "icing/index/iterator/doc-hit-info-iterator.h"
32 #include "icing/index/lite/lite-index.h"
33 #include "icing/index/term-id-codec.h"
34 #include "icing/schema/section.h"
35 #include "icing/util/logging.h"
36 #include "icing/util/status-macros.h"
37
38 namespace icing {
39 namespace lib {
40
41 namespace {
42
SectionIdMaskToString(SectionIdMask section_id_mask)43 std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
44 std::string mask(kTotalNumSections, '0');
45 for (SectionId i = kMaxSectionId; i >= 0; --i) {
46 if (section_id_mask & (UINT64_C(1) << i)) {
47 mask[kMaxSectionId - i] = '1';
48 }
49 }
50 return mask;
51 }
52
53 } // namespace
54
Advance()55 libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
56 if (cached_hits_idx_ == -1) {
57 libtextclassifier3::Status status = RetrieveMoreHits();
58 if (!status.ok()) {
59 if (!absl_ports::IsNotFound(status)) {
60 // NOT_FOUND is expected to happen (not every term will be in the main
61 // index!). Other errors are worth logging.
62 ICING_LOG(ERROR)
63 << "Encountered unexpected failure while retrieving hits "
64 << status.error_message();
65 }
66 return absl_ports::ResourceExhaustedError(
67 "No more DocHitInfos in iterator");
68 }
69 } else {
70 ++cached_hits_idx_;
71 }
72 if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
73 // Nothing more for the iterator to return. Set these members to invalid
74 // values.
75 doc_hit_info_ = DocHitInfo();
76 return absl_ports::ResourceExhaustedError(
77 "No more DocHitInfos in iterator");
78 }
79 ++num_advance_calls_;
80 doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
81 return libtextclassifier3::Status::OK;
82 }
83
84 libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
TrimRightMostNode()85 DocHitInfoIteratorTermLite::TrimRightMostNode() && {
86 // Leaf iterator should trim itself.
87 DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
88 unnormalized_term_length_};
89 return node;
90 }
91
RetrieveMoreHits()92 libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
93 // Exact match only. All hits in lite lexicon are exact.
94 ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
95 ICING_ASSIGN_OR_RETURN(uint32_t term_id,
96 term_id_codec_->EncodeTvi(tvi, TviType::LITE));
97 lite_index_->FetchHits(
98 term_id, section_restrict_mask_,
99 /*only_from_prefix_sections=*/false,
100 /*score_by=*/
101 SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
102 /*namespace_checker=*/nullptr, &cached_hits_,
103 need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
104 cached_hits_idx_ = 0;
105 return libtextclassifier3::Status::OK;
106 }
107
ToString() const108 std::string DocHitInfoIteratorTermLiteExact::ToString() const {
109 return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
110 term_);
111 }
112
113 libtextclassifier3::Status
RetrieveMoreHits()114 DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
115 // Take union of lite terms.
116 int term_len = term_.length();
117 int terms_matched = 0;
118 for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
119 it.IsValid(); it.Advance()) {
120 bool exact_match = it.GetKey().size() == term_len;
121 ICING_ASSIGN_OR_RETURN(
122 uint32_t term_id,
123 term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
124 lite_index_->FetchHits(
125 term_id, section_restrict_mask_,
126 /*only_from_prefix_sections=*/!exact_match,
127 /*score_by=*/
128 SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
129 /*namespace_checker=*/nullptr, &cached_hits_,
130 need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
131 ++terms_matched;
132 }
133 if (terms_matched > 1) {
134 SortAndDedupeDocumentIds();
135 }
136 cached_hits_idx_ = 0;
137 return libtextclassifier3::Status::OK;
138 }
139
SortDocumentIds()140 void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
141 // Re-sort cached document_ids and merge sections.
142 if (!need_hit_term_frequency_) {
143 // If we don't need to also sort cached_hit_term_frequency_ along with
144 // cached_hits_, then just simply sort cached_hits_.
145 sort(cached_hits_.begin(), cached_hits_.end());
146 } else {
147 // Sort cached_hit_term_frequency_ along with cached_hits_.
148 std::vector<int> indices(cached_hits_.size());
149 std::iota(indices.begin(), indices.end(), 0);
150 std::sort(indices.begin(), indices.end(), [this](int i, int j) {
151 return cached_hits_[i] < cached_hits_[j];
152 });
153 // Now indices is a map from sorted index to current index. In other words,
154 // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
155 // for every valid i.
156 std::vector<bool> done(indices.size());
157 // Apply permutation
158 for (int i = 0; i < indices.size(); ++i) {
159 if (done[i]) {
160 continue;
161 }
162 done[i] = true;
163 int curr = i;
164 int next = indices[i];
165 // Since every finite permutation is formed by disjoint cycles, we can
166 // start with the current element, at index i, and swap the element at
167 // this position with whatever element that *should* be here. Then,
168 // continue to swap the original element, at its updated positions, with
169 // the element that should be occupying that position until the original
170 // element has reached *its* correct position. This completes applying the
171 // single cycle in the permutation.
172 while (next != i) {
173 std::swap(cached_hits_[curr], cached_hits_[next]);
174 std::swap(cached_hit_term_frequency_[curr],
175 cached_hit_term_frequency_[next]);
176 done[next] = true;
177 curr = next;
178 next = indices[next];
179 }
180 }
181 }
182 }
183
SortAndDedupeDocumentIds()184 void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
185 SortDocumentIds();
186 int idx = 0;
187 for (int i = 1; i < cached_hits_.size(); ++i) {
188 const DocHitInfo& hit_info = cached_hits_[i];
189 DocHitInfo& collapsed_hit_info = cached_hits_[idx];
190 if (collapsed_hit_info.document_id() == hit_info.document_id()) {
191 SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
192 collapsed_hit_info.MergeSectionsFrom(curr_mask);
193 if (need_hit_term_frequency_) {
194 Hit::TermFrequencyArray& collapsed_term_frequency =
195 cached_hit_term_frequency_[idx];
196 while (curr_mask) {
197 SectionId section_id = __builtin_ctzll(curr_mask);
198 // We add the new term-frequency to the existing term-frequency we've
199 // recorded for the current section-id in case there are multiple hits
200 // matching the query for this section.
201 // - This is the case for a prefix query where there are multiple
202 // terms matching the prefix from the same sectionId + docId.
203 collapsed_term_frequency[section_id] +=
204 cached_hit_term_frequency_[i][section_id];
205 curr_mask &= ~(UINT64_C(1) << section_id);
206 }
207 }
208 } else {
209 // New document_id.
210 ++idx;
211 cached_hits_[idx] = hit_info;
212 if (need_hit_term_frequency_) {
213 cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
214 }
215 }
216 }
217 // idx points to last doc hit info.
218 cached_hits_.resize(idx + 1);
219 if (need_hit_term_frequency_) {
220 cached_hit_term_frequency_.resize(idx + 1);
221 }
222 }
223
ToString() const224 std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
225 return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
226 term_, "*");
227 }
228
229 } // namespace lib
230 } // namespace icing
231