1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/main/posting-list-hit-serializer.h"
16
17 #include <cstdint>
18 #include <cstring>
19 #include <limits>
20 #include <vector>
21
22 #include "icing/text_classifier/lib3/utils/base/status.h"
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/file/posting_list/posting-list-used.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/legacy/index/icing-bit-util.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31
32 namespace icing {
33 namespace lib {
34
35 namespace {
36
GetTermFrequencyByteSize(const Hit & hit)37 uint32_t GetTermFrequencyByteSize(const Hit& hit) {
38 return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
39 }
40
GetFlagsByteSize(const Hit & hit)41 uint32_t GetFlagsByteSize(const Hit& hit) {
42 return hit.has_flags() ? sizeof(Hit::Flags) : 0;
43 }
44
45 } // namespace
46
GetBytesUsed(const PostingListUsed * posting_list_used) const47 uint32_t PostingListHitSerializer::GetBytesUsed(
48 const PostingListUsed* posting_list_used) const {
49 // The special hits will be included if they represent actual hits. If they
50 // represent the hit offset or the invalid hit sentinel, they are not
51 // included.
52 return posting_list_used->size_in_bytes() -
53 GetStartByteOffset(posting_list_used);
54 }
55
// Returns the size (in bytes) of the smallest posting list guaranteed to be
// able to hold every hit currently stored in posting_list_used.
uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
    const PostingListUsed* posting_list_used) const {
  if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
    // If in either the FULL state or ALMOST_FULL state, this posting list *is*
    // the minimum size posting list that can fit these hits. So just return the
    // size of the posting list.
    return posting_list_used->size_in_bytes();
  }

  // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we
  // should return kMinPostingListSize as we know that two hits is able to fit
  // in the 2 special hits position of a min-sized posting list.
  // - Checking this scenario requires deserializing posting_list_used, which is
  //   an expensive operation. However this is only possible when
  //   GetBytesUsed(posting_list_used) <=
  //   sizeof(uncompressed hit) + max(delta encoded value size) +
  //   (sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)), so we don't need to do
  //   this when the size exceeds this number.
  if (GetBytesUsed(posting_list_used) <=
      2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) {
    // Check if we're able to get more than 2 hits from posting_list_used.
    // limit=3 is enough to distinguish "<= 2 hits" from "more".
    std::vector<Hit> hits;
    libtextclassifier3::Status status =
        GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits);
    if (status.ok() && hits.size() <= 2) {
      return GetMinPostingListSize();
    }
  }

  // - In NOT_FULL status, BytesUsed contains no special hits. For a posting
  //   list in the NOT_FULL state with n hits, we would have n-1 compressed hits
  //   and 1 uncompressed hit.
  // - The minimum sized posting list that would be guaranteed to fit these hits
  //   would be FULL, but calculating the size required for the FULL posting
  //   list would require deserializing the last two added hits, so instead we
  //   will calculate the size of an ALMOST_FULL posting list to fit.
  // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the
  //   full uncompressed Hit in special_hit(1), and the n-1 compressed hits in
  //   the compressed region.
  // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed
  //   hits. However, it's possible that the uncompressed Hit is not a full hit,
  //   but rather only the Hit::Value (this is the case if
  //   !hit.has_term_frequency()).
  // - Therefore, fitting these hits into a posting list would require
  //   BytesUsed + one extra full hit + byte difference between a full hit and
  //   Hit::Value. i.e:
  //   ByteUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)).
  return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value);
}
105
Clear(PostingListUsed * posting_list_used) const106 void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
107 // Safe to ignore return value because posting_list_used->size_in_bytes() is
108 // a valid argument.
109 SetStartByteOffset(posting_list_used,
110 /*offset=*/posting_list_used->size_in_bytes());
111 }
112
MoveFrom(PostingListUsed * dst,PostingListUsed * src) const113 libtextclassifier3::Status PostingListHitSerializer::MoveFrom(
114 PostingListUsed* dst, PostingListUsed* src) const {
115 ICING_RETURN_ERROR_IF_NULL(dst);
116 ICING_RETURN_ERROR_IF_NULL(src);
117 if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
118 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
119 "src MinPostingListSizeToFit %d must be larger than size %d.",
120 GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
121 }
122
123 if (!IsPostingListValid(dst)) {
124 return absl_ports::FailedPreconditionError(
125 "Dst posting list is in an invalid state and can't be used!");
126 }
127 if (!IsPostingListValid(src)) {
128 return absl_ports::InvalidArgumentError(
129 "Cannot MoveFrom an invalid src posting list!");
130 }
131
132 // Pop just enough hits that all of src's compressed hits fit in
133 // dst posting_list's compressed area. Then we can memcpy that area.
134 std::vector<Hit> hits;
135 while (IsFull(src) || IsAlmostFull(src) ||
136 (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
137 if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
138 return absl_ports::AbortedError(
139 "Unable to retrieve hits from src posting list.");
140 }
141 }
142
143 // memcpy the area and set up start byte offset.
144 Clear(dst);
145 memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
146 src->posting_list_buffer() + GetStartByteOffset(src),
147 GetBytesUsed(src));
148 // Because we popped all hits from src outside of the compressed area and we
149 // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
150 // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
151 // NOT_FULL state, so ignoring the value is safe.
152 SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
153
154 // Put back remaining hits.
155 for (size_t i = 0; i < hits.size(); i++) {
156 const Hit& hit = hits[hits.size() - i - 1];
157 // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
158 // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
159 // should be impossible because we've already assured that there is enough
160 // room above.
161 ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
162 }
163
164 Clear(src);
165 return libtextclassifier3::Status::OK;
166 }
167
GetPadEnd(const PostingListUsed * posting_list_used,uint32_t offset) const168 uint32_t PostingListHitSerializer::GetPadEnd(
169 const PostingListUsed* posting_list_used, uint32_t offset) const {
170 Hit::Value pad;
171 uint32_t pad_end = offset;
172 while (pad_end < posting_list_used->size_in_bytes()) {
173 size_t pad_len = VarInt::Decode(
174 posting_list_used->posting_list_buffer() + pad_end, &pad);
175 if (pad != 0) {
176 // No longer a pad.
177 break;
178 }
179 pad_end += pad_len;
180 }
181 return pad_end;
182 }
183
PadToEnd(PostingListUsed * posting_list_used,uint32_t start,uint32_t end) const184 bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
185 uint32_t start, uint32_t end) const {
186 if (end > posting_list_used->size_in_bytes()) {
187 ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
188 return false;
189 }
190 // In VarInt a value of 0 encodes to 0.
191 memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
192 return true;
193 }
194
// Prepends hit to a posting list in the ALMOST_FULL state.
//
// In ALMOST_FULL, special position 1 holds the current first hit,
// uncompressed. If the delta between hit and that hit (plus the old hit's
// flags/term-frequency bytes) fits in the pad region, the old hit is
// re-encoded into the compressed area and hit takes special position 1
// (staying ALMOST_FULL). Otherwise hit goes into special position 0 and the
// list transitions to FULL.
//
// RETURNS:
//   - OK on success
//   - INVALID_ARGUMENT if hit is not strictly less than the current first hit
libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // Get delta between first hit and the new hit. Try to fit delta
  // in the padded area and put new hit at the special position 1.
  // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
  Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
  if (!(hit < cur)) {
    return absl_ports::InvalidArgumentError(
        "Hit being prepended must be strictly less than the most recent Hit");
  }
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len =
      EncodeNextHitValue(/*next_hit_value=*/hit.value(),
                         /*curr_hit_value=*/cur.value(), delta_buf);
  uint32_t cur_flags_bytes = GetFlagsByteSize(cur);
  uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);

  uint32_t pad_end = GetPadEnd(posting_list_used,
                               /*offset=*/kSpecialHitsSize);

  if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes +
                     cur_term_frequency_bytes) {
    // Pad area has enough space for delta, flags and term_frequency of existing
    // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes -
    // cur_term_frequency_bytes.
    uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
                            delta_len - cur_flags_bytes -
                            cur_term_frequency_bytes;
    memcpy(delta_offset, delta_buf, delta_len);

    // Now copy flags (if cur has any) immediately after the delta.
    Hit::Flags flags = cur.flags();
    uint8_t* flags_offset = delta_offset + delta_len;
    memcpy(flags_offset, &flags, cur_flags_bytes);
    // Copy term_frequency (if cur has one) after the flags.
    Hit::TermFrequency term_frequency = cur.term_frequency();
    uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes;
    memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);

    // Now first hit is the new hit, at special position 1. Safe to ignore the
    // return value because 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // No space for delta. We put the new hit at special position 0
    // and go to the full state. Safe to ignore the return value because 1 <
    // kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
247
// Prepends hit to an empty posting list.
//
// For a min-sized posting list (only the two special positions), the hit is
// stored uncompressed in special position 1, i.e. the ALMOST_FULL layout.
// Otherwise the hit is stored uncompressed at the tail of the compressed
// region and the list enters the NOT_FULL state.
void PostingListHitSerializer::PrependHitToEmpty(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // First hit to be added. Just add verbatim, no compression.
  if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
    // Safe to ignore the return value because 1 < kNumSpecialData
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Since this is the first hit, size != kSpecialHitsSize and
    // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
    // the compressed region, so ValueOrDie is safe.
    uint32_t offset =
        PrependHitUncompressed(posting_list_used, hit,
                               /*offset=*/posting_list_used->size_in_bytes())
            .ValueOrDie();
    // Safe to ignore the return value because PrependHitUncompressed is
    // guaranteed to return a valid offset.
    SetStartByteOffset(posting_list_used, offset);
  }
}
269
// Prepends hit to a posting list in the NOT_FULL state, where `offset` is the
// current start byte offset (the position of the uncompressed first hit).
//
// Three outcomes, depending on available space before `offset`:
//   1. The delta for the old first hit plus the new uncompressed hit fit in
//      the compressed region — stay NOT_FULL.
//   2. Only the delta fits — the new hit moves to special position 1 and the
//      list becomes ALMOST_FULL.
//   3. Not even the delta fits (varint expansion) — old first hit moves to
//      special position 1, new hit to special position 0; the list is FULL.
//
// RETURNS:
//   - OK on success
//   - INVALID_ARGUMENT if hit is not strictly less than the current first hit
//   - FAILED_PRECONDITION if the posting list layout is corrupt
libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
    PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
  // First hit in compressed area. It is uncompressed. See if delta
  // between the first hit and new hit will still fit in the
  // compressed area.
  if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
    // The first hit in the compressed region *should* be uncompressed, but
    // somehow there isn't enough room between offset and the end of the
    // compressed area to fit an uncompressed hit. This should NEVER happen.
    return absl_ports::FailedPreconditionError(
        "Posting list is in an invalid state.");
  }

  // Retrieve the last added (cur) hit's value and flags and compare to the hit
  // we're adding.
  Hit::Value cur_value;
  uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset;
  memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value));
  Hit::Flags cur_flags = Hit::kNoEnabledFlags;
  if (GetFlagsByteSize(Hit(cur_value)) > 0) {
    uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value);
    memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags));
  }
  // Term-frequency is not used for hit comparison so it's ok to pass in the
  // default term-frequency here.
  Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency);
  if (!(hit < cur)) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Hit (value=%d, flags=%d) being prepended must be strictly less than "
        "the most recent Hit (value=%d, flags=%d)",
        hit.value(), hit.flags(), cur_value, cur_flags));
  }
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len =
      EncodeNextHitValue(/*next_hit_value=*/hit.value(),
                         /*curr_hit_value=*/cur.value(), delta_buf);
  uint32_t hit_flags_bytes = GetFlagsByteSize(hit);
  uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);

  // offset now points to one past the end of the first hit.
  offset += sizeof(Hit::Value);
  if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes +
          hit_term_frequency_bytes <=
      offset) {
    // Enough space for delta in compressed area.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend new hit with (possibly) its flags and term_frequency. We know
    // that there is room for 'hit' because of the if statement above, so
    // calling ValueOrDie is safe.
    offset =
        PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
    // offset is guaranteed to be valid here. So it's safe to ignore the return
    // value. The if above will guarantee that offset >= kSpecialHitSize and <
    // posting_list_used->size_in_bytes() because the if ensures that there is
    // enough room between offset and kSpecialHitSize to fit the delta of the
    // previous hit, any term_frequency and the uncompressed hit.
    SetStartByteOffset(posting_list_used, offset);
  } else if (kSpecialHitsSize + delta_len <= offset) {
    // Only have space for delta. The new hit must be put in special
    // position 1.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend pad. Safe to ignore the return value of PadToEnd because offset
    // must be less than posting_list_used->size_in_bytes(). Otherwise, this
    // function already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);

    // Put new hit in special position 1. Safe to ignore return value because 1
    // < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);

    // State almost_full. Safe to ignore the return value because sizeof(Hit) is
    // a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Very rare case where delta is larger than sizeof(Hit::Value)
    // (i.e. varint delta encoding expanded required storage). We
    // move first hit to special position 1 and put new hit in
    // special position 0.

    // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
    // Therefore, offset must be less than kSpecialHitSize + 5. Since posting
    // list size must be divisible by sizeof(Hit) (6), it is guaranteed that
    // offset < size_in_bytes, so it is safe to ignore the return value here.
    ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent(
        posting_list_used, &cur, &offset));
    // Safe to ignore the return value of PadToEnd because offset must be less
    // than posting_list_used->size_in_bytes(). Otherwise, this function
    // already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);
    // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, cur);
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
377
PrependHit(PostingListUsed * posting_list_used,const Hit & hit) const378 libtextclassifier3::Status PostingListHitSerializer::PrependHit(
379 PostingListUsed* posting_list_used, const Hit& hit) const {
380 static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
381 "Hit::Value cannot be larger than 8 bytes because the delta "
382 "must be able to fit in 8 bytes.");
383 if (!hit.is_valid()) {
384 return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
385 }
386 if (!IsPostingListValid(posting_list_used)) {
387 return absl_ports::FailedPreconditionError(
388 "This PostingListUsed is in an invalid state and can't add any hits!");
389 }
390
391 if (IsFull(posting_list_used)) {
392 // State full: no space left.
393 return absl_ports::ResourceExhaustedError("No more room for hits");
394 } else if (IsAlmostFull(posting_list_used)) {
395 return PrependHitToAlmostFull(posting_list_used, hit);
396 } else if (IsEmpty(posting_list_used)) {
397 PrependHitToEmpty(posting_list_used, hit);
398 return libtextclassifier3::Status::OK;
399 } else {
400 uint32_t offset = GetStartByteOffset(posting_list_used);
401 return PrependHitToNotFull(posting_list_used, hit, offset);
402 }
403 }
404
405 libtextclassifier3::StatusOr<std::vector<Hit>>
GetHits(const PostingListUsed * posting_list_used) const406 PostingListHitSerializer::GetHits(
407 const PostingListUsed* posting_list_used) const {
408 std::vector<Hit> hits_out;
409 ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
410 return hits_out;
411 }
412
GetHits(const PostingListUsed * posting_list_used,std::vector<Hit> * hits_out) const413 libtextclassifier3::Status PostingListHitSerializer::GetHits(
414 const PostingListUsed* posting_list_used,
415 std::vector<Hit>* hits_out) const {
416 return GetHitsInternal(posting_list_used,
417 /*limit=*/std::numeric_limits<uint32_t>::max(),
418 /*pop=*/false, hits_out);
419 }
420
// Removes the first num_hits hits from posting_list_used.
//
// Popping a single hit from a FULL posting list is handled specially (pop two,
// re-prepend one) so that the pop exactly reverses the prepend that produced
// the FULL state — see the state diagrams below.
libtextclassifier3::Status PostingListHitSerializer::PopFrontHits(
    PostingListUsed* posting_list_used, uint32_t num_hits) const {
  if (num_hits == 1 && IsFull(posting_list_used)) {
    // The PL is in full status which means that we save 2 uncompressed hits in
    // the 2 special postions. But full status may be reached by 2 different
    // statuses.
    // (1) In "almost full" status
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // When we prepend another hit, we can only put it at the special
    // position 0. And we get a full PL
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2) In "not full" status
    // +-----------------+----------------+------+-------+------------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
    // +-----------------+----------------+------+-------+------------------+
    // When we prepend another hit, we can reach any of the 3 following
    // scenarios:
    // (2.1) not full
    // if the space of pad and original 1st hit can accommodate the new 1st hit
    // and the encoded delta value.
    // +-----------------+----------------+------+-----------+-----------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
    // +-----------------+----------------+------+-----------+-----------------+
    // (2.2) almost full
    // If the space of pad and original 1st hit cannot accommodate the new 1st
    // hit and the encoded delta value but can accommodate the encoded delta
    // value only. We can put the new 1st hit at special position 1.
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |new 1st hit     |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2.3) full
    // In very rare case, it cannot even accommodate only the encoded delta
    // value. we can move the original 1st hit into special position 1 and the
    // new 1st hit into special position 0. This may happen because we use
    // VarInt encoding method which may make the encoded value longer (about
    // 4/3 times of original)
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // Suppose now the PL is full. But we don't know whether it arrived to
    // this status from "not full" like (2.3) or from "almost full" like (1).
    // We'll return to "almost full" status like (1) if we simply pop the new
    // 1st hit but we want to make the prepending operation "reversible". So
    // there should be some way to return to "not full" if possible. A simple
    // way to do it is to pop 2 hits out of the PL to status "almost full" or
    // "not full". And add the original 1st hit back. We can return to the
    // correct original statuses of (2.1) or (1). This makes our prepending
    // operation reversible.
    std::vector<Hit> out;

    // Popping 2 hits should never fail because we've just ensured that the
    // posting list is in the FULL state.
    ICING_RETURN_IF_ERROR(
        GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));

    // PrependHit should never fail because out[1] is a valid hit less than
    // previous hits in the posting list and because there's no way that the
    // posting list could run out of room because it previously stored this hit
    // AND another hit.
    ICING_RETURN_IF_ERROR(PrependHit(posting_list_used, out[1]));
  } else if (num_hits > 0) {
    // General case: traverse-and-pop num_hits hits, discarding them.
    return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
                           nullptr);
  }
  return libtextclassifier3::Status::OK;
}
491
// Traverses up to `limit` hits from the front of posting_list_used, appending
// them to *out (if out is non-null). When `pop` is true, the traversed hits
// are also removed and the posting list is rewritten to start at the first
// remaining hit.
//
// RETURNS:
//   - OK on success
//   - INTERNAL_ERROR if the posting list is corrupted (a hit claims trailing
//     flags/term-frequency bytes that don't fit in the buffer)
libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
    std::vector<Hit>* out) const {
  // Put current uncompressed val here.
  Hit::Value val = Hit::kInvalidValue;
  uint32_t offset = GetStartByteOffset(posting_list_used);
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < kSpecialHitsSize) {
    // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
    // kNumSpecialData because of the check above.
    Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
                  .ValueOrDie();
    val = hit.value();
    if (out != nullptr) {
      out->push_back(hit);
    }
    offset += sizeof(Hit);
    count++;
  }

  // If special position 1 was set then we need to skip padding.
  if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
    offset = GetPadEnd(posting_list_used, offset);
  }

  while (count < limit && offset < posting_list_used->size_in_bytes()) {
    if (val == Hit::kInvalidValue) {
      // First hit is in compressed area. Put that in val.
      memcpy(&val, posting_list_used->posting_list_buffer() + offset,
             sizeof(Hit::Value));
      offset += sizeof(Hit::Value);
    } else {
      // Now we have delta encoded subsequent hits. Decode and push.
      DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
          posting_list_used->posting_list_buffer() + offset, val);
      offset += decoded_hit_info.encoded_size;
      val = decoded_hit_info.hit_value;
    }
    Hit hit(val);
    libtextclassifier3::Status status =
        ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset);
    if (!status.ok()) {
      // This posting list has been corrupted somehow. The first hit of the
      // posting list claims to have a term frequency or flag, but there's no
      // more room in the posting list for that term frequency or flag to exist.
      // Return an empty vector and zero to indicate no hits retrieved.
      if (out != nullptr) {
        out->clear();
      }
      return absl_ports::InternalError("Posting list has been corrupted!");
    }
    if (out != nullptr) {
      out->push_back(hit);
    }
    count++;
  }

  if (pop) {
    // The posting list is logically mutated when popping, even though the
    // parameter is const in the read-only path.
    PostingListUsed* mutable_posting_list_used =
        const_cast<PostingListUsed*>(posting_list_used);
    // Modify the posting list so that we pop all hits actually
    // traversed.
    if (offset >= kSpecialHitsSize &&
        offset < posting_list_used->size_in_bytes()) {
      // In the compressed area. Pop and reconstruct. offset/val is
      // the last traversed hit, which we must discard. So move one
      // more forward.
      DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
          posting_list_used->posting_list_buffer() + offset, val);
      offset += decoded_hit_info.encoded_size;
      val = decoded_hit_info.hit_value;

      // Now val is the first hit of the new posting list.
      if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
        // val fits in compressed area. Simply copy.
        offset -= sizeof(Hit::Value);
        memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
               sizeof(Hit::Value));
      } else {
        // val won't fit in compressed area. Also see if there is a flag or
        // term_frequency.
        Hit hit(val);
        libtextclassifier3::Status status =
            ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit,
                                                  &offset);
        if (!status.ok()) {
          // This posting list has been corrupted somehow. The first hit of
          // the posting list claims to have a term frequency or flag, but
          // there's no more room in the posting list for that term frequency or
          // flag to exist. Return an empty vector and zero to indicate no hits
          // retrieved. Do not pop anything.
          if (out != nullptr) {
            out->clear();
          }
          return absl_ports::InternalError("Posting list has been corrupted!");
        }
        // Okay to ignore the return value here because 1 < kNumSpecialData.
        SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);

        // Prepend pad. Safe to ignore the return value of PadToEnd because
        // offset must be less than posting_list_used->size_in_bytes() thanks to
        // the if above.
        PadToEnd(mutable_posting_list_used,
                 /*start=*/kSpecialHitsSize,
                 /*end=*/offset);
        offset = sizeof(Hit);
      }
    }
    // offset is guaranteed to be valid so ignoring the return value of
    // set_start_byte_offset is safe. It falls into one of four scenarios:
    // Scenario 1: the above if was false because offset is not <
    //             posting_list_used->size_in_bytes()
    //   In this case, offset must be == posting_list_used->size_in_bytes()
    //   because we reached offset by unwinding hits on the posting list.
    // Scenario 2: offset is < kSpecialHitSize
    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
    //   because offset is incremented by sizeof(Hit) within the first while
    //   loop.
    // Scenario 3: offset is within the compressed region and the new first hit
    //   in the posting list (the value that 'val' holds) will fit as an
    //   uncompressed hit in the compressed region. The resulting offset from
    //   decompressing val must be >= kSpecialHitSize because otherwise we'd be
    //   in Scenario 4
    // Scenario 4: offset is within the compressed region, but the new first hit
    //   in the posting list is too large to fit as an uncompressed hit in the
    //   in the compressed region. Therefore, it must be stored in a special hit
    //   and offset will be sizeof(Hit).
    SetStartByteOffset(mutable_posting_list_used, offset);
  }

  return libtextclassifier3::Status::OK;
}
626
GetSpecialHit(const PostingListUsed * posting_list_used,uint32_t index) const627 libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
628 const PostingListUsed* posting_list_used, uint32_t index) const {
629 static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
630 if (index >= kNumSpecialData || index < 0) {
631 return absl_ports::InvalidArgumentError(
632 "Special hits only exist at indices 0 and 1");
633 }
634 Hit val(Hit::kInvalidValue);
635 memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
636 sizeof(val));
637 return val;
638 }
639
SetSpecialHit(PostingListUsed * posting_list_used,uint32_t index,const Hit & val) const640 bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used,
641 uint32_t index,
642 const Hit& val) const {
643 if (index >= kNumSpecialData || index < 0) {
644 ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
645 return false;
646 }
647 memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
648 sizeof(val));
649 return true;
650 }
651
IsPostingListValid(const PostingListUsed * posting_list_used) const652 bool PostingListHitSerializer::IsPostingListValid(
653 const PostingListUsed* posting_list_used) const {
654 if (IsAlmostFull(posting_list_used)) {
655 // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
656 // know that 1 < kNumSpecialData.
657 if (!GetSpecialHit(posting_list_used, /*index=*/1)
658 .ValueOrDie()
659 .is_valid()) {
660 ICING_LOG(ERROR)
661 << "Both special hits cannot be invalid at the same time.";
662 return false;
663 }
664 } else if (!IsFull(posting_list_used)) {
665 // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
666 // safe because we know that 0 < kNumSpecialData.
667 if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
668 posting_list_used->size_in_bytes() ||
669 GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
670 kSpecialHitsSize) {
671 ICING_LOG(ERROR)
672 << "Hit: "
673 << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
674 << " size: " << posting_list_used->size_in_bytes()
675 << " sp size: " << kSpecialHitsSize;
676 return false;
677 }
678 }
679 return true;
680 }
681
GetStartByteOffset(const PostingListUsed * posting_list_used) const682 uint32_t PostingListHitSerializer::GetStartByteOffset(
683 const PostingListUsed* posting_list_used) const {
684 if (IsFull(posting_list_used)) {
685 return 0;
686 } else if (IsAlmostFull(posting_list_used)) {
687 return sizeof(Hit);
688 } else {
689 // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
690 // kNumSpecialData.
691 return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
692 }
693 }
694
SetStartByteOffset(PostingListUsed * posting_list_used,uint32_t offset) const695 bool PostingListHitSerializer::SetStartByteOffset(
696 PostingListUsed* posting_list_used, uint32_t offset) const {
697 if (offset > posting_list_used->size_in_bytes()) {
698 ICING_LOG(ERROR) << "offset cannot be a value greater than size "
699 << posting_list_used->size_in_bytes() << ". offset is "
700 << offset << ".";
701 return false;
702 }
703 if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
704 ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
705 << ", " << kSpecialHitsSize << "). offset is " << offset
706 << ".";
707 return false;
708 }
709 if (offset < sizeof(Hit) && offset != 0) {
710 ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
711 << "). offset is " << offset << ".";
712 return false;
713 }
714 if (offset >= kSpecialHitsSize) {
715 // not_full state. Safe to ignore the return value because 0 and 1 are both
716 // < kNumSpecialData.
717 SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
718 SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue));
719 } else if (offset == sizeof(Hit)) {
720 // almost_full state. Safe to ignore the return value because 1 is both <
721 // kNumSpecialData.
722 SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue));
723 }
724 // Nothing to do for the FULL state - the offset isn't actually stored
725 // anywhere and both special hits hold valid hits.
726 return true;
727 }
728
729 libtextclassifier3::StatusOr<uint32_t>
PrependHitUncompressed(PostingListUsed * posting_list_used,const Hit & hit,uint32_t offset) const730 PostingListHitSerializer::PrependHitUncompressed(
731 PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
732 uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) +
733 GetTermFrequencyByteSize(hit);
734
735 if (offset < kSpecialHitsSize + hit_bytes_to_prepend) {
736 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
737 "Not enough room to prepend Hit at offset %d.", offset));
738 }
739
740 if (hit.has_term_frequency()) {
741 offset -= sizeof(Hit::TermFrequency);
742 Hit::TermFrequency term_frequency = hit.term_frequency();
743 memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency,
744 sizeof(Hit::TermFrequency));
745 }
746 if (hit.has_flags()) {
747 offset -= sizeof(Hit::Flags);
748 Hit::Flags flags = hit.flags();
749 memcpy(posting_list_used->posting_list_buffer() + offset, &flags,
750 sizeof(Hit::Flags));
751 }
752 offset -= sizeof(Hit::Value);
753 Hit::Value val = hit.value();
754 memcpy(posting_list_used->posting_list_buffer() + offset, &val,
755 sizeof(Hit::Value));
756 return offset;
757 }
758
// Reads the optional flags byte (and, if the flags indicate one, the
// term-frequency byte) that follow a hit's value at `*offset`, merging them
// into `*hit` and advancing `*offset` past each byte consumed.
//
// NOTE: statement order matters here — `*hit` is rebuilt with the flags
// first, and only then is `hit->has_term_frequency()` consulted, since that
// check depends on the freshly-read flags.
//
// Returns:
//   OK on success (including the no-flags fast path).
//   INVALID_ARGUMENT if reading either byte would run past the end of the
//   posting list.
libtextclassifier3::Status
PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent(
    const PostingListUsed* posting_list_used, Hit* hit,
    uint32_t* offset) const {
  if (!hit->has_flags()) {
    // No flags (and by extension, no term-frequency) to consume. Everything is
    // fine.
    return libtextclassifier3::Status::OK;
  }

  // Consume flags
  Hit::Flags flags;
  if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "offset %d must not point past the end of the posting list of size "
        "%d.",
        *offset, posting_list_used->size_in_bytes()));
  }
  memcpy(&flags, posting_list_used->posting_list_buffer() + *offset,
         sizeof(Hit::Flags));
  // Rebuild the hit with the flags applied (term frequency still default);
  // the has_term_frequency() check below reads these new flags.
  *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency);
  *offset += sizeof(Hit::Flags);

  if (hit->has_term_frequency()) {
    // Consume term frequency
    if (*offset + sizeof(Hit::TermFrequency) >
        posting_list_used->size_in_bytes()) {
      return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
          "offset %d must not point past the end of the posting list of size "
          "%d.",
          *offset, posting_list_used->size_in_bytes()));
    }
    Hit::TermFrequency term_frequency;
    memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
           sizeof(Hit::TermFrequency));
    // Rebuild once more, now with the real term frequency.
    *hit = Hit(hit->value(), flags, term_frequency);
    *offset += sizeof(Hit::TermFrequency);
  }

  return libtextclassifier3::Status::OK;
}
800
801 } // namespace lib
802 } // namespace icing
803