xref: /aosp_15_r20/external/icing/icing/index/main/posting-list-hit-serializer.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/main/posting-list-hit-serializer.h"
16 
17 #include <cstdint>
18 #include <cstring>
19 #include <limits>
20 #include <vector>
21 
22 #include "icing/text_classifier/lib3/utils/base/status.h"
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/file/posting_list/posting-list-used.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/legacy/index/icing-bit-util.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31 
32 namespace icing {
33 namespace lib {
34 
35 namespace {
36 
GetTermFrequencyByteSize(const Hit & hit)37 uint32_t GetTermFrequencyByteSize(const Hit& hit) {
38   return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
39 }
40 
GetFlagsByteSize(const Hit & hit)41 uint32_t GetFlagsByteSize(const Hit& hit) {
42   return hit.has_flags() ? sizeof(Hit::Flags) : 0;
43 }
44 
45 }  // namespace
46 
GetBytesUsed(const PostingListUsed * posting_list_used) const47 uint32_t PostingListHitSerializer::GetBytesUsed(
48     const PostingListUsed* posting_list_used) const {
49   // The special hits will be included if they represent actual hits. If they
50   // represent the hit offset or the invalid hit sentinel, they are not
51   // included.
52   return posting_list_used->size_in_bytes() -
53          GetStartByteOffset(posting_list_used);
54 }
55 
// Returns the size (in bytes) of the smallest posting list that could still
// hold all hits currently stored in posting_list_used.
uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
    const PostingListUsed* posting_list_used) const {
  if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
    // If in either the FULL state or ALMOST_FULL state, this posting list *is*
    // the minimum size posting list that can fit these hits. So just return the
    // size of the posting list.
    return posting_list_used->size_in_bytes();
  }

  // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we
  // should return kMinPostingListSize as we know that two hits is able to fit
  // in the 2 special hits position of a min-sized posting list.
  // - Checking this scenario requires deserializing posting_list_used, which is
  //   an expensive operation. However, two hits can only be present when
  //   GetBytesUsed(posting_list_used) <=
  //     sizeof(Hit)                      (one full uncompressed hit)
  //     + 5                              (max varint-encoded delta size)
  //     + (sizeof(Hit) - sizeof(Hit::Value))  (flags/term-frequency bytes of
  //                                            the second hit)
  //   which simplifies to 2 * sizeof(Hit) + 5 - sizeof(Hit::Value), so we
  //   don't need to deserialize when the size exceeds this number.
  if (GetBytesUsed(posting_list_used) <=
      2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) {
    // Check if we're able to get more than 2 hits from posting_list_used
    std::vector<Hit> hits;
    libtextclassifier3::Status status =
        GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits);
    if (status.ok() && hits.size() <= 2) {
      return GetMinPostingListSize();
    }
  }

  // - In NOT_FULL status, BytesUsed contains no special hits. For a posting
  //   list in the NOT_FULL state with n hits, we would have n-1 compressed hits
  //   and 1 uncompressed hit.
  // - The minimum sized posting list that would be guaranteed to fit these hits
  //   would be FULL, but calculating the size required for the FULL posting
  //   list would require deserializing the last two added hits, so instead we
  //   will calculate the size of an ALMOST_FULL posting list to fit.
  // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the
  //   full uncompressed Hit in special_hit(1), and the n-1 compressed hits in
  //   the compressed region.
  // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed
  //   hits. However, it's possible that the uncompressed Hit is not a full hit,
  //   but rather only the Hit::Value (this is the case if
  //   !hit.has_term_frequency()).
  // - Therefore, fitting these hits into a posting list would require
  //   BytesUsed + one extra full hit + byte difference between a full hit and
  //   Hit::Value. i.e:
  //   ByteUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)).
  return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value);
}
105 
Clear(PostingListUsed * posting_list_used) const106 void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
107   // Safe to ignore return value because posting_list_used->size_in_bytes() is
108   // a valid argument.
109   SetStartByteOffset(posting_list_used,
110                      /*offset=*/posting_list_used->size_in_bytes());
111 }
112 
MoveFrom(PostingListUsed * dst,PostingListUsed * src) const113 libtextclassifier3::Status PostingListHitSerializer::MoveFrom(
114     PostingListUsed* dst, PostingListUsed* src) const {
115   ICING_RETURN_ERROR_IF_NULL(dst);
116   ICING_RETURN_ERROR_IF_NULL(src);
117   if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
118     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
119         "src MinPostingListSizeToFit %d must be larger than size %d.",
120         GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
121   }
122 
123   if (!IsPostingListValid(dst)) {
124     return absl_ports::FailedPreconditionError(
125         "Dst posting list is in an invalid state and can't be used!");
126   }
127   if (!IsPostingListValid(src)) {
128     return absl_ports::InvalidArgumentError(
129         "Cannot MoveFrom an invalid src posting list!");
130   }
131 
132   // Pop just enough hits that all of src's compressed hits fit in
133   // dst posting_list's compressed area. Then we can memcpy that area.
134   std::vector<Hit> hits;
135   while (IsFull(src) || IsAlmostFull(src) ||
136          (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
137     if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
138       return absl_ports::AbortedError(
139           "Unable to retrieve hits from src posting list.");
140     }
141   }
142 
143   // memcpy the area and set up start byte offset.
144   Clear(dst);
145   memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
146          src->posting_list_buffer() + GetStartByteOffset(src),
147          GetBytesUsed(src));
148   // Because we popped all hits from src outside of the compressed area and we
149   // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
150   // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
151   // NOT_FULL state, so ignoring the value is safe.
152   SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
153 
154   // Put back remaining hits.
155   for (size_t i = 0; i < hits.size(); i++) {
156     const Hit& hit = hits[hits.size() - i - 1];
157     // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
158     // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
159     // should be impossible because we've already assured that there is enough
160     // room above.
161     ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
162   }
163 
164   Clear(src);
165   return libtextclassifier3::Status::OK;
166 }
167 
GetPadEnd(const PostingListUsed * posting_list_used,uint32_t offset) const168 uint32_t PostingListHitSerializer::GetPadEnd(
169     const PostingListUsed* posting_list_used, uint32_t offset) const {
170   Hit::Value pad;
171   uint32_t pad_end = offset;
172   while (pad_end < posting_list_used->size_in_bytes()) {
173     size_t pad_len = VarInt::Decode(
174         posting_list_used->posting_list_buffer() + pad_end, &pad);
175     if (pad != 0) {
176       // No longer a pad.
177       break;
178     }
179     pad_end += pad_len;
180   }
181   return pad_end;
182 }
183 
PadToEnd(PostingListUsed * posting_list_used,uint32_t start,uint32_t end) const184 bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
185                                         uint32_t start, uint32_t end) const {
186   if (end > posting_list_used->size_in_bytes()) {
187     ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
188     return false;
189   }
190   // In VarInt a value of 0 encodes to 0.
191   memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
192   return true;
193 }
194 
// Prepends 'hit' to a posting list in the ALMOST_FULL state. The previous
// first hit (in special position 1) is delta-encoded into the pad area when
// it fits; otherwise the new hit goes into special position 0 and the list
// transitions to FULL.
//
// Returns:
//   - OK on success
//   - INVALID_ARGUMENT if 'hit' is not strictly less than the current first
//     hit
libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
    PostingListUsed* posting_list_used, const Hit& hit) const {
  // Get delta between first hit and the new hit. Try to fit delta
  // in the padded area and put new hit at the special position 1.
  // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
  Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
  if (!(hit < cur)) {
    return absl_ports::InvalidArgumentError(
        "Hit being prepended must be strictly less than the most recent Hit");
  }
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len =
      EncodeNextHitValue(/*next_hit_value=*/hit.value(),
                         /*curr_hit_value=*/cur.value(), delta_buf);
  // Extra bytes needed to push cur's flags/term-frequency into the
  // compressed region alongside its delta.
  uint32_t cur_flags_bytes = GetFlagsByteSize(cur);
  uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);

  uint32_t pad_end = GetPadEnd(posting_list_used,
                               /*offset=*/kSpecialHitsSize);

  if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes +
                     cur_term_frequency_bytes) {
    // Pad area has enough space for delta, flags and term_frequency of existing
    // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes -
    // cur_term_frequency_bytes.
    uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
                            delta_len - cur_flags_bytes -
                            cur_term_frequency_bytes;
    memcpy(delta_offset, delta_buf, delta_len);

    // Now copy flags (no-op when cur has none: cur_flags_bytes == 0).
    Hit::Flags flags = cur.flags();
    uint8_t* flags_offset = delta_offset + delta_len;
    memcpy(flags_offset, &flags, cur_flags_bytes);
    // Copy term_frequency (no-op when cur_term_frequency_bytes == 0).
    Hit::TermFrequency term_frequency = cur.term_frequency();
    uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes;
    memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);

    // Now first hit is the new hit, at special position 1. Safe to ignore the
    // return value because 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);
    // Safe to ignore the return value because sizeof(Hit) is a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // No space for delta. We put the new hit at special position 0
    // and go to the full state. Safe to ignore the return value because 1 <
    // kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
247 
PrependHitToEmpty(PostingListUsed * posting_list_used,const Hit & hit) const248 void PostingListHitSerializer::PrependHitToEmpty(
249     PostingListUsed* posting_list_used, const Hit& hit) const {
250   // First hit to be added. Just add verbatim, no compression.
251   if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
252     // Safe to ignore the return value because 1 < kNumSpecialData
253     SetSpecialHit(posting_list_used, /*index=*/1, hit);
254     // Safe to ignore the return value because sizeof(Hit) is a valid argument.
255     SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
256   } else {
257     // Since this is the first hit, size != kSpecialHitsSize and
258     // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
259     // the compressed region, so ValueOrDie is safe.
260     uint32_t offset =
261         PrependHitUncompressed(posting_list_used, hit,
262                                /*offset=*/posting_list_used->size_in_bytes())
263             .ValueOrDie();
264     // Safe to ignore the return value because PrependHitUncompressed is
265     // guaranteed to return a valid offset.
266     SetStartByteOffset(posting_list_used, offset);
267   }
268 }
269 
// Prepends 'hit' to a posting list in the NOT_FULL state. 'offset' is the
// current start byte offset, which points at the (uncompressed) most recently
// added hit. Depending on available space, the list stays NOT_FULL, becomes
// ALMOST_FULL, or becomes FULL.
//
// Returns:
//   - OK on success
//   - FAILED_PRECONDITION if the posting list is internally inconsistent
//   - INVALID_ARGUMENT if 'hit' is not strictly less than the current first
//     hit
libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
    PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
  // First hit in compressed area. It is uncompressed. See if delta
  // between the first hit and new hit will still fit in the
  // compressed area.
  if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
    // The first hit in the compressed region *should* be uncompressed, but
    // somehow there isn't enough room between offset and the end of the
    // compressed area to fit an uncompressed hit. This should NEVER happen.
    return absl_ports::FailedPreconditionError(
        "Posting list is in an invalid state.");
  }

  // Retrieve the last added (cur) hit's value and flags and compare to the hit
  // we're adding.
  Hit::Value cur_value;
  uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset;
  memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value));
  Hit::Flags cur_flags = Hit::kNoEnabledFlags;
  if (GetFlagsByteSize(Hit(cur_value)) > 0) {
    // The value alone indicates whether a flags byte follows it.
    uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value);
    memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags));
  }
  // Term-frequency is not used for hit comparison so it's ok to pass in the
  // default term-frequency here.
  Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency);
  if (!(hit < cur)) {
    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
        "Hit (value=%d, flags=%d) being prepended must be strictly less than "
        "the most recent Hit (value=%d, flags=%d)",
        hit.value(), hit.flags(), cur_value, cur_flags));
  }
  uint8_t delta_buf[VarInt::kMaxEncodedLen64];
  size_t delta_len =
      EncodeNextHitValue(/*next_hit_value=*/hit.value(),
                         /*curr_hit_value=*/cur.value(), delta_buf);
  uint32_t hit_flags_bytes = GetFlagsByteSize(hit);
  uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);

  // offset now points to one past the end of the first hit.
  offset += sizeof(Hit::Value);
  if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes +
          hit_term_frequency_bytes <=
      offset) {
    // Enough space for delta in compressed area.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend new hit with (possibly) its flags and term_frequency. We know
    // that there is room for 'hit' because of the if statement above, so
    // calling ValueOrDie is safe.
    offset =
        PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
    // offset is guaranteed to be valid here. So it's safe to ignore the return
    // value. The if above will guarantee that offset >= kSpecialHitSize and <
    // posting_list_used->size_in_bytes() because the if ensures that there is
    // enough room between offset and kSpecialHitSize to fit the delta of the
    // previous hit, any term_frequency and the uncompressed hit.
    SetStartByteOffset(posting_list_used, offset);
  } else if (kSpecialHitsSize + delta_len <= offset) {
    // Only have space for delta. The new hit must be put in special
    // position 1. The list transitions to ALMOST_FULL.

    // Prepend delta.
    offset -= delta_len;
    memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
           delta_len);

    // Prepend pad. Safe to ignore the return value of PadToEnd because offset
    // must be less than posting_list_used->size_in_bytes(). Otherwise, this
    // function already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);

    // Put new hit in special position 1. Safe to ignore return value because 1
    // < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, hit);

    // State almost_full. Safe to ignore the return value because sizeof(Hit) is
    // a valid argument.
    SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
  } else {
    // Very rare case where delta is larger than sizeof(Hit::Value)
    // (i.e. varint delta encoding expanded required storage). We
    // move first hit to special position 1 and put new hit in
    // special position 0. The list transitions to FULL.

    // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
    // Therefore, offset must be less than kSpecialHitSize + 5. Since posting
    // list size must be divisible by sizeof(Hit) (6), it is guaranteed that
    // offset < size_in_bytes, so it is safe to ignore the return value here.
    ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent(
        posting_list_used, &cur, &offset));
    // Safe to ignore the return value of PadToEnd because offset must be less
    // than posting_list_used->size_in_bytes(). Otherwise, this function
    // already would have returned FAILED_PRECONDITION.
    PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
             /*end=*/offset);
    // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
    SetSpecialHit(posting_list_used, /*index=*/1, cur);
    SetSpecialHit(posting_list_used, /*index=*/0, hit);
  }
  return libtextclassifier3::Status::OK;
}
377 
PrependHit(PostingListUsed * posting_list_used,const Hit & hit) const378 libtextclassifier3::Status PostingListHitSerializer::PrependHit(
379     PostingListUsed* posting_list_used, const Hit& hit) const {
380   static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
381                 "Hit::Value cannot be larger than 8 bytes because the delta "
382                 "must be able to fit in 8 bytes.");
383   if (!hit.is_valid()) {
384     return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
385   }
386   if (!IsPostingListValid(posting_list_used)) {
387     return absl_ports::FailedPreconditionError(
388         "This PostingListUsed is in an invalid state and can't add any hits!");
389   }
390 
391   if (IsFull(posting_list_used)) {
392     // State full: no space left.
393     return absl_ports::ResourceExhaustedError("No more room for hits");
394   } else if (IsAlmostFull(posting_list_used)) {
395     return PrependHitToAlmostFull(posting_list_used, hit);
396   } else if (IsEmpty(posting_list_used)) {
397     PrependHitToEmpty(posting_list_used, hit);
398     return libtextclassifier3::Status::OK;
399   } else {
400     uint32_t offset = GetStartByteOffset(posting_list_used);
401     return PrependHitToNotFull(posting_list_used, hit, offset);
402   }
403 }
404 
405 libtextclassifier3::StatusOr<std::vector<Hit>>
GetHits(const PostingListUsed * posting_list_used) const406 PostingListHitSerializer::GetHits(
407     const PostingListUsed* posting_list_used) const {
408   std::vector<Hit> hits_out;
409   ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
410   return hits_out;
411 }
412 
GetHits(const PostingListUsed * posting_list_used,std::vector<Hit> * hits_out) const413 libtextclassifier3::Status PostingListHitSerializer::GetHits(
414     const PostingListUsed* posting_list_used,
415     std::vector<Hit>* hits_out) const {
416   return GetHitsInternal(posting_list_used,
417                          /*limit=*/std::numeric_limits<uint32_t>::max(),
418                          /*pop=*/false, hits_out);
419 }
420 
// Removes the first (most recently prepended) 'num_hits' hits from the
// posting list. Popping a single hit from a FULL list takes a special path so
// that prepending remains reversible (see diagram below).
//
// Returns:
//   - OK on success
//   - INTERNAL_ERROR if the posting list is corrupted
libtextclassifier3::Status PostingListHitSerializer::PopFrontHits(
    PostingListUsed* posting_list_used, uint32_t num_hits) const {
  if (num_hits == 1 && IsFull(posting_list_used)) {
    // The PL is in full status which means that we save 2 uncompressed hits in
    // the 2 special positions. But full status may be reached by 2 different
    // statuses.
    // (1) In "almost full" status
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // When we prepend another hit, we can only put it at the special
    // position 0. And we get a full PL
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2) In "not full" status
    // +-----------------+----------------+------+-------+------------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
    // +-----------------+----------------+------+-------+------------------+
    // When we prepend another hit, we can reach any of the 3 following
    // scenarios:
    // (2.1) not full
    // if the space of pad and original 1st hit can accommodate the new 1st hit
    // and the encoded delta value.
    // +-----------------+----------------+------+-----------+-----------------+
    // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
    // +-----------------+----------------+------+-----------+-----------------+
    // (2.2) almost full
    // If the space of pad and original 1st hit cannot accommodate the new 1st
    // hit and the encoded delta value but can accommodate the encoded delta
    // value only. We can put the new 1st hit at special position 1.
    // +-----------------+----------------+-------+-----------------+
    // |Hit::kInvalidVal |new 1st hit     |(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // (2.3) full
    // In very rare case, it cannot even accommodate only the encoded delta
    // value. we can move the original 1st hit into special position 1 and the
    // new 1st hit into special position 0. This may happen because we use
    // VarInt encoding method which may make the encoded value longer (about
    // 4/3 times of original)
    // +-----------------+----------------+-------+-----------------+
    // |new 1st hit      |original 1st hit|(pad)  |(compressed) hits|
    // +-----------------+----------------+-------+-----------------+
    // Suppose now the PL is full. But we don't know whether it arrived to
    // this status from "not full" like (2.3) or from "almost full" like (1).
    // We'll return to "almost full" status like (1) if we simply pop the new
    // 1st hit but we want to make the prepending operation "reversible". So
    // there should be some way to return to "not full" if possible. A simple
    // way to do it is to pop 2 hits out of the PL to status "almost full" or
    // "not full".  And add the original 1st hit back. We can return to the
    // correct original statuses of (2.1) or (1). This makes our prepending
    // operation reversible.
    std::vector<Hit> out;

    // Popping 2 hits should never fail because we've just ensured that the
    // posting list is in the FULL state.
    ICING_RETURN_IF_ERROR(
        GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));

    // PrependHit should never fail because out[1] is a valid hit less than
    // previous hits in the posting list and because there's no way that the
    // posting list could run out of room because it previously stored this hit
    // AND another hit.
    ICING_RETURN_IF_ERROR(PrependHit(posting_list_used, out[1]));
  } else if (num_hits > 0) {
    // General case: pop the requested number of hits, discarding them.
    return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
                           nullptr);
  }
  return libtextclassifier3::Status::OK;
}
491 
// Walks the posting list from the most recent hit, decoding up to 'limit'
// hits. If 'out' is non-null, decoded hits are appended to it. If 'pop' is
// true, the traversed hits are removed from the posting list and its state is
// reconstructed so the remaining hits stay valid.
//
// Returns:
//   - OK on success
//   - INTERNAL_ERROR if the posting list is corrupted
libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
    const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
    std::vector<Hit>* out) const {
  // Put current uncompressed val here.
  Hit::Value val = Hit::kInvalidValue;
  uint32_t offset = GetStartByteOffset(posting_list_used);
  uint32_t count = 0;

  // First traverse the first two special positions.
  while (count < limit && offset < kSpecialHitsSize) {
    // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
    // kNumSpecialData because of the check above.
    Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
                  .ValueOrDie();
    val = hit.value();
    if (out != nullptr) {
      out->push_back(hit);
    }
    offset += sizeof(Hit);
    count++;
  }

  // If special position 1 was set then we need to skip padding.
  if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
    offset = GetPadEnd(posting_list_used, offset);
  }

  // Traverse the compressed region: the first hit there is uncompressed, the
  // rest are varint deltas against the previously decoded value.
  while (count < limit && offset < posting_list_used->size_in_bytes()) {
    if (val == Hit::kInvalidValue) {
      // First hit is in compressed area. Put that in val.
      memcpy(&val, posting_list_used->posting_list_buffer() + offset,
             sizeof(Hit::Value));
      offset += sizeof(Hit::Value);
    } else {
      // Now we have delta encoded subsequent hits. Decode and push.
      DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
          posting_list_used->posting_list_buffer() + offset, val);
      offset += decoded_hit_info.encoded_size;
      val = decoded_hit_info.hit_value;
    }
    Hit hit(val);
    libtextclassifier3::Status status =
        ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset);
    if (!status.ok()) {
      // This posting list has been corrupted somehow. The first hit of the
      // posting list claims to have a term frequency or flag, but there's no
      // more room in the posting list for that term frequency or flag to exist.
      // Return an empty vector and zero to indicate no hits retrieved.
      if (out != nullptr) {
        out->clear();
      }
      return absl_ports::InternalError("Posting list has been corrupted!");
    }
    if (out != nullptr) {
      out->push_back(hit);
    }
    count++;
  }

  if (pop) {
    // const_cast is needed because popping mutates the list even though this
    // helper is shared with read-only traversal.
    PostingListUsed* mutable_posting_list_used =
        const_cast<PostingListUsed*>(posting_list_used);
    // Modify the posting list so that we pop all hits actually
    // traversed.
    if (offset >= kSpecialHitsSize &&
        offset < posting_list_used->size_in_bytes()) {
      // In the compressed area. Pop and reconstruct. offset/val is
      // the last traversed hit, which we must discard. So move one
      // more forward.
      DecodedHitInfo decoded_hit_info = DecodeNextHitValue(
          posting_list_used->posting_list_buffer() + offset, val);
      offset += decoded_hit_info.encoded_size;
      val = decoded_hit_info.hit_value;

      // Now val is the first hit of the new posting list.
      if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
        // val fits in compressed area. Simply copy.
        offset -= sizeof(Hit::Value);
        memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
               sizeof(Hit::Value));
      } else {
        // val won't fit in compressed area. Also see if there is a flag or
        // term_frequency.
        Hit hit(val);
        libtextclassifier3::Status status =
            ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit,
                                                  &offset);
        if (!status.ok()) {
          // This posting list has been corrupted somehow. The first hit of
          // the posting list claims to have a term frequency or flag, but
          // there's no more room in the posting list for that term frequency or
          // flag to exist. Return an empty vector and zero to indicate no hits
          // retrieved. Do not pop anything.
          if (out != nullptr) {
            out->clear();
          }
          return absl_ports::InternalError("Posting list has been corrupted!");
        }
        // Okay to ignore the return value here because 1 < kNumSpecialData.
        SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);

        // Prepend pad. Safe to ignore the return value of PadToEnd because
        // offset must be less than posting_list_used->size_in_bytes() thanks to
        // the if above.
        PadToEnd(mutable_posting_list_used,
                 /*start=*/kSpecialHitsSize,
                 /*end=*/offset);
        offset = sizeof(Hit);
      }
    }
    // offset is guaranteed to be valid so ignoring the return value of
    // set_start_byte_offset is safe. It falls into one of four scenarios:
    // Scenario 1: the above if was false because offset is not <
    //             posting_list_used->size_in_bytes()
    //   In this case, offset must be == posting_list_used->size_in_bytes()
    //   because we reached offset by unwinding hits on the posting list.
    // Scenario 2: offset is < kSpecialHitSize
    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
    //   because offset is incremented by sizeof(Hit) within the first while
    //   loop.
    // Scenario 3: offset is within the compressed region and the new first hit
    //   in the posting list (the value that 'val' holds) will fit as an
    //   uncompressed hit in the compressed region. The resulting offset from
    //   decompressing val must be >= kSpecialHitSize because otherwise we'd be
    //   in Scenario 4
    // Scenario 4: offset is within the compressed region, but the new first hit
    //   in the posting list is too large to fit as an uncompressed hit in the
    //   in the compressed region. Therefore, it must be stored in a special hit
    //   and offset will be sizeof(Hit).
    SetStartByteOffset(mutable_posting_list_used, offset);
  }

  return libtextclassifier3::Status::OK;
}
626 
GetSpecialHit(const PostingListUsed * posting_list_used,uint32_t index) const627 libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
628     const PostingListUsed* posting_list_used, uint32_t index) const {
629   static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
630   if (index >= kNumSpecialData || index < 0) {
631     return absl_ports::InvalidArgumentError(
632         "Special hits only exist at indices 0 and 1");
633   }
634   Hit val(Hit::kInvalidValue);
635   memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
636          sizeof(val));
637   return val;
638 }
639 
SetSpecialHit(PostingListUsed * posting_list_used,uint32_t index,const Hit & val) const640 bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used,
641                                              uint32_t index,
642                                              const Hit& val) const {
643   if (index >= kNumSpecialData || index < 0) {
644     ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
645     return false;
646   }
647   memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
648          sizeof(val));
649   return true;
650 }
651 
IsPostingListValid(const PostingListUsed * posting_list_used) const652 bool PostingListHitSerializer::IsPostingListValid(
653     const PostingListUsed* posting_list_used) const {
654   if (IsAlmostFull(posting_list_used)) {
655     // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
656     // know that 1 < kNumSpecialData.
657     if (!GetSpecialHit(posting_list_used, /*index=*/1)
658              .ValueOrDie()
659              .is_valid()) {
660       ICING_LOG(ERROR)
661           << "Both special hits cannot be invalid at the same time.";
662       return false;
663     }
664   } else if (!IsFull(posting_list_used)) {
665     // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
666     // safe because we know that 0 < kNumSpecialData.
667     if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
668             posting_list_used->size_in_bytes() ||
669         GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
670             kSpecialHitsSize) {
671       ICING_LOG(ERROR)
672           << "Hit: "
673           << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
674           << " size: " << posting_list_used->size_in_bytes()
675           << " sp size: " << kSpecialHitsSize;
676       return false;
677     }
678   }
679   return true;
680 }
681 
GetStartByteOffset(const PostingListUsed * posting_list_used) const682 uint32_t PostingListHitSerializer::GetStartByteOffset(
683     const PostingListUsed* posting_list_used) const {
684   if (IsFull(posting_list_used)) {
685     return 0;
686   } else if (IsAlmostFull(posting_list_used)) {
687     return sizeof(Hit);
688   } else {
689     // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
690     // kNumSpecialData.
691     return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
692   }
693 }
694 
SetStartByteOffset(PostingListUsed * posting_list_used,uint32_t offset) const695 bool PostingListHitSerializer::SetStartByteOffset(
696     PostingListUsed* posting_list_used, uint32_t offset) const {
697   if (offset > posting_list_used->size_in_bytes()) {
698     ICING_LOG(ERROR) << "offset cannot be a value greater than size "
699                      << posting_list_used->size_in_bytes() << ". offset is "
700                      << offset << ".";
701     return false;
702   }
703   if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
704     ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
705                      << ", " << kSpecialHitsSize << "). offset is " << offset
706                      << ".";
707     return false;
708   }
709   if (offset < sizeof(Hit) && offset != 0) {
710     ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
711                      << "). offset is " << offset << ".";
712     return false;
713   }
714   if (offset >= kSpecialHitsSize) {
715     // not_full state. Safe to ignore the return value because 0 and 1 are both
716     // < kNumSpecialData.
717     SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
718     SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue));
719   } else if (offset == sizeof(Hit)) {
720     // almost_full state. Safe to ignore the return value because 1 is both <
721     // kNumSpecialData.
722     SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue));
723   }
724   // Nothing to do for the FULL state - the offset isn't actually stored
725   // anywhere and both special hits hold valid hits.
726   return true;
727 }
728 
729 libtextclassifier3::StatusOr<uint32_t>
PrependHitUncompressed(PostingListUsed * posting_list_used,const Hit & hit,uint32_t offset) const730 PostingListHitSerializer::PrependHitUncompressed(
731     PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
732   uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) +
733                                   GetTermFrequencyByteSize(hit);
734 
735   if (offset < kSpecialHitsSize + hit_bytes_to_prepend) {
736     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
737         "Not enough room to prepend Hit at offset %d.", offset));
738   }
739 
740   if (hit.has_term_frequency()) {
741     offset -= sizeof(Hit::TermFrequency);
742     Hit::TermFrequency term_frequency = hit.term_frequency();
743     memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency,
744            sizeof(Hit::TermFrequency));
745   }
746   if (hit.has_flags()) {
747     offset -= sizeof(Hit::Flags);
748     Hit::Flags flags = hit.flags();
749     memcpy(posting_list_used->posting_list_buffer() + offset, &flags,
750            sizeof(Hit::Flags));
751   }
752   offset -= sizeof(Hit::Value);
753   Hit::Value val = hit.value();
754   memcpy(posting_list_used->posting_list_buffer() + offset, &val,
755          sizeof(Hit::Value));
756   return offset;
757 }
758 
759 libtextclassifier3::Status
ConsumeFlagsAndTermFrequencyIfPresent(const PostingListUsed * posting_list_used,Hit * hit,uint32_t * offset) const760 PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent(
761     const PostingListUsed* posting_list_used, Hit* hit,
762     uint32_t* offset) const {
763   if (!hit->has_flags()) {
764     // No flags (and by extension, no term-frequency) to consume. Everything is
765     // fine.
766     return libtextclassifier3::Status::OK;
767   }
768 
769   // Consume flags
770   Hit::Flags flags;
771   if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) {
772     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
773         "offset %d must not point past the end of the posting list of size "
774         "%d.",
775         *offset, posting_list_used->size_in_bytes()));
776   }
777   memcpy(&flags, posting_list_used->posting_list_buffer() + *offset,
778          sizeof(Hit::Flags));
779   *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency);
780   *offset += sizeof(Hit::Flags);
781 
782   if (hit->has_term_frequency()) {
783     // Consume term frequency
784     if (*offset + sizeof(Hit::TermFrequency) >
785         posting_list_used->size_in_bytes()) {
786       return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
787           "offset %d must not point past the end of the posting list of size "
788           "%d.",
789           *offset, posting_list_used->size_in_bytes()));
790     }
791     Hit::TermFrequency term_frequency;
792     memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
793            sizeof(Hit::TermFrequency));
794     *hit = Hit(hit->value(), flags, term_frequency);
795     *offset += sizeof(Hit::TermFrequency);
796   }
797 
798   return libtextclassifier3::Status::OK;
799 }
800 
801 }  // namespace lib
802 }  // namespace icing
803