xref: /aosp_15_r20/external/icing/icing/index/main/posting-list-hit-serializer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
16 #define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
17 
18 #include <cstddef>
19 #include <cstdint>
20 #include <vector>
21 
22 #include "icing/text_classifier/lib3/utils/base/status.h"
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/file/posting_list/posting-list-common.h"
25 #include "icing/file/posting_list/posting-list-used.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/legacy/index/icing-bit-util.h"
28 #include "icing/util/status-macros.h"
29 
30 namespace icing {
31 namespace lib {
32 
33 // A serializer class to serialize hits to PostingListUsed. Layout described in
34 // comments in posting-list-hit-serializer.cc.
35 class PostingListHitSerializer : public PostingListSerializer {
36  public:
37   static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);
38 
39   struct DecodedHitInfo {
40     // The decoded hit value.
41     Hit::Value hit_value;
42 
43     // Size of the encoded hit in bytes.
44     size_t encoded_size;
45   };
46 
47   // Given the current hit value, encodes the next hit value for serialization
48   // in the posting list.
49   //
50   // The encoded value is the varint-encoded delta between next_hit_value and
51   // curr_hit_value.
52   // - We add 1 to this delta so as to avoid getting a delta value of 0.
53   // - This allows us to add duplicate hits with the same value, which is a
54   //   valid case if we need to store hits with different flags that belong in
55   //   the same section-id/doc-id combo.
56   // - We cannot have an encoded hit delta with a value of 0 as 0 is currently
57   //   used for padding the unused region in the posting list.
58   //
59   // REQUIRES: next_hit_value <= curr_hit_value AND
60   //           curr_hit_value - next_hit_value <
61   //              std::numeric_limits<Hit::Value>::max()
62   //
63   // RETURNS: next_hit_value's encoded length in bytes and writes the encoded
64   //          value directly into encoded_buf_out.
EncodeNextHitValue(Hit::Value next_hit_value,Hit::Value curr_hit_value,uint8_t * encoded_buf_out)65   static size_t EncodeNextHitValue(Hit::Value next_hit_value,
66                                    Hit::Value curr_hit_value,
67                                    uint8_t* encoded_buf_out) {
68     uint64_t delta = curr_hit_value - next_hit_value + 1;
69     return VarInt::Encode(delta, encoded_buf_out);
70   }
71 
72   // Given the current hit value, decodes the next hit value from an encoded
73   // byte array buffer.
74   //
75   // RETURNS: DecodedHitInfo containing the decoded hit value and the value's
76   //          encoded size in bytes.
DecodeNextHitValue(const uint8_t * encoded_buf_in,Hit::Value curr_hit_value)77   static DecodedHitInfo DecodeNextHitValue(const uint8_t* encoded_buf_in,
78                                            Hit::Value curr_hit_value) {
79     uint64_t delta;
80     size_t delta_size = VarInt::Decode(encoded_buf_in, &delta);
81     Hit::Value hit_value = curr_hit_value + delta - 1;
82     return {hit_value, delta_size};
83   }
84 
GetDataTypeBytes()85   uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
86 
GetMinPostingListSize()87   uint32_t GetMinPostingListSize() const override {
88     static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
89     static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
90                   "PostingListIndex must be small enough to fit in a "
91                   "minimum-sized Posting List.");
92 
93     return kMinPostingListSize;
94   }
95 
96   uint32_t GetMinPostingListSizeToFit(
97       const PostingListUsed* posting_list_used) const override;
98 
99   uint32_t GetBytesUsed(
100       const PostingListUsed* posting_list_used) const override;
101 
102   void Clear(PostingListUsed* posting_list_used) const override;
103 
104   libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
105                                       PostingListUsed* src) const override;
106 
107   // Prepend a hit to the posting list.
108   //
109   // RETURNS:
110   //   - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
111   //       previously added hit.
112   //   - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
113   //       list.
114   libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
115                                         const Hit& hit) const;
116 
117   // Retrieves the hits stored in the posting list.
118   //
119   // RETURNS:
120   //   - On success, a vector of hits sorted by the reverse order of prepending.
121   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
122   libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
123       const PostingListUsed* posting_list_used) const;
124 
125   // Same as GetHits but appends hits to hits_out.
126   //
127   // RETURNS:
128   //   - On success, a vector of hits sorted by the reverse order of prepending.
129   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
130   libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
131                                      std::vector<Hit>* hits_out) const;
132 
133   // Undo the last num_hits hits prepended. If num_hits > number of
134   // hits we clear all hits.
135   //
136   // RETURNS:
137   //   - OK on success
138   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
139   libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
140                                           uint32_t num_hits) const;
141 
142  private:
143   // Posting list layout formats:
144   //
145   // not_full
146   //
147   // +-----------------+----------------+-------+-----------------+
148   // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
149   // +-----------------+----------------+-------+-----------------+
150   //
151   // almost_full
152   //
153   // +-----------------+----------------+-------+-----------------+
154   // |Hit::kInvalidVal |1st hit         |(pad)  |(compressed) hits|
155   // +-----------------+----------------+-------+-----------------+
156   //
157   // full()
158   //
159   // +-----------------+----------------+-------+-----------------+
160   // |1st hit          |2nd hit         |(pad)  |(compressed) hits|
161   // +-----------------+----------------+-------+-----------------+
162   //
163   // The first two uncompressed hits also implicitly encode information about
164   // the size of the compressed hits region.
165   //
166   // 1. If the posting list is NOT_FULL, then
167   // posting_list_buffer_[0] contains the byte offset of the start of the
168   // compressed hits - and, thus, the size of the compressed hits region is
169   // size_in_bytes - posting_list_buffer_[0].
170   //
171   // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
172   // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
173   // - 1] and ends at size_in_bytes - 1.
174   //
175   // Hit term frequencies are stored after the hit value, compressed or
176   // uncompressed. For the first two special hits, we always have a
177   // space for the term frequency. For hits in the compressed area, we only have
178   // the term frequency following the hit value of hit.has_term_frequency() is
179   // true. This allows good compression in the common case where hits don't have
180   // a valid term frequency.
181   //
182   // EXAMPLE
183   // Posting list storage. Posting list size: 20 bytes
184   // EMPTY!
185   // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
186   // |     20      |Hit::kInvalidVal|                 0x000                  |
187   // +-------------+----------------+----------------+-----------------------+
188   //
189   // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
190   // NOT FULL!
191   // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
192   // |     16      |Hit::kInvalidVal|      0x000      |       0x07FFF998     |
193   // +-------------+----------------+-----------------+----------------------+
194   //
195   // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
196   // TermFrequency=125)
197   // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
198   // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
199   // |      13     |Hit::kInvalidVal|   0x000   | 0x07FFF684| 125  |    788    |
200   // +-------------+----------------+-----------+-----------+------+-----------+
201   //
202   // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
203   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
204   // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
205   // |      9      |Hit::kInvVal|  0x00  |0x07FFF4D2|   434   | 125  |   788   |
206   // +-------------+------------+--------+----------+---------+------+---------+
207   //
208   // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
209   // TermFrequency = 87)
210   // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL!
211   // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
212   // |Hit::kInvVal|0x07FFF40E,87|  0x000  |    196  |   434   |  125 |   788   |
213   // +-------------+------------+---------+---------+---------+------+---------+
214   //
215   // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
216   // FULL!
217   // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20
218   // -+ | 0x07FFF320  |0x07FFF40E,87|  0x000  |    196   |   434   |  125 | 788
219   // |
220   // +-------------+-------------+---------+----------+---------+------+---------+
221 
222   // Helpers to determine what state the posting list is in.
IsFull(const PostingListUsed * posting_list_used)223   bool IsFull(const PostingListUsed* posting_list_used) const {
224     return GetSpecialHit(posting_list_used, /*index=*/0)
225                .ValueOrDie()
226                .is_valid() &&
227            GetSpecialHit(posting_list_used, /*index=*/1)
228                .ValueOrDie()
229                .is_valid();
230   }
231 
IsAlmostFull(const PostingListUsed * posting_list_used)232   bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
233     return !GetSpecialHit(posting_list_used, /*index=*/0)
234                 .ValueOrDie()
235                 .is_valid();
236   }
237 
IsEmpty(const PostingListUsed * posting_list_used)238   bool IsEmpty(const PostingListUsed* posting_list_used) const {
239     return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
240                posting_list_used->size_in_bytes() &&
241            !GetSpecialHit(posting_list_used, /*index=*/1)
242                 .ValueOrDie()
243                 .is_valid();
244   }
245 
246   // Returns false if both special hits are invalid or if the offset value
247   // stored in the special hit is less than kSpecialHitsSize or greater than
248   // posting_list_used->size_in_bytes(). Returns true, otherwise.
249   bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
250 
251   // Prepend hit to a posting list that is in the ALMOST_FULL state.
252   // RETURNS:
253   //  - OK, if successful
254   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
255   libtextclassifier3::Status PrependHitToAlmostFull(
256       PostingListUsed* posting_list_used, const Hit& hit) const;
257 
258   // Prepend hit to a posting list that is in the EMPTY state. This will always
259   // succeed because there are no pre-existing hits and no validly constructed
260   // posting list could fail to fit one hit.
261   void PrependHitToEmpty(PostingListUsed* posting_list_used,
262                          const Hit& hit) const;
263 
264   // Prepend hit to a posting list that is in the NOT_FULL state.
265   // RETURNS:
266   //  - OK, if successful
267   //  - INVALID_ARGUMENT if hit is not less than the previously added hit.
268   libtextclassifier3::Status PrependHitToNotFull(
269       PostingListUsed* posting_list_used, const Hit& hit,
270       uint32_t offset) const;
271 
272   // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
273   // a byte offset between kSpecialHitsSize and
274   // posting_list_used->size_in_bytes() (inclusive) (not_full state).
275   uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
276 
277   // Sets the special hits to properly reflect what offset is (see layout
278   // comment for further details).
279   //
280   // Returns false if offset > posting_list_used->size_in_bytes() or offset is
281   // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True,
282   // otherwise.
283   bool SetStartByteOffset(PostingListUsed* posting_list_used,
284                           uint32_t offset) const;
285 
286   // Manipulate padded areas. We never store the same hit value twice
287   // so a delta of 0 is a pad byte.
288 
289   // Returns offset of first non-pad byte.
290   uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
291                      uint32_t offset) const;
292 
293   // Fill padding between offset start and offset end with 0s.
294   // Returns false if end > posting_list_used->size_in_bytes(). True,
295   // otherwise.
296   bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
297                 uint32_t end) const;
298 
299   // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all
300   // hits in the posting list if the posting list contains less than limit
301   // number of hits. out can be NULL.
302   //
303   // NOTE: If called with limit=1, pop=true on a posting list that transitioned
304   // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
305   // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
306   // ALMOST_FULL.
307   //
308   // RETURNS:
309   //   - OK on success
310   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
311   libtextclassifier3::Status GetHitsInternal(
312       const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
313       std::vector<Hit>* out) const;
314 
315   // Retrieves the value stored in the index-th special hit.
316   //
317   // RETURNS:
318   //   - A valid Hit, on success
319   //   - INVALID_ARGUMENT if index is not less than kNumSpecialData
320   libtextclassifier3::StatusOr<Hit> GetSpecialHit(
321       const PostingListUsed* posting_list_used, uint32_t index) const;
322 
323   // Sets the value stored in the index-th special hit to val. If index is not
324   // less than kSpecialHitSize / sizeof(Hit), this has no effect.
325   bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
326                      const Hit& val) const;
327 
328   // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
329   // returns the new beginning of the padded region.
330   //
331   // RETURNS:
332   //   - The new beginning of the padded region, if successful.
333   //   - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
334   // kSpecialHitsSize
335   libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
336       PostingListUsed* posting_list_used, const Hit& hit,
337       uint32_t offset) const;
338 
339   // If hit has the flags and/or term frequency field, consumes the flags and/or
340   // term frequency at offset, updates hit to include the flag and/or term
341   // frequency and updates offset to reflect that the flag and/or term frequency
342   // fields have been consumed.
343   //
344   // RETURNS:
345   //   - OK, if successful
346   //   - INVALID_ARGUMENT if hit has a flags and/or term frequency field and
347   //     offset + sizeof(Hit's flag) + sizeof(Hit's tf) >=
348   //     posting_list_used->size_in_bytes()
349   //     i.e. the posting list is not large enough to consume the hit's flags
350   //     and term frequency fields
351   libtextclassifier3::Status ConsumeFlagsAndTermFrequencyIfPresent(
352       const PostingListUsed* posting_list_used, Hit* hit,
353       uint32_t* offset) const;
354 };
355 
356 }  // namespace lib
357 }  // namespace icing
358 
359 #endif  // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
360