xref: /aosp_15_r20/external/icing/icing/index/numeric/posting-list-integer-index-serializer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
16 #define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
17 
18 #include <cstdint>
19 #include <vector>
20 
21 #include "icing/text_classifier/lib3/utils/base/status.h"
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/file/posting_list/posting-list-common.h"
24 #include "icing/file/posting_list/posting-list-used.h"
25 #include "icing/index/numeric/integer-index-data.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // A serializer class to serialize IntegerIndexData to PostingListUsed.
31 class PostingListIntegerIndexSerializer : public PostingListSerializer {
32  public:
33   using SpecialDataType = SpecialData<IntegerIndexData>;
34   static_assert(sizeof(SpecialDataType) == sizeof(IntegerIndexData), "");
35 
36   static constexpr uint32_t kSpecialDataSize =
37       kNumSpecialData * sizeof(SpecialDataType);
38 
GetDataTypeBytes()39   uint32_t GetDataTypeBytes() const override {
40     return sizeof(IntegerIndexData);
41   }
42 
GetMinPostingListSize()43   uint32_t GetMinPostingListSize() const override {
44     static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
45     static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
46                   "PostingListIndex must be small enough to fit in a "
47                   "minimum-sized Posting List.");
48 
49     return kMinPostingListSize;
50   }
51 
52   uint32_t GetMinPostingListSizeToFit(
53       const PostingListUsed* posting_list_used) const override;
54 
55   uint32_t GetBytesUsed(
56       const PostingListUsed* posting_list_used) const override;
57 
58   void Clear(PostingListUsed* posting_list_used) const override;
59 
60   libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
61                                       PostingListUsed* src) const override;
62 
63   // Prepend an IntegerIndexData to the posting list.
64   //
65   // RETURNS:
66   //   - INVALID_ARGUMENT if !data.is_valid() or if data is not less than the
67   //       previously added data.
68   //   - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
69   //       list.
70   libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
71                                          const IntegerIndexData& data) const;
72 
73   // Prepend multiple IntegerIndexData to the posting list. Data should be
74   // sorted in ascending order (as defined by the less than operator for
75   // IntegerIndexData)
76   // If keep_prepended is true, whatever could be prepended is kept, otherwise
77   // the posting list is reverted and left in its original state.
78   //
79   // RETURNS:
80   //   The number of data that have been prepended to the posting list. If
81   //   keep_prepended is false and reverted, then it returns 0.
82   libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
83       PostingListUsed* posting_list_used, const IntegerIndexData* array,
84       uint32_t num_data, bool keep_prepended) const;
85 
86   // Retrieves all data stored in the posting list.
87   //
88   // RETURNS:
89   //   - On success, a vector of IntegerIndexData sorted by the reverse order of
90   //     prepending.
91   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
92   libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> GetData(
93       const PostingListUsed* posting_list_used) const;
94 
95   // Same as GetData but appends data to data_arr_out.
96   //
97   // RETURNS:
98   //   - OK on success, and data_arr_out will be appended IntegerIndexData
99   //     sorted by the reverse order of prepending.
100   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
101   libtextclassifier3::Status GetData(
102       const PostingListUsed* posting_list_used,
103       std::vector<IntegerIndexData>* data_arr_out) const;
104 
105   // Undo the last num_data data prepended. If num_data > number of data, then
106   // we clear all data.
107   //
108   // RETURNS:
109   //   - OK on success
110   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
111   libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
112                                           uint32_t num_data) const;
113 
114   // Helper function to determine if posting list is full.
IsFull(const PostingListUsed * posting_list_used)115   bool IsFull(const PostingListUsed* posting_list_used) const {
116     return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
117            GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
118   }
119 
120  private:
121   // Posting list layout formats:
122   //
123   // NOT_FULL
124   // +-special-data-0--+-special-data-1--+------------+-----------------------+
125   // |                 |                 |            |                       |
126   // |data-start-offset|  Data::Invalid  | 0x00000000 |   (compressed) data   |
127   // |                 |                 |            |                       |
128   // +-----------------+-----------------+------------+-----------------------+
129   //
130   // ALMOST_FULL
131   // +-special-data-0--+-special-data-1--+-----+------------------------------+
132   // |                 |                 |     |                              |
133   // |  Data::Invalid  |    1st data     |(pad)|      (compressed) data       |
134   // |                 |                 |     |                              |
135   // +-----------------+-----------------+-----+------------------------------+
136   //
137   // FULL
138   // +-special-data-0--+-special-data-1--+-----+------------------------------+
139   // |                 |                 |     |                              |
140   // |    1st data     |    2nd data     |(pad)|      (compressed) data       |
141   // |                 |                 |     |                              |
142   // +-----------------+-----------------+-----+------------------------------+
143   //
144   // The first two uncompressed (special) data also implicitly encode
145   // information about the size of the compressed data region.
146   //
147   // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
148   //    offset of the start of the compressed data. Thus, the size of the
149   //    compressed data is
150   //    posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
151   //
152   // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
153   //    starts somewhere between
154   //    [kSpecialDataSize, kSpecialDataSize + sizeof(IntegerIndexData) - 1] and
155   //    ends at posting_list_used->size_in_bytes() - 1.
156   //
157   // EXAMPLE
158   // Posting list storage. Posting list size: 36 bytes
159   //
160   // EMPTY!
161   // +--- byte 0-11 ---+----- 12-23 -----+-------------- 24-35 ---------------+
162   // |                 |                 |                                    |
163   // |       36        |  Data::Invalid  |             0x00000000             |
164   // |                 |                 |                                    |
165   // +-----------------+-----------------+------------------------------------+
166   //
167   // Add IntegerIndexData(0x0FFFFCC3, 5)
168   //   (DocumentId = 12, SectionId = 3; Key = 5)
169   //   (VarInt64(5) is encoded as 10 (b'1010), requires 1 byte)
170   // NOT FULL!
171   // +--- byte 0-11 ---+----- 12-23 -----+------- 24-30 -------+--- 31-35 ----+
172   // |                 |                 |                     | 0x0FFFFCC3   |
173   // |       31        |  Data::Invalid  |     0x00000000      | VI64(5)      |
174   // |                 |                 |                     |              |
175   // +-----------------+-----------------+---------------------+--------------+
176   //
177   // Add IntegerIndexData(0x0FFFFB40, -2)
178   //   (DocumentId = 18, SectionId = 0; Key = -2)
179   //   (VarInt64(-2) is encoded as 3 (b'11), requires 1 byte)
180   // Previous IntegerIndexData BasicHit delta varint encoding:
181   //   0x0FFFFCC3 - 0x0FFFFB40 = 387, VarUnsignedInt(387) requires 2 bytes
182   // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+--- 28-32 ----+ 33-35 -+
183   // |                 |                 |            | 0x0FFFFB40   |VUI(387)|
184   // |       28        |  Data::Invalid  |    0x00    | VI64(-2)     |VI64(5) |
185   // |                 |                 |            |              |        |
186   // +-----------------+-----------------+------------+--------------+--------+
187   //
188   // Add IntegerIndexData(0x0FFFFA4A, 3)
189   //   (DocumentId = 22, SectionId = 10; Key = 3)
190   //   (VarInt64(3) is encoded as 6 (b'110), requires 1 byte)
191   // Previous IntegerIndexData BasicHit delta varint encoding:
192   //   0x0FFFFB40 - 0x0FFFFA4A = 246, VarUnsignedInt(246) requires 2 bytes
193   // +--- byte 0-11 ---+----- 12-23 -----+---+--- 25-29 ----+ 30-32 -+ 33-35 -+
194   // |                 |                 |   | 0x0FFFFA4A   |VUI(246)|VUI(387)|
195   // |       25        |  Data::Invalid  |   | VI64(3)      |VI64(-2)|VI64(5) |
196   // |                 |                 |   |              |        |        |
197   // +-----------------+-----------------+---+--------------+--------+--------+
198   //
199   // Add IntegerIndexData(0x0FFFFA01, -4)
200   //   (DocumentId = 23, SectionId = 1; Key = -4)
201   //   (No VarInt64 for key, since it is stored in special data section)
202   // Previous IntegerIndexData BasicHit delta varint encoding:
203   //   0x0FFFFA4A - 0x0FFFFA01 = 73, VarUnsignedInt(73) requires 1 byte)
204   // ALMOST_FULL!
205   // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+28-29+ 30-32 -+ 33-35 -+
206   // |                 |   0x0FFFFA01    |            |(73) |VUI(246)|VUI(387)|
207   // |  Data::Invalid  |   0xFFFFFFFF    |   (pad)    |(3)  |VI64(-2)|VI64(5) |
208   // |                 |   0xFFFFFFFC    |            |     |        |        |
209   // +-----------------+-----------------+------------+-----+--------+--------+
210   //
211   // Add IntegerIndexData(0x0FFFF904, 0)
212   //   (DocumentId = 27, SectionId = 4; Key = 0)
213   //   (No VarInt64 for key, since it is stored in special data section)
214   // Previous IntegerIndexData:
215   //   Since 0x0FFFFA01 - 0x0FFFF904 = 253 and VarInt64(-4) is encoded as 7
216   //   (b'111), it requires only 3 bytes after compression. It's able to fit
217   //   into the padding section.
218   // Still ALMOST_FULL!
219   // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+
220   // |                 |   0x0FFFF904    |   |VUI(253)|(73) |VUI(246)|VUI(387)|
221   // |  Data::Invalid  |   0x00000000    |   |VI64(-4)|(3)  |VI64(-2)|VI64(5) |
222   // |                 |   0x00000000    |   |        |     |        |        |
223   // +-----------------+-----------------+---+--------+-----+--------+--------+
224   //
225   // Add IntegerIndexData(0x0FFFF8C3, -1)
226   //   (DocumentId = 28, SectionId = 3; Key = -1)
227   //   (No VarInt64 for key, since it is stored in special data section)
228   //   (No VarUnsignedInt for previous IntegerIndexData BasicHit)
229   // FULL!
230   // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+
231   // |   0x0FFFF8C3    |   0x0FFFF904    |   |VUI(253)|(73) |VUI(246)|VUI(387)|
232   // |   0xFFFFFFFF    |   0x00000000    |   |VI64(-4)|(3)  |VI64(-2)|VI64(5) |
233   // |   0xFFFFFFFF    |   0x00000000    |   |        |     |        |        |
234   // +-----------------+-----------------+---+--------+-----+--------+--------+
235 
236   // Helpers to determine what state the posting list is in.
IsAlmostFull(const PostingListUsed * posting_list_used)237   bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
238     return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
239            GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
240   }
241 
IsEmpty(const PostingListUsed * posting_list_used)242   bool IsEmpty(const PostingListUsed* posting_list_used) const {
243     return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
244                posting_list_used->size_in_bytes() &&
245            !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
246   }
247 
248   // Returns false if both special data are invalid or if data start offset
249   // stored in the special data is less than kSpecialDataSize or greater than
250   // posting_list_used->size_in_bytes(). Returns true, otherwise.
251   bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
252 
253   // Prepend data to a posting list that is in the ALMOST_FULL state.
254   //
255   // RETURNS:
256   //  - OK, if successful
257   //  - INVALID_ARGUMENT if data is not less than the previously added data.
258   libtextclassifier3::Status PrependDataToAlmostFull(
259       PostingListUsed* posting_list_used, const IntegerIndexData& data) const;
260 
261   // Prepend data to a posting list that is in the EMPTY state. This will always
262   // succeed because there are no pre-existing data and no validly constructed
263   // posting list could fail to fit one data.
264   void PrependDataToEmpty(PostingListUsed* posting_list_used,
265                           const IntegerIndexData& data) const;
266 
267   // Prepend data to a posting list that is in the NOT_FULL state.
268   //
269   // RETURNS:
270   //  - OK, if successful
271   //  - INVALID_ARGUMENT if data is not less than the previously added data.
272   libtextclassifier3::Status PrependDataToNotFull(
273       PostingListUsed* posting_list_used, const IntegerIndexData& data,
274       uint32_t offset) const;
275 
276   // Returns either 0 (FULL state), sizeof(IntegerIndexData) (ALMOST_FULL state)
277   // or a byte offset between kSpecialDataSize and
278   // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
279   uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
280 
281   // Sets special data 0 to properly reflect what start byte offset is (see
282   // layout comment for further details).
283   //
284   // Returns false if offset > posting_list_used->size_in_bytes() or offset is
285   // in range (kSpecialDataSize, sizeof(IntegerIndexData)) or
286   // (sizeof(IntegerIndexData), 0). True, otherwise.
287   bool SetStartByteOffset(PostingListUsed* posting_list_used,
288                           uint32_t offset) const;
289 
290   // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out
291   // or all data in the posting list if the posting list contains less than
292   // limit number of data. out can be NULL.
293   //
294   // NOTE: If called with limit=1, pop=true on a posting list that transitioned
295   // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
296   // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
297   // ALMOST_FULL.
298   //
299   // RETURNS:
300   //   - OK on success
301   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
302   libtextclassifier3::Status GetDataInternal(
303       const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
304       std::vector<IntegerIndexData>* out) const;
305 
306   // Retrieves the value stored in the index-th special data.
307   //
308   // REQUIRES:
309   //   0 <= index < kNumSpecialData.
310   //
311   // RETURNS:
312   //   - A valid SpecialData<IntegerIndexData>.
313   SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
314                                  uint32_t index) const;
315 
316   // Sets the value stored in the index-th special data to special_data.
317   //
318   // REQUIRES:
319   //   0 <= index < kNumSpecialData.
320   void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
321                       const SpecialDataType& special_data) const;
322 
323   // Prepends data to the memory region [offset - sizeof(IntegerIndexData),
324   // offset - 1] and returns the new beginning of the region.
325   //
326   // RETURNS:
327   //   - The new beginning of the padded region, if successful.
328   //   - INVALID_ARGUMENT if data will not fit (uncompressed) between
329   //       [kSpecialDataSize, offset - 1]
330   libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
331       PostingListUsed* posting_list_used, const IntegerIndexData& data,
332       uint32_t offset) const;
333 };
334 
335 }  // namespace lib
336 }  // namespace icing
337 
338 #endif  // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
339