xref: /aosp_15_r20/external/icing/icing/join/posting-list-join-data-serializer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
16 #define ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
17 
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/file/posting_list/posting-list-common.h"
27 #include "icing/file/posting_list/posting-list-used.h"
28 #include "icing/legacy/core/icing-string-util.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31 
32 namespace icing {
33 namespace lib {
34 
35 // A serializer class to serialize JoinDataType to PostingListUsed. Usually
36 // JoinDataType is DocumentIdToJoinInfo<NamespaceIdFingerprint>,
37 // DocumentIdToJoinInfo<TermId>, or DocumentIdToJoinInfo<int64_t>.
38 //
39 // REQUIRES:
40 // - JoinDataType is comparable by operator <.
41 // - JoinDataType implements is_valid() method.
42 // - JoinDataType has static method GetInvalid() that returns a JoinDataType
43 //   instance containing invalid data.
44 template <typename JoinDataType>
45 class PostingListJoinDataSerializer : public PostingListSerializer {
46  public:
47   using SpecialDataType = SpecialData<JoinDataType>;
48   static_assert(sizeof(SpecialDataType) == sizeof(JoinDataType), "");
49 
50   static constexpr uint32_t kSpecialDataSize =
51       kNumSpecialData * sizeof(SpecialDataType);
52 
GetDataTypeBytes()53   uint32_t GetDataTypeBytes() const override { return sizeof(JoinDataType); }
54 
GetMinPostingListSize()55   uint32_t GetMinPostingListSize() const override {
56     static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
57     static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
58                   "PostingListIndex must be small enough to fit in a "
59                   "minimum-sized Posting List.");
60 
61     return kMinPostingListSize;
62   }
63 
64   uint32_t GetMinPostingListSizeToFit(
65       const PostingListUsed* posting_list_used) const override;
66 
67   uint32_t GetBytesUsed(
68       const PostingListUsed* posting_list_used) const override;
69 
70   void Clear(PostingListUsed* posting_list_used) const override;
71 
72   libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
73                                       PostingListUsed* src) const override;
74 
75   // Prepend a JoinData to the posting list.
76   //
77   // RETURNS:
78   //   - INVALID_ARGUMENT if !data.is_valid() or if data is not greater than the
79   //     previously added data.
80   //   - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
81   //     list.
82   libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
83                                          const JoinDataType& data) const;
84 
85   // Prepend multiple JoinData to the posting list.
86   // Data should be sorted in ascending order (as defined by the less than
87   // operator for JoinData)
88   // If keep_prepended is true, whatever could be prepended is kept, otherwise
89   // the posting list is reverted and left in its original state.
90   //
91   // RETURNS:
92   //   The number of data that have been prepended to the posting list. If
93   //   keep_prepended is false and reverted, then it returns 0.
94   libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
95       PostingListUsed* posting_list_used, const JoinDataType* array,
96       uint32_t num_data, bool keep_prepended) const;
97 
98   // Retrieves all data stored in the posting list.
99   //
100   // RETURNS:
101   //   - On success, a vector of JoinDataType sorted by the reverse order of
102   //     prepending.
103   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
104   libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetData(
105       const PostingListUsed* posting_list_used) const;
106 
107   // Same as GetData but appends data to data_arr_out.
108   //
109   // RETURNS:
110   //   - OK on success, and data_arr_out will be appended JoinDataType sorted by
111   //     the reverse order of prepending.
112   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
113   libtextclassifier3::Status GetData(
114       const PostingListUsed* posting_list_used,
115       std::vector<JoinDataType>* data_arr_out) const;
116 
117   // Undo the last num_data data prepended. If num_data > number of data, then
118   // we clear all data.
119   //
120   // RETURNS:
121   //   - OK on success
122   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
123   libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
124                                           uint32_t num_data) const;
125 
126   // Helper function to determine if posting list is full.
IsFull(const PostingListUsed * posting_list_used)127   bool IsFull(const PostingListUsed* posting_list_used) const {
128     return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
129            GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
130   }
131 
132  private:
133   // In PostingListJoinDataSerializer, there is no compression, but we still use
134   // the traditional posting list implementation.
135   //
136   // Posting list layout formats:
137   //
138   // NOT_FULL
139   // +-special-data-0--+-special-data-1--+------------+-----------------------+
140   // |                 |                 |            |                       |
141   // |data-start-offset|  Data::Invalid  | 0x00000000 |   (compressed) data   |
142   // |                 |                 |            |                       |
143   // +-----------------+-----------------+------------+-----------------------+
144   //
145   // ALMOST_FULL
146   // +-special-data-0--+-special-data-1--+-----+------------------------------+
147   // |                 |                 |     |                              |
148   // |  Data::Invalid  |    1st data     |(pad)|      (compressed) data       |
149   // |                 |                 |     |                              |
150   // +-----------------+-----------------+-----+------------------------------+
151   //
152   // FULL
153   // +-special-data-0--+-special-data-1--+-----+------------------------------+
154   // |                 |                 |     |                              |
155   // |    1st data     |    2nd data     |(pad)|      (compressed) data       |
156   // |                 |                 |     |                              |
157   // +-----------------+-----------------+-----+------------------------------+
158   //
159   // The first two uncompressed (special) data also implicitly encode
160   // information about the size of the compressed data region.
161   //
162   // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
163   //    offset of the start of the compressed data. Thus, the size of the
164   //    compressed data is
165   //    posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
166   //
167   // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
168   //    starts somewhere between
169   //    [kSpecialDataSize, kSpecialDataSize + sizeof(JoinDataType) - 1] and ends
170   //    at posting_list_used->size_in_bytes() - 1.
171   //
172   // EXAMPLE
173   // JoinDataType = DocumentIdToJoinInfo<int64_t>. Posting list size: 48 bytes
174   //
175   // EMPTY!
176   // +-- byte 0-11 --+---- 12-23 ----+------------ 24-47 -------------+
177   // |               |               |                                |
178   // |      48       | Data::Invalid |           0x00000000           |
179   // |               |               |                                |
180   // +---------------+---------------+--------------------------------+
181   //
182   // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 12, JoinInteger = 5)
183   // NOT FULL!
184   // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
185   // |               |               |               | 12            |
186   // |      36       | Data::Invalid |  0x00000000   |  5            |
187   // |               |               |               |               |
188   // +---------------+---------------+---------------+---------------+
189   //
190   // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 18, JoinInteger = -2)
191   // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
192   // |               |               | 18            | 12            |
193   // |      24       | Data::Invalid | -2            |  5            |
194   // |               |               |               |               |
195   // +---------------+---------------+---------------+---------------+
196   //
197   // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 22, JoinInteger = 3)
198   // ALMOST_FULL!
199   // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
200   // |               | 22            | 18            | 12            |
201   // | Data::Invalid |  3            | -2            |  5            |
202   // |               |               |               |               |
203   // +---------------+---------------+---------------+---------------+
204   //
205   // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 27, JoinInteger = 0)
206   // FULL!
207   // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
208   // | 27            | 22            | 18            | 12            |
209   // |  0            |  3            | -2            |  5            |
210   // |               |               |               |               |
211   // +---------------+---------------+---------------+---------------+
212 
213   // Helpers to determine what state the posting list is in.
IsAlmostFull(const PostingListUsed * posting_list_used)214   bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
215     return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
216            GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
217   }
218 
IsEmpty(const PostingListUsed * posting_list_used)219   bool IsEmpty(const PostingListUsed* posting_list_used) const {
220     return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
221                posting_list_used->size_in_bytes() &&
222            !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
223   }
224 
225   // Returns false if both special data are invalid or if data start offset
226   // stored in the special data is less than kSpecialDataSize or greater than
227   // posting_list_used->size_in_bytes(). Returns true, otherwise.
228   bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
229 
230   // Prepend data to a posting list that is in the ALMOST_FULL state.
231   //
232   // RETURNS:
233   //  - OK, if successful
234   //  - INVALID_ARGUMENT if data is not less than the previously added data.
235   libtextclassifier3::Status PrependDataToAlmostFull(
236       PostingListUsed* posting_list_used, const JoinDataType& data) const;
237 
238   // Prepend data to a posting list that is in the EMPTY state. This will always
239   // succeed because there are no pre-existing data and no validly constructed
240   // posting list could fail to fit one data.
241   void PrependDataToEmpty(PostingListUsed* posting_list_used,
242                           const JoinDataType& data) const;
243 
244   // Prepend data to a posting list that is in the NOT_FULL state.
245   //
246   // RETURNS:
247   //  - OK, if successful
248   //  - INVALID_ARGUMENT if data is not less than the previously added data.
249   libtextclassifier3::Status PrependDataToNotFull(
250       PostingListUsed* posting_list_used, const JoinDataType& data,
251       uint32_t offset) const;
252 
253   // Returns either 0 (FULL state), sizeof(JoinDataType) (ALMOST_FULL state) or
254   // a byte offset between kSpecialDataSize and
255   // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
256   uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
257 
258   // Sets special data 0 to properly reflect what start byte offset is (see
259   // layout comment for further details).
260   //
261   // Returns false if offset > posting_list_used->size_in_bytes() or offset is
262   // in range (kSpecialDataSize, sizeof(JoinDataType)) or
263   // (sizeof(JoinDataType), 0). True, otherwise.
264   bool SetStartByteOffset(PostingListUsed* posting_list_used,
265                           uint32_t offset) const;
266 
267   // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out
268   // or all data in the posting list if the posting list contains less than
269   // limit number of data. out can be NULL.
270   //
271   // NOTE: If called with limit=1, pop=true on a posting list that transitioned
272   // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
273   // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
274   // ALMOST_FULL.
275   //
276   // RETURNS:
277   //   - OK on success
278   //   - INTERNAL_ERROR if the posting list has been corrupted somehow.
279   libtextclassifier3::Status GetDataInternal(
280       const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
281       std::vector<JoinDataType>* out) const;
282 
283   // Retrieves the value stored in the index-th special data.
284   //
285   // REQUIRES:
286   //   0 <= index < kNumSpecialData.
287   //
288   // RETURNS:
289   //   - A valid SpecialData<JoinDataType>.
290   SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
291                                  uint32_t index) const;
292 
293   // Sets the value stored in the index-th special data to special_data.
294   //
295   // REQUIRES:
296   //   0 <= index < kNumSpecialData.
297   void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
298                       const SpecialDataType& special_data) const;
299 
300   // Prepends data to the memory region
301   // [offset - sizeof(JoinDataType), offset - 1] and
302   // returns the new beginning of the region.
303   //
304   // RETURNS:
305   //   - The new beginning of the padded region, if successful.
306   //   - INVALID_ARGUMENT if data will not fit (uncompressed) between
307   //       [kSpecialDataSize, offset - 1]
308   libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
309       PostingListUsed* posting_list_used, const JoinDataType& data,
310       uint32_t offset) const;
311 };
312 
313 template <typename JoinDataType>
GetBytesUsed(const PostingListUsed * posting_list_used)314 uint32_t PostingListJoinDataSerializer<JoinDataType>::GetBytesUsed(
315     const PostingListUsed* posting_list_used) const {
316   // The special data will be included if they represent actual data. If they
317   // represent the data start offset or the invalid data sentinel, they are not
318   // included.
319   return posting_list_used->size_in_bytes() -
320          GetStartByteOffset(posting_list_used);
321 }
322 
323 template <typename JoinDataType>
324 uint32_t
GetMinPostingListSizeToFit(const PostingListUsed * posting_list_used)325 PostingListJoinDataSerializer<JoinDataType>::GetMinPostingListSizeToFit(
326     const PostingListUsed* posting_list_used) const {
327   if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
328     // If in either the FULL state or ALMOST_FULL state, this posting list *is*
329     // the minimum size posting list that can fit these data. So just return the
330     // size of the posting list.
331     return posting_list_used->size_in_bytes();
332   }
333 
334   // In NOT_FULL state, BytesUsed contains no special data. The minimum sized
335   // posting list that would be guaranteed to fit these data would be
336   // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in
337   // special data 1 and the n compressed data in the compressed region.
338   // BytesUsed contains one uncompressed data and n compressed data. Therefore,
339   // fitting these data into a posting list would require BytesUsed plus one
340   // extra data.
341   return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
342 }
343 
344 template <typename JoinDataType>
Clear(PostingListUsed * posting_list_used)345 void PostingListJoinDataSerializer<JoinDataType>::Clear(
346     PostingListUsed* posting_list_used) const {
347   // Safe to ignore return value because posting_list_used->size_in_bytes() is
348   // a valid argument.
349   SetStartByteOffset(posting_list_used,
350                      /*offset=*/posting_list_used->size_in_bytes());
351 }
352 
353 template <typename JoinDataType>
354 libtextclassifier3::Status
MoveFrom(PostingListUsed * dst,PostingListUsed * src)355 PostingListJoinDataSerializer<JoinDataType>::MoveFrom(
356     PostingListUsed* dst, PostingListUsed* src) const {
357   ICING_RETURN_ERROR_IF_NULL(dst);
358   ICING_RETURN_ERROR_IF_NULL(src);
359   if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
360     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
361         "src MinPostingListSizeToFit %d must be larger than size %d.",
362         GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
363   }
364 
365   if (!IsPostingListValid(dst)) {
366     return absl_ports::FailedPreconditionError(
367         "Dst posting list is in an invalid state and can't be used!");
368   }
369   if (!IsPostingListValid(src)) {
370     return absl_ports::InvalidArgumentError(
371         "Cannot MoveFrom an invalid src posting list!");
372   }
373 
374   // Pop just enough data that all of src's compressed data fit in
375   // dst posting_list's compressed area. Then we can memcpy that area.
376   std::vector<JoinDataType> data_arr;
377   while (IsFull(src) || IsAlmostFull(src) ||
378          (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
379     if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
380       return absl_ports::AbortedError(
381           "Unable to retrieve data from src posting list.");
382     }
383   }
384 
385   // memcpy the area and set up start byte offset.
386   Clear(dst);
387   memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
388          src->posting_list_buffer() + GetStartByteOffset(src),
389          GetBytesUsed(src));
390   // Because we popped all data from src outside of the compressed area and we
391   // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
392   // kSpecialDataSize. This is guaranteed to be a valid byte offset for the
393   // NOT_FULL state, so ignoring the value is safe.
394   SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
395 
396   // Put back remaining data.
397   for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
398     // PrependData may return:
399     // - INVALID_ARGUMENT: if data is invalid or not less than the previous data
400     // - RESOURCE_EXHAUSTED
401     // RESOURCE_EXHAUSTED should be impossible because we've already assured
402     // that there is enough room above.
403     ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
404   }
405 
406   Clear(src);
407   return libtextclassifier3::Status::OK;
408 }
409 
410 template <typename JoinDataType>
411 libtextclassifier3::Status
PrependDataToAlmostFull(PostingListUsed * posting_list_used,const JoinDataType & data)412 PostingListJoinDataSerializer<JoinDataType>::PrependDataToAlmostFull(
413     PostingListUsed* posting_list_used, const JoinDataType& data) const {
414   SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1);
415   if (data < special_data.data()) {
416     return absl_ports::InvalidArgumentError(
417         "JoinData being prepended must not be smaller than the most recent "
418         "JoinData");
419   }
420 
421   // Without compression, prepend a new data into ALMOST_FULL posting list will
422   // change the posting list to FULL state. Therefore, set special data 0
423   // directly.
424   SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
425   return libtextclassifier3::Status::OK;
426 }
427 
428 template <typename JoinDataType>
PrependDataToEmpty(PostingListUsed * posting_list_used,const JoinDataType & data)429 void PostingListJoinDataSerializer<JoinDataType>::PrependDataToEmpty(
430     PostingListUsed* posting_list_used, const JoinDataType& data) const {
431   // First data to be added. Just add verbatim, no compression.
432   if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
433     // First data will be stored at special data 1.
434     // Safe to ignore the return value because 1 < kNumSpecialData
435     SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
436     // Safe to ignore the return value because sizeof(JoinDataType) is a valid
437     // argument.
438     SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
439   } else {
440     // Since this is the first data, size != kSpecialDataSize and
441     // size % sizeof(JoinDataType) == 0, we know that there is room to fit
442     // 'data' into the compressed region, so ValueOrDie is safe.
443     uint32_t offset =
444         PrependDataUncompressed(posting_list_used, data,
445                                 /*offset=*/posting_list_used->size_in_bytes())
446             .ValueOrDie();
447     // Safe to ignore the return value because PrependDataUncompressed is
448     // guaranteed to return a valid offset.
449     SetStartByteOffset(posting_list_used, offset);
450   }
451 }
452 
453 template <typename JoinDataType>
454 libtextclassifier3::Status
PrependDataToNotFull(PostingListUsed * posting_list_used,const JoinDataType & data,uint32_t offset)455 PostingListJoinDataSerializer<JoinDataType>::PrependDataToNotFull(
456     PostingListUsed* posting_list_used, const JoinDataType& data,
457     uint32_t offset) const {
458   JoinDataType curr = JoinDataType::GetInvalid();
459   memcpy(&curr, posting_list_used->posting_list_buffer() + offset,
460          sizeof(JoinDataType));
461   if (data < curr) {
462     return absl_ports::InvalidArgumentError(
463         "JoinData being prepended must not be smaller than the most recent "
464         "JoinData");
465   }
466 
467   if (offset >= kSpecialDataSize + sizeof(JoinDataType)) {
468     offset =
469         PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
470     SetStartByteOffset(posting_list_used, offset);
471   } else {
472     // The new data must be put in special data 1.
473     SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
474     // State ALMOST_FULL. Safe to ignore the return value because
475     // sizeof(JoinDataType) is a valid argument.
476     SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
477   }
478   return libtextclassifier3::Status::OK;
479 }
480 
481 template <typename JoinDataType>
482 libtextclassifier3::Status
PrependData(PostingListUsed * posting_list_used,const JoinDataType & data)483 PostingListJoinDataSerializer<JoinDataType>::PrependData(
484     PostingListUsed* posting_list_used, const JoinDataType& data) const {
485   if (!data.is_valid()) {
486     return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
487   }
488   if (!IsPostingListValid(posting_list_used)) {
489     return absl_ports::FailedPreconditionError(
490         "This PostingListUsed is in an invalid state and can't add any data!");
491   }
492 
493   if (IsFull(posting_list_used)) {
494     // State FULL: no space left.
495     return absl_ports::ResourceExhaustedError("No more room for data");
496   } else if (IsAlmostFull(posting_list_used)) {
497     return PrependDataToAlmostFull(posting_list_used, data);
498   } else if (IsEmpty(posting_list_used)) {
499     PrependDataToEmpty(posting_list_used, data);
500     return libtextclassifier3::Status::OK;
501   } else {
502     uint32_t offset = GetStartByteOffset(posting_list_used);
503     return PrependDataToNotFull(posting_list_used, data, offset);
504   }
505 }
506 
507 template <typename JoinDataType>
508 libtextclassifier3::StatusOr<uint32_t>
PrependDataArray(PostingListUsed * posting_list_used,const JoinDataType * array,uint32_t num_data,bool keep_prepended)509 PostingListJoinDataSerializer<JoinDataType>::PrependDataArray(
510     PostingListUsed* posting_list_used, const JoinDataType* array,
511     uint32_t num_data, bool keep_prepended) const {
512   if (!IsPostingListValid(posting_list_used)) {
513     return 0;
514   }
515 
516   uint32_t i;
517   for (i = 0; i < num_data; ++i) {
518     if (!PrependData(posting_list_used, array[i]).ok()) {
519       break;
520     }
521   }
522   if (i != num_data && !keep_prepended) {
523     // Didn't fit. Undo everything and check that we have the same offset as
524     // before. PopFrontData guarantees that it will remove all 'i' data so long
525     // as there are at least 'i' data in the posting list, which we know there
526     // are.
527     ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i));
528     return 0;
529   }
530   return i;
531 }
532 
533 template <typename JoinDataType>
534 libtextclassifier3::StatusOr<std::vector<JoinDataType>>
GetData(const PostingListUsed * posting_list_used)535 PostingListJoinDataSerializer<JoinDataType>::GetData(
536     const PostingListUsed* posting_list_used) const {
537   std::vector<JoinDataType> data_arr_out;
538   ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
539   return data_arr_out;
540 }
541 
542 template <typename JoinDataType>
GetData(const PostingListUsed * posting_list_used,std::vector<JoinDataType> * data_arr_out)543 libtextclassifier3::Status PostingListJoinDataSerializer<JoinDataType>::GetData(
544     const PostingListUsed* posting_list_used,
545     std::vector<JoinDataType>* data_arr_out) const {
546   return GetDataInternal(posting_list_used,
547                          /*limit=*/std::numeric_limits<uint32_t>::max(),
548                          /*pop=*/false, data_arr_out);
549 }
550 
551 template <typename JoinDataType>
552 libtextclassifier3::Status
PopFrontData(PostingListUsed * posting_list_used,uint32_t num_data)553 PostingListJoinDataSerializer<JoinDataType>::PopFrontData(
554     PostingListUsed* posting_list_used, uint32_t num_data) const {
555   if (num_data == 1 && IsFull(posting_list_used)) {
556     // The PL is in FULL state which means that we save 2 uncompressed data in
557     // the 2 special postions. But FULL state may be reached by 2 different
558     // states.
559     // (1) In ALMOST_FULL state
560     // +------------------+-----------------+-----+---------------------------+
561     // |Data::Invalid     |1st data         |(pad)|(compressed) data          |
562     // |                  |                 |     |                           |
563     // +------------------+-----------------+-----+---------------------------+
564     // When we prepend another data, we can only put it at special data 0, and
565     // thus get a FULL PL
566     // +------------------+-----------------+-----+---------------------------+
567     // |new 1st data      |original 1st data|(pad)|(compressed) data          |
568     // |                  |                 |     |                           |
569     // +------------------+-----------------+-----+---------------------------+
570     //
571     // (2) In NOT_FULL state
572     // +------------------+-----------------+-------+---------+---------------+
573     // |data-start-offset |Data::Invalid    |(pad)  |1st data |(compressed)   |
574     // |                  |                 |       |         |data           |
575     // +------------------+-----------------+-------+---------+---------------+
576     // When we prepend another data, we can reach any of the 3 following
577     // scenarios:
578     // (2.1) NOT_FULL
579     // if the space of pad and original 1st data can accommodate the new 1st
580     // data and the encoded delta value.
581     // +------------------+-----------------+-----+--------+------------------+
582     // |data-start-offset |Data::Invalid    |(pad)|new     |(compressed) data |
583     // |                  |                 |     |1st data|                  |
584     // +------------------+-----------------+-----+--------+------------------+
585     // (2.2) ALMOST_FULL
586     // If the space of pad and original 1st data cannot accommodate the new 1st
587     // data and the encoded delta value but can accommodate the encoded delta
588     // value only. We can put the new 1st data at special position 1.
589     // +------------------+-----------------+---------+-----------------------+
590     // |Data::Invalid     |new 1st data     |(pad)    |(compressed) data      |
591     // |                  |                 |         |                       |
592     // +------------------+-----------------+---------+-----------------------+
593     // (2.3) FULL
594     // In very rare case, it cannot even accommodate only the encoded delta
595     // value. we can move the original 1st data into special position 1 and the
596     // new 1st data into special position 0. This may happen because we use
597     // VarInt encoding method which may make the encoded value longer (about
598     // 4/3 times of original)
599     // +------------------+-----------------+--------------+------------------+
600     // |new 1st data      |original 1st data|(pad)         |(compressed) data |
601     // |                  |                 |              |                  |
602     // +------------------+-----------------+--------------+------------------+
603     //
604     // Suppose now the PL is in FULL state. But we don't know whether it arrived
605     // this state from NOT_FULL (like (2.3)) or from ALMOST_FULL (like (1)).
606     // We'll return to ALMOST_FULL state like (1) if we simply pop the new 1st
607     // data, but we want to make the prepending operation "reversible". So
608     // there should be some way to return to NOT_FULL if possible. A simple way
609     // to do is:
610     // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL.
611     // - Add the second data ("original 1st data") back.
612     //
613     // Then we can return to the correct original states of (2.1) or (1). This
614     // makes our prepending operation reversible.
615     std::vector<JoinDataType> out;
616 
617     // Popping 2 data should never fail because we've just ensured that the
618     // posting list is in the FULL state.
619     ICING_RETURN_IF_ERROR(
620         GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
621 
622     // PrependData should never fail because:
623     // - out[1] is a valid data less than all previous data in the posting list.
624     // - There's no way that the posting list could run out of room because it
625     //   previously stored these 2 data.
626     ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1]));
627   } else if (num_data > 0) {
628     return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
629                            /*out=*/nullptr);
630   }
631   return libtextclassifier3::Status::OK;
632 }
633 
634 template <typename JoinDataType>
635 libtextclassifier3::Status
GetDataInternal(const PostingListUsed * posting_list_used,uint32_t limit,bool pop,std::vector<JoinDataType> * out)636 PostingListJoinDataSerializer<JoinDataType>::GetDataInternal(
637     const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
638     std::vector<JoinDataType>* out) const {
639   uint32_t offset = GetStartByteOffset(posting_list_used);
640   uint32_t count = 0;
641 
642   // First traverse the first two special positions.
643   while (count < limit && offset < kSpecialDataSize) {
644     // offset / sizeof(JoinDataType) < kNumSpecialData
645     // because of the check above.
646     SpecialDataType special_data = GetSpecialData(
647         posting_list_used, /*index=*/offset / sizeof(JoinDataType));
648     if (out != nullptr) {
649       out->push_back(special_data.data());
650     }
651     offset += sizeof(JoinDataType);
652     ++count;
653   }
654 
655   // - We don't compress the data.
656   // - The posting list size is a multiple of data type bytes.
657   // So offset of the first non-special data is guaranteed to be at
658   // kSpecialDataSize if in ALMOST_FULL or FULL state. In fact, we must not
659   // apply padding skipping logic here when still storing uncompressed data,
660   // because in this case 0 bytes are meanful (e.g. inverted doc id byte = 0).
661   while (count < limit && offset < posting_list_used->size_in_bytes()) {
662     JoinDataType data = JoinDataType::GetInvalid();
663     memcpy(&data, posting_list_used->posting_list_buffer() + offset,
664            sizeof(JoinDataType));
665     offset += sizeof(JoinDataType);
666     if (out != nullptr) {
667       out->push_back(data);
668     }
669     ++count;
670   }
671 
672   if (pop) {
673     PostingListUsed* mutable_posting_list_used =
674         const_cast<PostingListUsed*>(posting_list_used);
675     // Modify the posting list so that we pop all data actually traversed.
676     if (offset >= kSpecialDataSize &&
677         offset < posting_list_used->size_in_bytes()) {
678       memset(
679           mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
680           0, offset - kSpecialDataSize);
681     }
682     SetStartByteOffset(mutable_posting_list_used, offset);
683   }
684 
685   return libtextclassifier3::Status::OK;
686 }
687 
688 template <typename JoinDataType>
689 typename PostingListJoinDataSerializer<JoinDataType>::SpecialDataType
GetSpecialData(const PostingListUsed * posting_list_used,uint32_t index)690 PostingListJoinDataSerializer<JoinDataType>::GetSpecialData(
691     const PostingListUsed* posting_list_used, uint32_t index) const {
692   // It is ok to temporarily construct a SpecialData with offset = 0 since we're
693   // going to overwrite it by memcpy.
694   SpecialDataType special_data(0);
695   memcpy(&special_data,
696          posting_list_used->posting_list_buffer() +
697              index * sizeof(SpecialDataType),
698          sizeof(SpecialDataType));
699   return special_data;
700 }
701 
702 template <typename JoinDataType>
SetSpecialData(PostingListUsed * posting_list_used,uint32_t index,const SpecialDataType & special_data)703 void PostingListJoinDataSerializer<JoinDataType>::SetSpecialData(
704     PostingListUsed* posting_list_used, uint32_t index,
705     const SpecialDataType& special_data) const {
706   memcpy(posting_list_used->posting_list_buffer() +
707              index * sizeof(SpecialDataType),
708          &special_data, sizeof(SpecialDataType));
709 }
710 
711 template <typename JoinDataType>
IsPostingListValid(const PostingListUsed * posting_list_used)712 bool PostingListJoinDataSerializer<JoinDataType>::IsPostingListValid(
713     const PostingListUsed* posting_list_used) const {
714   if (IsAlmostFull(posting_list_used)) {
715     // Special data 1 should hold a valid data.
716     if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
717       ICING_LOG(ERROR)
718           << "Both special data cannot be invalid at the same time.";
719       return false;
720     }
721   } else if (!IsFull(posting_list_used)) {
722     // NOT_FULL. Special data 0 should hold a valid offset.
723     SpecialDataType special_data =
724         GetSpecialData(posting_list_used, /*index=*/0);
725     if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
726         special_data.data_start_offset() < kSpecialDataSize) {
727       ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
728                        << " size: " << posting_list_used->size_in_bytes()
729                        << " sp size: " << kSpecialDataSize;
730       return false;
731     }
732   }
733   return true;
734 }
735 
736 template <typename JoinDataType>
GetStartByteOffset(const PostingListUsed * posting_list_used)737 uint32_t PostingListJoinDataSerializer<JoinDataType>::GetStartByteOffset(
738     const PostingListUsed* posting_list_used) const {
739   if (IsFull(posting_list_used)) {
740     return 0;
741   } else if (IsAlmostFull(posting_list_used)) {
742     return sizeof(JoinDataType);
743   } else {
744     return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
745   }
746 }
747 
748 template <typename JoinDataType>
SetStartByteOffset(PostingListUsed * posting_list_used,uint32_t offset)749 bool PostingListJoinDataSerializer<JoinDataType>::SetStartByteOffset(
750     PostingListUsed* posting_list_used, uint32_t offset) const {
751   if (offset > posting_list_used->size_in_bytes()) {
752     ICING_LOG(ERROR) << "offset cannot be a value greater than size "
753                      << posting_list_used->size_in_bytes() << ". offset is "
754                      << offset << ".";
755     return false;
756   }
757   if (offset < kSpecialDataSize && offset > sizeof(JoinDataType)) {
758     ICING_LOG(ERROR) << "offset cannot be a value between ("
759                      << sizeof(JoinDataType) << ", " << kSpecialDataSize
760                      << "). offset is " << offset << ".";
761     return false;
762   }
763   if (offset < sizeof(JoinDataType) && offset != 0) {
764     ICING_LOG(ERROR) << "offset cannot be a value between (0, "
765                      << sizeof(JoinDataType) << "). offset is " << offset
766                      << ".";
767     return false;
768   }
769 
770   if (offset >= kSpecialDataSize) {
771     // NOT_FULL state.
772     SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
773     SetSpecialData(posting_list_used, /*index=*/1,
774                    SpecialDataType(JoinDataType::GetInvalid()));
775   } else if (offset == sizeof(JoinDataType)) {
776     // ALMOST_FULL state.
777     SetSpecialData(posting_list_used, /*index=*/0,
778                    SpecialDataType(JoinDataType::GetInvalid()));
779   }
780   // Nothing to do for the FULL state - the offset isn't actually stored
781   // anywhere and both 2 special data hold valid data.
782   return true;
783 }
784 
785 template <typename JoinDataType>
786 libtextclassifier3::StatusOr<uint32_t>
PrependDataUncompressed(PostingListUsed * posting_list_used,const JoinDataType & data,uint32_t offset)787 PostingListJoinDataSerializer<JoinDataType>::PrependDataUncompressed(
788     PostingListUsed* posting_list_used, const JoinDataType& data,
789     uint32_t offset) const {
790   if (offset < kSpecialDataSize + sizeof(JoinDataType)) {
791     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
792         "Not enough room to prepend JoinData at offset %d.", offset));
793   }
794   offset -= sizeof(JoinDataType);
795   memcpy(posting_list_used->posting_list_buffer() + offset, &data,
796          sizeof(JoinDataType));
797   return offset;
798 }
799 
800 }  // namespace lib
801 }  // namespace icing
802 
803 #endif  // ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
804