1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
16 #define ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
17
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <vector>
22
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/file/posting_list/posting-list-common.h"
27 #include "icing/file/posting_list/posting-list-used.h"
28 #include "icing/legacy/core/icing-string-util.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31
32 namespace icing {
33 namespace lib {
34
35 // A serializer class to serialize JoinDataType to PostingListUsed. Usually
36 // JoinDataType is DocumentIdToJoinInfo<NamespaceIdFingerprint>,
37 // DocumentIdToJoinInfo<TermId>, or DocumentIdToJoinInfo<int64_t>.
38 //
39 // REQUIRES:
40 // - JoinDataType is comparable by operator <.
41 // - JoinDataType implements is_valid() method.
42 // - JoinDataType has static method GetInvalid() that returns a JoinDataType
43 // instance containing invalid data.
44 template <typename JoinDataType>
45 class PostingListJoinDataSerializer : public PostingListSerializer {
46 public:
47 using SpecialDataType = SpecialData<JoinDataType>;
48 static_assert(sizeof(SpecialDataType) == sizeof(JoinDataType), "");
49
50 static constexpr uint32_t kSpecialDataSize =
51 kNumSpecialData * sizeof(SpecialDataType);
52
GetDataTypeBytes()53 uint32_t GetDataTypeBytes() const override { return sizeof(JoinDataType); }
54
GetMinPostingListSize()55 uint32_t GetMinPostingListSize() const override {
56 static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
57 static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
58 "PostingListIndex must be small enough to fit in a "
59 "minimum-sized Posting List.");
60
61 return kMinPostingListSize;
62 }
63
64 uint32_t GetMinPostingListSizeToFit(
65 const PostingListUsed* posting_list_used) const override;
66
67 uint32_t GetBytesUsed(
68 const PostingListUsed* posting_list_used) const override;
69
70 void Clear(PostingListUsed* posting_list_used) const override;
71
72 libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
73 PostingListUsed* src) const override;
74
75 // Prepend a JoinData to the posting list.
76 //
77 // RETURNS:
78 // - INVALID_ARGUMENT if !data.is_valid() or if data is not greater than the
79 // previously added data.
80 // - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
81 // list.
82 libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
83 const JoinDataType& data) const;
84
85 // Prepend multiple JoinData to the posting list.
86 // Data should be sorted in ascending order (as defined by the less than
87 // operator for JoinData)
88 // If keep_prepended is true, whatever could be prepended is kept, otherwise
89 // the posting list is reverted and left in its original state.
90 //
91 // RETURNS:
92 // The number of data that have been prepended to the posting list. If
93 // keep_prepended is false and reverted, then it returns 0.
94 libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
95 PostingListUsed* posting_list_used, const JoinDataType* array,
96 uint32_t num_data, bool keep_prepended) const;
97
98 // Retrieves all data stored in the posting list.
99 //
100 // RETURNS:
101 // - On success, a vector of JoinDataType sorted by the reverse order of
102 // prepending.
103 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
104 libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetData(
105 const PostingListUsed* posting_list_used) const;
106
107 // Same as GetData but appends data to data_arr_out.
108 //
109 // RETURNS:
110 // - OK on success, and data_arr_out will be appended JoinDataType sorted by
111 // the reverse order of prepending.
112 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
113 libtextclassifier3::Status GetData(
114 const PostingListUsed* posting_list_used,
115 std::vector<JoinDataType>* data_arr_out) const;
116
117 // Undo the last num_data data prepended. If num_data > number of data, then
118 // we clear all data.
119 //
120 // RETURNS:
121 // - OK on success
122 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
123 libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
124 uint32_t num_data) const;
125
126 // Helper function to determine if posting list is full.
IsFull(const PostingListUsed * posting_list_used)127 bool IsFull(const PostingListUsed* posting_list_used) const {
128 return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
129 GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
130 }
131
132 private:
133 // In PostingListJoinDataSerializer, there is no compression, but we still use
134 // the traditional posting list implementation.
135 //
136 // Posting list layout formats:
137 //
138 // NOT_FULL
139 // +-special-data-0--+-special-data-1--+------------+-----------------------+
140 // | | | | |
141 // |data-start-offset| Data::Invalid | 0x00000000 | (compressed) data |
142 // | | | | |
143 // +-----------------+-----------------+------------+-----------------------+
144 //
145 // ALMOST_FULL
146 // +-special-data-0--+-special-data-1--+-----+------------------------------+
147 // | | | | |
148 // | Data::Invalid | 1st data |(pad)| (compressed) data |
149 // | | | | |
150 // +-----------------+-----------------+-----+------------------------------+
151 //
152 // FULL
153 // +-special-data-0--+-special-data-1--+-----+------------------------------+
154 // | | | | |
155 // | 1st data | 2nd data |(pad)| (compressed) data |
156 // | | | | |
157 // +-----------------+-----------------+-----+------------------------------+
158 //
159 // The first two uncompressed (special) data also implicitly encode
160 // information about the size of the compressed data region.
161 //
162 // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
163 // offset of the start of the compressed data. Thus, the size of the
164 // compressed data is
165 // posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
166 //
167 // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
168 // starts somewhere between
169 // [kSpecialDataSize, kSpecialDataSize + sizeof(JoinDataType) - 1] and ends
170 // at posting_list_used->size_in_bytes() - 1.
171 //
172 // EXAMPLE
173 // JoinDataType = DocumentIdToJoinInfo<int64_t>. Posting list size: 48 bytes
174 //
175 // EMPTY!
176 // +-- byte 0-11 --+---- 12-23 ----+------------ 24-47 -------------+
177 // | | | |
178 // | 48 | Data::Invalid | 0x00000000 |
179 // | | | |
180 // +---------------+---------------+--------------------------------+
181 //
182 // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 12, JoinInteger = 5)
183 // NOT FULL!
184 // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
185 // | | | | 12 |
186 // | 36 | Data::Invalid | 0x00000000 | 5 |
187 // | | | | |
188 // +---------------+---------------+---------------+---------------+
189 //
190 // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 18, JoinInteger = -2)
191 // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
192 // | | | 18 | 12 |
193 // | 24 | Data::Invalid | -2 | 5 |
194 // | | | | |
195 // +---------------+---------------+---------------+---------------+
196 //
197 // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 22, JoinInteger = 3)
198 // ALMOST_FULL!
199 // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
200 // | | 22 | 18 | 12 |
201 // | Data::Invalid | 3 | -2 | 5 |
202 // | | | | |
203 // +---------------+---------------+---------------+---------------+
204 //
205 // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 27, JoinInteger = 0)
206 // FULL!
207 // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
208 // | 27 | 22 | 18 | 12 |
209 // | 0 | 3 | -2 | 5 |
210 // | | | | |
211 // +---------------+---------------+---------------+---------------+
212
213 // Helpers to determine what state the posting list is in.
IsAlmostFull(const PostingListUsed * posting_list_used)214 bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
215 return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
216 GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
217 }
218
IsEmpty(const PostingListUsed * posting_list_used)219 bool IsEmpty(const PostingListUsed* posting_list_used) const {
220 return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
221 posting_list_used->size_in_bytes() &&
222 !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
223 }
224
225 // Returns false if both special data are invalid or if data start offset
226 // stored in the special data is less than kSpecialDataSize or greater than
227 // posting_list_used->size_in_bytes(). Returns true, otherwise.
228 bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
229
230 // Prepend data to a posting list that is in the ALMOST_FULL state.
231 //
232 // RETURNS:
233 // - OK, if successful
234 // - INVALID_ARGUMENT if data is not less than the previously added data.
235 libtextclassifier3::Status PrependDataToAlmostFull(
236 PostingListUsed* posting_list_used, const JoinDataType& data) const;
237
238 // Prepend data to a posting list that is in the EMPTY state. This will always
239 // succeed because there are no pre-existing data and no validly constructed
240 // posting list could fail to fit one data.
241 void PrependDataToEmpty(PostingListUsed* posting_list_used,
242 const JoinDataType& data) const;
243
244 // Prepend data to a posting list that is in the NOT_FULL state.
245 //
246 // RETURNS:
247 // - OK, if successful
248 // - INVALID_ARGUMENT if data is not less than the previously added data.
249 libtextclassifier3::Status PrependDataToNotFull(
250 PostingListUsed* posting_list_used, const JoinDataType& data,
251 uint32_t offset) const;
252
253 // Returns either 0 (FULL state), sizeof(JoinDataType) (ALMOST_FULL state) or
254 // a byte offset between kSpecialDataSize and
255 // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
256 uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
257
258 // Sets special data 0 to properly reflect what start byte offset is (see
259 // layout comment for further details).
260 //
261 // Returns false if offset > posting_list_used->size_in_bytes() or offset is
262 // in range (kSpecialDataSize, sizeof(JoinDataType)) or
263 // (sizeof(JoinDataType), 0). True, otherwise.
264 bool SetStartByteOffset(PostingListUsed* posting_list_used,
265 uint32_t offset) const;
266
267 // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out
268 // or all data in the posting list if the posting list contains less than
269 // limit number of data. out can be NULL.
270 //
271 // NOTE: If called with limit=1, pop=true on a posting list that transitioned
272 // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
273 // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
274 // ALMOST_FULL.
275 //
276 // RETURNS:
277 // - OK on success
278 // - INTERNAL_ERROR if the posting list has been corrupted somehow.
279 libtextclassifier3::Status GetDataInternal(
280 const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
281 std::vector<JoinDataType>* out) const;
282
283 // Retrieves the value stored in the index-th special data.
284 //
285 // REQUIRES:
286 // 0 <= index < kNumSpecialData.
287 //
288 // RETURNS:
289 // - A valid SpecialData<JoinDataType>.
290 SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
291 uint32_t index) const;
292
293 // Sets the value stored in the index-th special data to special_data.
294 //
295 // REQUIRES:
296 // 0 <= index < kNumSpecialData.
297 void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
298 const SpecialDataType& special_data) const;
299
300 // Prepends data to the memory region
301 // [offset - sizeof(JoinDataType), offset - 1] and
302 // returns the new beginning of the region.
303 //
304 // RETURNS:
305 // - The new beginning of the padded region, if successful.
306 // - INVALID_ARGUMENT if data will not fit (uncompressed) between
307 // [kSpecialDataSize, offset - 1]
308 libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
309 PostingListUsed* posting_list_used, const JoinDataType& data,
310 uint32_t offset) const;
311 };
312
313 template <typename JoinDataType>
GetBytesUsed(const PostingListUsed * posting_list_used)314 uint32_t PostingListJoinDataSerializer<JoinDataType>::GetBytesUsed(
315 const PostingListUsed* posting_list_used) const {
316 // The special data will be included if they represent actual data. If they
317 // represent the data start offset or the invalid data sentinel, they are not
318 // included.
319 return posting_list_used->size_in_bytes() -
320 GetStartByteOffset(posting_list_used);
321 }
322
323 template <typename JoinDataType>
324 uint32_t
GetMinPostingListSizeToFit(const PostingListUsed * posting_list_used)325 PostingListJoinDataSerializer<JoinDataType>::GetMinPostingListSizeToFit(
326 const PostingListUsed* posting_list_used) const {
327 if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
328 // If in either the FULL state or ALMOST_FULL state, this posting list *is*
329 // the minimum size posting list that can fit these data. So just return the
330 // size of the posting list.
331 return posting_list_used->size_in_bytes();
332 }
333
334 // In NOT_FULL state, BytesUsed contains no special data. The minimum sized
335 // posting list that would be guaranteed to fit these data would be
336 // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in
337 // special data 1 and the n compressed data in the compressed region.
338 // BytesUsed contains one uncompressed data and n compressed data. Therefore,
339 // fitting these data into a posting list would require BytesUsed plus one
340 // extra data.
341 return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
342 }
343
344 template <typename JoinDataType>
Clear(PostingListUsed * posting_list_used)345 void PostingListJoinDataSerializer<JoinDataType>::Clear(
346 PostingListUsed* posting_list_used) const {
347 // Safe to ignore return value because posting_list_used->size_in_bytes() is
348 // a valid argument.
349 SetStartByteOffset(posting_list_used,
350 /*offset=*/posting_list_used->size_in_bytes());
351 }
352
353 template <typename JoinDataType>
354 libtextclassifier3::Status
MoveFrom(PostingListUsed * dst,PostingListUsed * src)355 PostingListJoinDataSerializer<JoinDataType>::MoveFrom(
356 PostingListUsed* dst, PostingListUsed* src) const {
357 ICING_RETURN_ERROR_IF_NULL(dst);
358 ICING_RETURN_ERROR_IF_NULL(src);
359 if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
360 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
361 "src MinPostingListSizeToFit %d must be larger than size %d.",
362 GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
363 }
364
365 if (!IsPostingListValid(dst)) {
366 return absl_ports::FailedPreconditionError(
367 "Dst posting list is in an invalid state and can't be used!");
368 }
369 if (!IsPostingListValid(src)) {
370 return absl_ports::InvalidArgumentError(
371 "Cannot MoveFrom an invalid src posting list!");
372 }
373
374 // Pop just enough data that all of src's compressed data fit in
375 // dst posting_list's compressed area. Then we can memcpy that area.
376 std::vector<JoinDataType> data_arr;
377 while (IsFull(src) || IsAlmostFull(src) ||
378 (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
379 if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
380 return absl_ports::AbortedError(
381 "Unable to retrieve data from src posting list.");
382 }
383 }
384
385 // memcpy the area and set up start byte offset.
386 Clear(dst);
387 memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
388 src->posting_list_buffer() + GetStartByteOffset(src),
389 GetBytesUsed(src));
390 // Because we popped all data from src outside of the compressed area and we
391 // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
392 // kSpecialDataSize. This is guaranteed to be a valid byte offset for the
393 // NOT_FULL state, so ignoring the value is safe.
394 SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
395
396 // Put back remaining data.
397 for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
398 // PrependData may return:
399 // - INVALID_ARGUMENT: if data is invalid or not less than the previous data
400 // - RESOURCE_EXHAUSTED
401 // RESOURCE_EXHAUSTED should be impossible because we've already assured
402 // that there is enough room above.
403 ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
404 }
405
406 Clear(src);
407 return libtextclassifier3::Status::OK;
408 }
409
410 template <typename JoinDataType>
411 libtextclassifier3::Status
PrependDataToAlmostFull(PostingListUsed * posting_list_used,const JoinDataType & data)412 PostingListJoinDataSerializer<JoinDataType>::PrependDataToAlmostFull(
413 PostingListUsed* posting_list_used, const JoinDataType& data) const {
414 SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1);
415 if (data < special_data.data()) {
416 return absl_ports::InvalidArgumentError(
417 "JoinData being prepended must not be smaller than the most recent "
418 "JoinData");
419 }
420
421 // Without compression, prepend a new data into ALMOST_FULL posting list will
422 // change the posting list to FULL state. Therefore, set special data 0
423 // directly.
424 SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
425 return libtextclassifier3::Status::OK;
426 }
427
428 template <typename JoinDataType>
PrependDataToEmpty(PostingListUsed * posting_list_used,const JoinDataType & data)429 void PostingListJoinDataSerializer<JoinDataType>::PrependDataToEmpty(
430 PostingListUsed* posting_list_used, const JoinDataType& data) const {
431 // First data to be added. Just add verbatim, no compression.
432 if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
433 // First data will be stored at special data 1.
434 // Safe to ignore the return value because 1 < kNumSpecialData
435 SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
436 // Safe to ignore the return value because sizeof(JoinDataType) is a valid
437 // argument.
438 SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
439 } else {
440 // Since this is the first data, size != kSpecialDataSize and
441 // size % sizeof(JoinDataType) == 0, we know that there is room to fit
442 // 'data' into the compressed region, so ValueOrDie is safe.
443 uint32_t offset =
444 PrependDataUncompressed(posting_list_used, data,
445 /*offset=*/posting_list_used->size_in_bytes())
446 .ValueOrDie();
447 // Safe to ignore the return value because PrependDataUncompressed is
448 // guaranteed to return a valid offset.
449 SetStartByteOffset(posting_list_used, offset);
450 }
451 }
452
453 template <typename JoinDataType>
454 libtextclassifier3::Status
PrependDataToNotFull(PostingListUsed * posting_list_used,const JoinDataType & data,uint32_t offset)455 PostingListJoinDataSerializer<JoinDataType>::PrependDataToNotFull(
456 PostingListUsed* posting_list_used, const JoinDataType& data,
457 uint32_t offset) const {
458 JoinDataType curr = JoinDataType::GetInvalid();
459 memcpy(&curr, posting_list_used->posting_list_buffer() + offset,
460 sizeof(JoinDataType));
461 if (data < curr) {
462 return absl_ports::InvalidArgumentError(
463 "JoinData being prepended must not be smaller than the most recent "
464 "JoinData");
465 }
466
467 if (offset >= kSpecialDataSize + sizeof(JoinDataType)) {
468 offset =
469 PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
470 SetStartByteOffset(posting_list_used, offset);
471 } else {
472 // The new data must be put in special data 1.
473 SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
474 // State ALMOST_FULL. Safe to ignore the return value because
475 // sizeof(JoinDataType) is a valid argument.
476 SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
477 }
478 return libtextclassifier3::Status::OK;
479 }
480
481 template <typename JoinDataType>
482 libtextclassifier3::Status
PrependData(PostingListUsed * posting_list_used,const JoinDataType & data)483 PostingListJoinDataSerializer<JoinDataType>::PrependData(
484 PostingListUsed* posting_list_used, const JoinDataType& data) const {
485 if (!data.is_valid()) {
486 return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
487 }
488 if (!IsPostingListValid(posting_list_used)) {
489 return absl_ports::FailedPreconditionError(
490 "This PostingListUsed is in an invalid state and can't add any data!");
491 }
492
493 if (IsFull(posting_list_used)) {
494 // State FULL: no space left.
495 return absl_ports::ResourceExhaustedError("No more room for data");
496 } else if (IsAlmostFull(posting_list_used)) {
497 return PrependDataToAlmostFull(posting_list_used, data);
498 } else if (IsEmpty(posting_list_used)) {
499 PrependDataToEmpty(posting_list_used, data);
500 return libtextclassifier3::Status::OK;
501 } else {
502 uint32_t offset = GetStartByteOffset(posting_list_used);
503 return PrependDataToNotFull(posting_list_used, data, offset);
504 }
505 }
506
507 template <typename JoinDataType>
508 libtextclassifier3::StatusOr<uint32_t>
PrependDataArray(PostingListUsed * posting_list_used,const JoinDataType * array,uint32_t num_data,bool keep_prepended)509 PostingListJoinDataSerializer<JoinDataType>::PrependDataArray(
510 PostingListUsed* posting_list_used, const JoinDataType* array,
511 uint32_t num_data, bool keep_prepended) const {
512 if (!IsPostingListValid(posting_list_used)) {
513 return 0;
514 }
515
516 uint32_t i;
517 for (i = 0; i < num_data; ++i) {
518 if (!PrependData(posting_list_used, array[i]).ok()) {
519 break;
520 }
521 }
522 if (i != num_data && !keep_prepended) {
523 // Didn't fit. Undo everything and check that we have the same offset as
524 // before. PopFrontData guarantees that it will remove all 'i' data so long
525 // as there are at least 'i' data in the posting list, which we know there
526 // are.
527 ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i));
528 return 0;
529 }
530 return i;
531 }
532
533 template <typename JoinDataType>
534 libtextclassifier3::StatusOr<std::vector<JoinDataType>>
GetData(const PostingListUsed * posting_list_used)535 PostingListJoinDataSerializer<JoinDataType>::GetData(
536 const PostingListUsed* posting_list_used) const {
537 std::vector<JoinDataType> data_arr_out;
538 ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
539 return data_arr_out;
540 }
541
542 template <typename JoinDataType>
GetData(const PostingListUsed * posting_list_used,std::vector<JoinDataType> * data_arr_out)543 libtextclassifier3::Status PostingListJoinDataSerializer<JoinDataType>::GetData(
544 const PostingListUsed* posting_list_used,
545 std::vector<JoinDataType>* data_arr_out) const {
546 return GetDataInternal(posting_list_used,
547 /*limit=*/std::numeric_limits<uint32_t>::max(),
548 /*pop=*/false, data_arr_out);
549 }
550
551 template <typename JoinDataType>
552 libtextclassifier3::Status
PopFrontData(PostingListUsed * posting_list_used,uint32_t num_data)553 PostingListJoinDataSerializer<JoinDataType>::PopFrontData(
554 PostingListUsed* posting_list_used, uint32_t num_data) const {
555 if (num_data == 1 && IsFull(posting_list_used)) {
556 // The PL is in FULL state which means that we save 2 uncompressed data in
557 // the 2 special postions. But FULL state may be reached by 2 different
558 // states.
559 // (1) In ALMOST_FULL state
560 // +------------------+-----------------+-----+---------------------------+
561 // |Data::Invalid |1st data |(pad)|(compressed) data |
562 // | | | | |
563 // +------------------+-----------------+-----+---------------------------+
564 // When we prepend another data, we can only put it at special data 0, and
565 // thus get a FULL PL
566 // +------------------+-----------------+-----+---------------------------+
567 // |new 1st data |original 1st data|(pad)|(compressed) data |
568 // | | | | |
569 // +------------------+-----------------+-----+---------------------------+
570 //
571 // (2) In NOT_FULL state
572 // +------------------+-----------------+-------+---------+---------------+
573 // |data-start-offset |Data::Invalid |(pad) |1st data |(compressed) |
574 // | | | | |data |
575 // +------------------+-----------------+-------+---------+---------------+
576 // When we prepend another data, we can reach any of the 3 following
577 // scenarios:
578 // (2.1) NOT_FULL
579 // if the space of pad and original 1st data can accommodate the new 1st
580 // data and the encoded delta value.
581 // +------------------+-----------------+-----+--------+------------------+
582 // |data-start-offset |Data::Invalid |(pad)|new |(compressed) data |
583 // | | | |1st data| |
584 // +------------------+-----------------+-----+--------+------------------+
585 // (2.2) ALMOST_FULL
586 // If the space of pad and original 1st data cannot accommodate the new 1st
587 // data and the encoded delta value but can accommodate the encoded delta
588 // value only. We can put the new 1st data at special position 1.
589 // +------------------+-----------------+---------+-----------------------+
590 // |Data::Invalid |new 1st data |(pad) |(compressed) data |
591 // | | | | |
592 // +------------------+-----------------+---------+-----------------------+
593 // (2.3) FULL
594 // In very rare case, it cannot even accommodate only the encoded delta
595 // value. we can move the original 1st data into special position 1 and the
596 // new 1st data into special position 0. This may happen because we use
597 // VarInt encoding method which may make the encoded value longer (about
598 // 4/3 times of original)
599 // +------------------+-----------------+--------------+------------------+
600 // |new 1st data |original 1st data|(pad) |(compressed) data |
601 // | | | | |
602 // +------------------+-----------------+--------------+------------------+
603 //
604 // Suppose now the PL is in FULL state. But we don't know whether it arrived
605 // this state from NOT_FULL (like (2.3)) or from ALMOST_FULL (like (1)).
606 // We'll return to ALMOST_FULL state like (1) if we simply pop the new 1st
607 // data, but we want to make the prepending operation "reversible". So
608 // there should be some way to return to NOT_FULL if possible. A simple way
609 // to do is:
610 // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL.
611 // - Add the second data ("original 1st data") back.
612 //
613 // Then we can return to the correct original states of (2.1) or (1). This
614 // makes our prepending operation reversible.
615 std::vector<JoinDataType> out;
616
617 // Popping 2 data should never fail because we've just ensured that the
618 // posting list is in the FULL state.
619 ICING_RETURN_IF_ERROR(
620 GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
621
622 // PrependData should never fail because:
623 // - out[1] is a valid data less than all previous data in the posting list.
624 // - There's no way that the posting list could run out of room because it
625 // previously stored these 2 data.
626 ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1]));
627 } else if (num_data > 0) {
628 return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
629 /*out=*/nullptr);
630 }
631 return libtextclassifier3::Status::OK;
632 }
633
634 template <typename JoinDataType>
635 libtextclassifier3::Status
GetDataInternal(const PostingListUsed * posting_list_used,uint32_t limit,bool pop,std::vector<JoinDataType> * out)636 PostingListJoinDataSerializer<JoinDataType>::GetDataInternal(
637 const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
638 std::vector<JoinDataType>* out) const {
639 uint32_t offset = GetStartByteOffset(posting_list_used);
640 uint32_t count = 0;
641
642 // First traverse the first two special positions.
643 while (count < limit && offset < kSpecialDataSize) {
644 // offset / sizeof(JoinDataType) < kNumSpecialData
645 // because of the check above.
646 SpecialDataType special_data = GetSpecialData(
647 posting_list_used, /*index=*/offset / sizeof(JoinDataType));
648 if (out != nullptr) {
649 out->push_back(special_data.data());
650 }
651 offset += sizeof(JoinDataType);
652 ++count;
653 }
654
655 // - We don't compress the data.
656 // - The posting list size is a multiple of data type bytes.
657 // So offset of the first non-special data is guaranteed to be at
658 // kSpecialDataSize if in ALMOST_FULL or FULL state. In fact, we must not
659 // apply padding skipping logic here when still storing uncompressed data,
660 // because in this case 0 bytes are meanful (e.g. inverted doc id byte = 0).
661 while (count < limit && offset < posting_list_used->size_in_bytes()) {
662 JoinDataType data = JoinDataType::GetInvalid();
663 memcpy(&data, posting_list_used->posting_list_buffer() + offset,
664 sizeof(JoinDataType));
665 offset += sizeof(JoinDataType);
666 if (out != nullptr) {
667 out->push_back(data);
668 }
669 ++count;
670 }
671
672 if (pop) {
673 PostingListUsed* mutable_posting_list_used =
674 const_cast<PostingListUsed*>(posting_list_used);
675 // Modify the posting list so that we pop all data actually traversed.
676 if (offset >= kSpecialDataSize &&
677 offset < posting_list_used->size_in_bytes()) {
678 memset(
679 mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
680 0, offset - kSpecialDataSize);
681 }
682 SetStartByteOffset(mutable_posting_list_used, offset);
683 }
684
685 return libtextclassifier3::Status::OK;
686 }
687
688 template <typename JoinDataType>
689 typename PostingListJoinDataSerializer<JoinDataType>::SpecialDataType
GetSpecialData(const PostingListUsed * posting_list_used,uint32_t index)690 PostingListJoinDataSerializer<JoinDataType>::GetSpecialData(
691 const PostingListUsed* posting_list_used, uint32_t index) const {
692 // It is ok to temporarily construct a SpecialData with offset = 0 since we're
693 // going to overwrite it by memcpy.
694 SpecialDataType special_data(0);
695 memcpy(&special_data,
696 posting_list_used->posting_list_buffer() +
697 index * sizeof(SpecialDataType),
698 sizeof(SpecialDataType));
699 return special_data;
700 }
701
702 template <typename JoinDataType>
SetSpecialData(PostingListUsed * posting_list_used,uint32_t index,const SpecialDataType & special_data)703 void PostingListJoinDataSerializer<JoinDataType>::SetSpecialData(
704 PostingListUsed* posting_list_used, uint32_t index,
705 const SpecialDataType& special_data) const {
706 memcpy(posting_list_used->posting_list_buffer() +
707 index * sizeof(SpecialDataType),
708 &special_data, sizeof(SpecialDataType));
709 }
710
711 template <typename JoinDataType>
IsPostingListValid(const PostingListUsed * posting_list_used)712 bool PostingListJoinDataSerializer<JoinDataType>::IsPostingListValid(
713 const PostingListUsed* posting_list_used) const {
714 if (IsAlmostFull(posting_list_used)) {
715 // Special data 1 should hold a valid data.
716 if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
717 ICING_LOG(ERROR)
718 << "Both special data cannot be invalid at the same time.";
719 return false;
720 }
721 } else if (!IsFull(posting_list_used)) {
722 // NOT_FULL. Special data 0 should hold a valid offset.
723 SpecialDataType special_data =
724 GetSpecialData(posting_list_used, /*index=*/0);
725 if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
726 special_data.data_start_offset() < kSpecialDataSize) {
727 ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
728 << " size: " << posting_list_used->size_in_bytes()
729 << " sp size: " << kSpecialDataSize;
730 return false;
731 }
732 }
733 return true;
734 }
735
736 template <typename JoinDataType>
GetStartByteOffset(const PostingListUsed * posting_list_used)737 uint32_t PostingListJoinDataSerializer<JoinDataType>::GetStartByteOffset(
738 const PostingListUsed* posting_list_used) const {
739 if (IsFull(posting_list_used)) {
740 return 0;
741 } else if (IsAlmostFull(posting_list_used)) {
742 return sizeof(JoinDataType);
743 } else {
744 return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
745 }
746 }
747
748 template <typename JoinDataType>
SetStartByteOffset(PostingListUsed * posting_list_used,uint32_t offset)749 bool PostingListJoinDataSerializer<JoinDataType>::SetStartByteOffset(
750 PostingListUsed* posting_list_used, uint32_t offset) const {
751 if (offset > posting_list_used->size_in_bytes()) {
752 ICING_LOG(ERROR) << "offset cannot be a value greater than size "
753 << posting_list_used->size_in_bytes() << ". offset is "
754 << offset << ".";
755 return false;
756 }
757 if (offset < kSpecialDataSize && offset > sizeof(JoinDataType)) {
758 ICING_LOG(ERROR) << "offset cannot be a value between ("
759 << sizeof(JoinDataType) << ", " << kSpecialDataSize
760 << "). offset is " << offset << ".";
761 return false;
762 }
763 if (offset < sizeof(JoinDataType) && offset != 0) {
764 ICING_LOG(ERROR) << "offset cannot be a value between (0, "
765 << sizeof(JoinDataType) << "). offset is " << offset
766 << ".";
767 return false;
768 }
769
770 if (offset >= kSpecialDataSize) {
771 // NOT_FULL state.
772 SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
773 SetSpecialData(posting_list_used, /*index=*/1,
774 SpecialDataType(JoinDataType::GetInvalid()));
775 } else if (offset == sizeof(JoinDataType)) {
776 // ALMOST_FULL state.
777 SetSpecialData(posting_list_used, /*index=*/0,
778 SpecialDataType(JoinDataType::GetInvalid()));
779 }
780 // Nothing to do for the FULL state - the offset isn't actually stored
781 // anywhere and both 2 special data hold valid data.
782 return true;
783 }
784
785 template <typename JoinDataType>
786 libtextclassifier3::StatusOr<uint32_t>
PrependDataUncompressed(PostingListUsed * posting_list_used,const JoinDataType & data,uint32_t offset)787 PostingListJoinDataSerializer<JoinDataType>::PrependDataUncompressed(
788 PostingListUsed* posting_list_used, const JoinDataType& data,
789 uint32_t offset) const {
790 if (offset < kSpecialDataSize + sizeof(JoinDataType)) {
791 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
792 "Not enough room to prepend JoinData at offset %d.", offset));
793 }
794 offset -= sizeof(JoinDataType);
795 memcpy(posting_list_used->posting_list_buffer() + offset, &data,
796 sizeof(JoinDataType));
797 return offset;
798 }
799
800 } // namespace lib
801 } // namespace icing
802
803 #endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
804