1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_ 16 #define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_ 17 18 #include <cstddef> 19 #include <cstdint> 20 #include <vector> 21 22 #include "icing/text_classifier/lib3/utils/base/status.h" 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/file/posting_list/posting-list-common.h" 25 #include "icing/file/posting_list/posting-list-used.h" 26 #include "icing/index/hit/hit.h" 27 #include "icing/legacy/index/icing-bit-util.h" 28 #include "icing/util/status-macros.h" 29 30 namespace icing { 31 namespace lib { 32 33 // A serializer class to serialize hits to PostingListUsed. Layout described in 34 // comments in posting-list-hit-serializer.cc. 35 class PostingListHitSerializer : public PostingListSerializer { 36 public: 37 static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit); 38 39 struct DecodedHitInfo { 40 // The decoded hit value. 41 Hit::Value hit_value; 42 43 // Size of the encoded hit in bytes. 44 size_t encoded_size; 45 }; 46 47 // Given the current hit value, encodes the next hit value for serialization 48 // in the posting list. 49 // 50 // The encoded value is the varint-encoded delta between next_hit_value and 51 // curr_hit_value. 52 // - We add 1 to this delta so as to avoid getting a delta value of 0. 53 // - This allows us to add duplicate hits with the same value, which is a 54 // valid case if we need to store hits with different flags that belong in 55 // the same section-id/doc-id combo. 56 // - We cannot have an encoded hit delta with a value of 0 as 0 is currently 57 // used for padding the unused region in the posting list. 58 // 59 // REQUIRES: next_hit_value <= curr_hit_value AND 60 // curr_hit_value - next_hit_value < 61 // std::numeric_limits<Hit::Value>::max() 62 // 63 // RETURNS: next_hit_value's encoded length in bytes and writes the encoded 64 // value directly into encoded_buf_out. EncodeNextHitValue(Hit::Value next_hit_value,Hit::Value curr_hit_value,uint8_t * encoded_buf_out)65 static size_t EncodeNextHitValue(Hit::Value next_hit_value, 66 Hit::Value curr_hit_value, 67 uint8_t* encoded_buf_out) { 68 uint64_t delta = curr_hit_value - next_hit_value + 1; 69 return VarInt::Encode(delta, encoded_buf_out); 70 } 71 72 // Given the current hit value, decodes the next hit value from an encoded 73 // byte array buffer. 74 // 75 // RETURNS: DecodedHitInfo containing the decoded hit value and the value's 76 // encoded size in bytes. DecodeNextHitValue(const uint8_t * encoded_buf_in,Hit::Value curr_hit_value)77 static DecodedHitInfo DecodeNextHitValue(const uint8_t* encoded_buf_in, 78 Hit::Value curr_hit_value) { 79 uint64_t delta; 80 size_t delta_size = VarInt::Decode(encoded_buf_in, &delta); 81 Hit::Value hit_value = curr_hit_value + delta - 1; 82 return {hit_value, delta_size}; 83 } 84 GetDataTypeBytes()85 uint32_t GetDataTypeBytes() const override { return sizeof(Hit); } 86 GetMinPostingListSize()87 uint32_t GetMinPostingListSize() const override { 88 static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize; 89 static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, 90 "PostingListIndex must be small enough to fit in a " 91 "minimum-sized Posting List."); 92 93 return kMinPostingListSize; 94 } 95 96 uint32_t GetMinPostingListSizeToFit( 97 const PostingListUsed* posting_list_used) const override; 98 99 uint32_t GetBytesUsed( 100 const PostingListUsed* posting_list_used) const override; 101 102 void Clear(PostingListUsed* posting_list_used) const override; 103 104 libtextclassifier3::Status MoveFrom(PostingListUsed* dst, 105 PostingListUsed* src) const override; 106 107 // Prepend a hit to the posting list. 108 // 109 // RETURNS: 110 // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the 111 // previously added hit. 112 // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting 113 // list. 114 libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used, 115 const Hit& hit) const; 116 117 // Retrieves the hits stored in the posting list. 118 // 119 // RETURNS: 120 // - On success, a vector of hits sorted by the reverse order of prepending. 121 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 122 libtextclassifier3::StatusOr<std::vector<Hit>> GetHits( 123 const PostingListUsed* posting_list_used) const; 124 125 // Same as GetHits but appends hits to hits_out. 126 // 127 // RETURNS: 128 // - On success, a vector of hits sorted by the reverse order of prepending. 129 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 130 libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used, 131 std::vector<Hit>* hits_out) const; 132 133 // Undo the last num_hits hits prepended. If num_hits > number of 134 // hits we clear all hits. 135 // 136 // RETURNS: 137 // - OK on success 138 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 139 libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used, 140 uint32_t num_hits) const; 141 142 private: 143 // Posting list layout formats: 144 // 145 // not_full 146 // 147 // +-----------------+----------------+-------+-----------------+ 148 // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits| 149 // +-----------------+----------------+-------+-----------------+ 150 // 151 // almost_full 152 // 153 // +-----------------+----------------+-------+-----------------+ 154 // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits| 155 // +-----------------+----------------+-------+-----------------+ 156 // 157 // full() 158 // 159 // +-----------------+----------------+-------+-----------------+ 160 // |1st hit |2nd hit |(pad) |(compressed) hits| 161 // +-----------------+----------------+-------+-----------------+ 162 // 163 // The first two uncompressed hits also implicitly encode information about 164 // the size of the compressed hits region. 165 // 166 // 1. If the posting list is NOT_FULL, then 167 // posting_list_buffer_[0] contains the byte offset of the start of the 168 // compressed hits - and, thus, the size of the compressed hits region is 169 // size_in_bytes - posting_list_buffer_[0]. 170 // 171 // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region 172 // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit) 173 // - 1] and ends at size_in_bytes - 1. 174 // 175 // Hit term frequencies are stored after the hit value, compressed or 176 // uncompressed. For the first two special hits, we always have a 177 // space for the term frequency. For hits in the compressed area, we only have 178 // the term frequency following the hit value of hit.has_term_frequency() is 179 // true. This allows good compression in the common case where hits don't have 180 // a valid term frequency. 181 // 182 // EXAMPLE 183 // Posting list storage. Posting list size: 20 bytes 184 // EMPTY! 185 // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+ 186 // | 20 |Hit::kInvalidVal| 0x000 | 187 // +-------------+----------------+----------------+-----------------------+ 188 // 189 // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0) 190 // NOT FULL! 191 // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+ 192 // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 | 193 // +-------------+----------------+-----------------+----------------------+ 194 // 195 // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4, 196 // TermFrequency=125) 197 // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788) 198 // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+ 199 // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 | 200 // +-------------+----------------+-----------+-----------+------+-----------+ 201 // 202 // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2) 203 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434) 204 // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+ 205 // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 | 206 // +-------------+------------+--------+----------+---------+------+---------+ 207 // 208 // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6, 209 // TermFrequency = 87) 210 // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 196) ALMOST FULL! 211 // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+ 212 // |Hit::kInvVal|0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 | 213 // +-------------+------------+---------+---------+---------+------+---------+ 214 // 215 // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0) 216 // FULL! 217 // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20 218 // -+ | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 219 // | 220 // +-------------+-------------+---------+----------+---------+------+---------+ 221 222 // Helpers to determine what state the posting list is in. IsFull(const PostingListUsed * posting_list_used)223 bool IsFull(const PostingListUsed* posting_list_used) const { 224 return GetSpecialHit(posting_list_used, /*index=*/0) 225 .ValueOrDie() 226 .is_valid() && 227 GetSpecialHit(posting_list_used, /*index=*/1) 228 .ValueOrDie() 229 .is_valid(); 230 } 231 IsAlmostFull(const PostingListUsed * posting_list_used)232 bool IsAlmostFull(const PostingListUsed* posting_list_used) const { 233 return !GetSpecialHit(posting_list_used, /*index=*/0) 234 .ValueOrDie() 235 .is_valid(); 236 } 237 IsEmpty(const PostingListUsed * posting_list_used)238 bool IsEmpty(const PostingListUsed* posting_list_used) const { 239 return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() == 240 posting_list_used->size_in_bytes() && 241 !GetSpecialHit(posting_list_used, /*index=*/1) 242 .ValueOrDie() 243 .is_valid(); 244 } 245 246 // Returns false if both special hits are invalid or if the offset value 247 // stored in the special hit is less than kSpecialHitsSize or greater than 248 // posting_list_used->size_in_bytes(). Returns true, otherwise. 249 bool IsPostingListValid(const PostingListUsed* posting_list_used) const; 250 251 // Prepend hit to a posting list that is in the ALMOST_FULL state. 252 // RETURNS: 253 // - OK, if successful 254 // - INVALID_ARGUMENT if hit is not less than the previously added hit. 255 libtextclassifier3::Status PrependHitToAlmostFull( 256 PostingListUsed* posting_list_used, const Hit& hit) const; 257 258 // Prepend hit to a posting list that is in the EMPTY state. This will always 259 // succeed because there are no pre-existing hits and no validly constructed 260 // posting list could fail to fit one hit. 261 void PrependHitToEmpty(PostingListUsed* posting_list_used, 262 const Hit& hit) const; 263 264 // Prepend hit to a posting list that is in the NOT_FULL state. 265 // RETURNS: 266 // - OK, if successful 267 // - INVALID_ARGUMENT if hit is not less than the previously added hit. 268 libtextclassifier3::Status PrependHitToNotFull( 269 PostingListUsed* posting_list_used, const Hit& hit, 270 uint32_t offset) const; 271 272 // Returns either 0 (full state), sizeof(Hit) (almost_full state) or 273 // a byte offset between kSpecialHitsSize and 274 // posting_list_used->size_in_bytes() (inclusive) (not_full state). 275 uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const; 276 277 // Sets the special hits to properly reflect what offset is (see layout 278 // comment for further details). 279 // 280 // Returns false if offset > posting_list_used->size_in_bytes() or offset is 281 // (kSpecialHitsSize, sizeof(Hit)) or offset is (sizeof(Hit), 0). True, 282 // otherwise. 283 bool SetStartByteOffset(PostingListUsed* posting_list_used, 284 uint32_t offset) const; 285 286 // Manipulate padded areas. We never store the same hit value twice 287 // so a delta of 0 is a pad byte. 288 289 // Returns offset of first non-pad byte. 290 uint32_t GetPadEnd(const PostingListUsed* posting_list_used, 291 uint32_t offset) const; 292 293 // Fill padding between offset start and offset end with 0s. 294 // Returns false if end > posting_list_used->size_in_bytes(). True, 295 // otherwise. 296 bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start, 297 uint32_t end) const; 298 299 // Helper for AppendHits/PopFrontHits. Adds limit number of hits to out or all 300 // hits in the posting list if the posting list contains less than limit 301 // number of hits. out can be NULL. 302 // 303 // NOTE: If called with limit=1, pop=true on a posting list that transitioned 304 // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting 305 // list to NOT_FULL. Instead it will leave it in a valid state, but it will be 306 // ALMOST_FULL. 307 // 308 // RETURNS: 309 // - OK on success 310 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 311 libtextclassifier3::Status GetHitsInternal( 312 const PostingListUsed* posting_list_used, uint32_t limit, bool pop, 313 std::vector<Hit>* out) const; 314 315 // Retrieves the value stored in the index-th special hit. 316 // 317 // RETURNS: 318 // - A valid Hit, on success 319 // - INVALID_ARGUMENT if index is not less than kNumSpecialData 320 libtextclassifier3::StatusOr<Hit> GetSpecialHit( 321 const PostingListUsed* posting_list_used, uint32_t index) const; 322 323 // Sets the value stored in the index-th special hit to val. If index is not 324 // less than kSpecialHitSize / sizeof(Hit), this has no effect. 325 bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index, 326 const Hit& val) const; 327 328 // Prepends hit to the memory region [offset - sizeof(Hit), offset] and 329 // returns the new beginning of the padded region. 330 // 331 // RETURNS: 332 // - The new beginning of the padded region, if successful. 333 // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and 334 // kSpecialHitsSize 335 libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed( 336 PostingListUsed* posting_list_used, const Hit& hit, 337 uint32_t offset) const; 338 339 // If hit has the flags and/or term frequency field, consumes the flags and/or 340 // term frequency at offset, updates hit to include the flag and/or term 341 // frequency and updates offset to reflect that the flag and/or term frequency 342 // fields have been consumed. 343 // 344 // RETURNS: 345 // - OK, if successful 346 // - INVALID_ARGUMENT if hit has a flags and/or term frequency field and 347 // offset + sizeof(Hit's flag) + sizeof(Hit's tf) >= 348 // posting_list_used->size_in_bytes() 349 // i.e. the posting list is not large enough to consume the hit's flags 350 // and term frequency fields 351 libtextclassifier3::Status ConsumeFlagsAndTermFrequencyIfPresent( 352 const PostingListUsed* posting_list_used, Hit* hit, 353 uint32_t* offset) const; 354 }; 355 356 } // namespace lib 357 } // namespace icing 358 359 #endif // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_ 360