1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ 16 #define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ 17 18 #include <cstdint> 19 #include <vector> 20 21 #include "icing/text_classifier/lib3/utils/base/status.h" 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/file/posting_list/posting-list-common.h" 24 #include "icing/file/posting_list/posting-list-used.h" 25 #include "icing/index/numeric/integer-index-data.h" 26 27 namespace icing { 28 namespace lib { 29 30 // A serializer class to serialize IntegerIndexData to PostingListUsed. 31 class PostingListIntegerIndexSerializer : public PostingListSerializer { 32 public: 33 using SpecialDataType = SpecialData<IntegerIndexData>; 34 static_assert(sizeof(SpecialDataType) == sizeof(IntegerIndexData), ""); 35 36 static constexpr uint32_t kSpecialDataSize = 37 kNumSpecialData * sizeof(SpecialDataType); 38 GetDataTypeBytes()39 uint32_t GetDataTypeBytes() const override { 40 return sizeof(IntegerIndexData); 41 } 42 GetMinPostingListSize()43 uint32_t GetMinPostingListSize() const override { 44 static constexpr uint32_t kMinPostingListSize = kSpecialDataSize; 45 static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, 46 "PostingListIndex must be small enough to fit in a " 47 "minimum-sized Posting List."); 48 49 return kMinPostingListSize; 50 } 51 52 uint32_t GetMinPostingListSizeToFit( 53 const PostingListUsed* posting_list_used) const override; 54 55 uint32_t GetBytesUsed( 56 const PostingListUsed* posting_list_used) const override; 57 58 void Clear(PostingListUsed* posting_list_used) const override; 59 60 libtextclassifier3::Status MoveFrom(PostingListUsed* dst, 61 PostingListUsed* src) const override; 62 63 // Prepend an IntegerIndexData to the posting list. 64 // 65 // RETURNS: 66 // - INVALID_ARGUMENT if !data.is_valid() or if data is not less than the 67 // previously added data. 68 // - RESOURCE_EXHAUSTED if there is no more room to add data to the posting 69 // list. 70 libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used, 71 const IntegerIndexData& data) const; 72 73 // Prepend multiple IntegerIndexData to the posting list. Data should be 74 // sorted in ascending order (as defined by the less than operator for 75 // IntegerIndexData) 76 // If keep_prepended is true, whatever could be prepended is kept, otherwise 77 // the posting list is reverted and left in its original state. 78 // 79 // RETURNS: 80 // The number of data that have been prepended to the posting list. If 81 // keep_prepended is false and reverted, then it returns 0. 82 libtextclassifier3::StatusOr<uint32_t> PrependDataArray( 83 PostingListUsed* posting_list_used, const IntegerIndexData* array, 84 uint32_t num_data, bool keep_prepended) const; 85 86 // Retrieves all data stored in the posting list. 87 // 88 // RETURNS: 89 // - On success, a vector of IntegerIndexData sorted by the reverse order of 90 // prepending. 91 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 92 libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> GetData( 93 const PostingListUsed* posting_list_used) const; 94 95 // Same as GetData but appends data to data_arr_out. 96 // 97 // RETURNS: 98 // - OK on success, and data_arr_out will be appended IntegerIndexData 99 // sorted by the reverse order of prepending. 100 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 101 libtextclassifier3::Status GetData( 102 const PostingListUsed* posting_list_used, 103 std::vector<IntegerIndexData>* data_arr_out) const; 104 105 // Undo the last num_data data prepended. If num_data > number of data, then 106 // we clear all data. 107 // 108 // RETURNS: 109 // - OK on success 110 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 111 libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used, 112 uint32_t num_data) const; 113 114 // Helper function to determine if posting list is full. IsFull(const PostingListUsed * posting_list_used)115 bool IsFull(const PostingListUsed* posting_list_used) const { 116 return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && 117 GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); 118 } 119 120 private: 121 // Posting list layout formats: 122 // 123 // NOT_FULL 124 // +-special-data-0--+-special-data-1--+------------+-----------------------+ 125 // | | | | | 126 // |data-start-offset| Data::Invalid | 0x00000000 | (compressed) data | 127 // | | | | | 128 // +-----------------+-----------------+------------+-----------------------+ 129 // 130 // ALMOST_FULL 131 // +-special-data-0--+-special-data-1--+-----+------------------------------+ 132 // | | | | | 133 // | Data::Invalid | 1st data |(pad)| (compressed) data | 134 // | | | | | 135 // +-----------------+-----------------+-----+------------------------------+ 136 // 137 // FULL 138 // +-special-data-0--+-special-data-1--+-----+------------------------------+ 139 // | | | | | 140 // | 1st data | 2nd data |(pad)| (compressed) data | 141 // | | | | | 142 // +-----------------+-----------------+-----+------------------------------+ 143 // 144 // The first two uncompressed (special) data also implicitly encode 145 // information about the size of the compressed data region. 146 // 147 // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte 148 // offset of the start of the compressed data. Thus, the size of the 149 // compressed data is 150 // posting_list_used->size_in_bytes() - special_data_0.data_start_offset(). 151 // 152 // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region 153 // starts somewhere between 154 // [kSpecialDataSize, kSpecialDataSize + sizeof(IntegerIndexData) - 1] and 155 // ends at posting_list_used->size_in_bytes() - 1. 156 // 157 // EXAMPLE 158 // Posting list storage. Posting list size: 36 bytes 159 // 160 // EMPTY! 161 // +--- byte 0-11 ---+----- 12-23 -----+-------------- 24-35 ---------------+ 162 // | | | | 163 // | 36 | Data::Invalid | 0x00000000 | 164 // | | | | 165 // +-----------------+-----------------+------------------------------------+ 166 // 167 // Add IntegerIndexData(0x0FFFFCC3, 5) 168 // (DocumentId = 12, SectionId = 3; Key = 5) 169 // (VarInt64(5) is encoded as 10 (b'1010), requires 1 byte) 170 // NOT FULL! 171 // +--- byte 0-11 ---+----- 12-23 -----+------- 24-30 -------+--- 31-35 ----+ 172 // | | | | 0x0FFFFCC3 | 173 // | 31 | Data::Invalid | 0x00000000 | VI64(5) | 174 // | | | | | 175 // +-----------------+-----------------+---------------------+--------------+ 176 // 177 // Add IntegerIndexData(0x0FFFFB40, -2) 178 // (DocumentId = 18, SectionId = 0; Key = -2) 179 // (VarInt64(-2) is encoded as 3 (b'11), requires 1 byte) 180 // Previous IntegerIndexData BasicHit delta varint encoding: 181 // 0x0FFFFCC3 - 0x0FFFFB40 = 387, VarUnsignedInt(387) requires 2 bytes 182 // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+--- 28-32 ----+ 33-35 -+ 183 // | | | | 0x0FFFFB40 |VUI(387)| 184 // | 28 | Data::Invalid | 0x00 | VI64(-2) |VI64(5) | 185 // | | | | | | 186 // +-----------------+-----------------+------------+--------------+--------+ 187 // 188 // Add IntegerIndexData(0x0FFFFA4A, 3) 189 // (DocumentId = 22, SectionId = 10; Key = 3) 190 // (VarInt64(3) is encoded as 6 (b'110), requires 1 byte) 191 // Previous IntegerIndexData BasicHit delta varint encoding: 192 // 0x0FFFFB40 - 0x0FFFFA4A = 246, VarUnsignedInt(246) requires 2 bytes 193 // +--- byte 0-11 ---+----- 12-23 -----+---+--- 25-29 ----+ 30-32 -+ 33-35 -+ 194 // | | | | 0x0FFFFA4A |VUI(246)|VUI(387)| 195 // | 25 | Data::Invalid | | VI64(3) |VI64(-2)|VI64(5) | 196 // | | | | | | | 197 // +-----------------+-----------------+---+--------------+--------+--------+ 198 // 199 // Add IntegerIndexData(0x0FFFFA01, -4) 200 // (DocumentId = 23, SectionId = 1; Key = -4) 201 // (No VarInt64 for key, since it is stored in special data section) 202 // Previous IntegerIndexData BasicHit delta varint encoding: 203 // 0x0FFFFA4A - 0x0FFFFA01 = 73, VarUnsignedInt(73) requires 1 byte) 204 // ALMOST_FULL! 205 // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+28-29+ 30-32 -+ 33-35 -+ 206 // | | 0x0FFFFA01 | |(73) |VUI(246)|VUI(387)| 207 // | Data::Invalid | 0xFFFFFFFF | (pad) |(3) |VI64(-2)|VI64(5) | 208 // | | 0xFFFFFFFC | | | | | 209 // +-----------------+-----------------+------------+-----+--------+--------+ 210 // 211 // Add IntegerIndexData(0x0FFFF904, 0) 212 // (DocumentId = 27, SectionId = 4; Key = 0) 213 // (No VarInt64 for key, since it is stored in special data section) 214 // Previous IntegerIndexData: 215 // Since 0x0FFFFA01 - 0x0FFFF904 = 253 and VarInt64(-4) is encoded as 7 216 // (b'111), it requires only 3 bytes after compression. It's able to fit 217 // into the padding section. 218 // Still ALMOST_FULL! 219 // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+ 220 // | | 0x0FFFF904 | |VUI(253)|(73) |VUI(246)|VUI(387)| 221 // | Data::Invalid | 0x00000000 | |VI64(-4)|(3) |VI64(-2)|VI64(5) | 222 // | | 0x00000000 | | | | | | 223 // +-----------------+-----------------+---+--------+-----+--------+--------+ 224 // 225 // Add IntegerIndexData(0x0FFFF8C3, -1) 226 // (DocumentId = 28, SectionId = 3; Key = -1) 227 // (No VarInt64 for key, since it is stored in special data section) 228 // (No VarUnsignedInt for previous IntegerIndexData BasicHit) 229 // FULL! 230 // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+ 231 // | 0x0FFFF8C3 | 0x0FFFF904 | |VUI(253)|(73) |VUI(246)|VUI(387)| 232 // | 0xFFFFFFFF | 0x00000000 | |VI64(-4)|(3) |VI64(-2)|VI64(5) | 233 // | 0xFFFFFFFF | 0x00000000 | | | | | | 234 // +-----------------+-----------------+---+--------+-----+--------+--------+ 235 236 // Helpers to determine what state the posting list is in. IsAlmostFull(const PostingListUsed * posting_list_used)237 bool IsAlmostFull(const PostingListUsed* posting_list_used) const { 238 return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && 239 GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); 240 } 241 IsEmpty(const PostingListUsed * posting_list_used)242 bool IsEmpty(const PostingListUsed* posting_list_used) const { 243 return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() == 244 posting_list_used->size_in_bytes() && 245 !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); 246 } 247 248 // Returns false if both special data are invalid or if data start offset 249 // stored in the special data is less than kSpecialDataSize or greater than 250 // posting_list_used->size_in_bytes(). Returns true, otherwise. 251 bool IsPostingListValid(const PostingListUsed* posting_list_used) const; 252 253 // Prepend data to a posting list that is in the ALMOST_FULL state. 254 // 255 // RETURNS: 256 // - OK, if successful 257 // - INVALID_ARGUMENT if data is not less than the previously added data. 258 libtextclassifier3::Status PrependDataToAlmostFull( 259 PostingListUsed* posting_list_used, const IntegerIndexData& data) const; 260 261 // Prepend data to a posting list that is in the EMPTY state. This will always 262 // succeed because there are no pre-existing data and no validly constructed 263 // posting list could fail to fit one data. 264 void PrependDataToEmpty(PostingListUsed* posting_list_used, 265 const IntegerIndexData& data) const; 266 267 // Prepend data to a posting list that is in the NOT_FULL state. 268 // 269 // RETURNS: 270 // - OK, if successful 271 // - INVALID_ARGUMENT if data is not less than the previously added data. 272 libtextclassifier3::Status PrependDataToNotFull( 273 PostingListUsed* posting_list_used, const IntegerIndexData& data, 274 uint32_t offset) const; 275 276 // Returns either 0 (FULL state), sizeof(IntegerIndexData) (ALMOST_FULL state) 277 // or a byte offset between kSpecialDataSize and 278 // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state). 279 uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const; 280 281 // Sets special data 0 to properly reflect what start byte offset is (see 282 // layout comment for further details). 283 // 284 // Returns false if offset > posting_list_used->size_in_bytes() or offset is 285 // in range (kSpecialDataSize, sizeof(IntegerIndexData)) or 286 // (sizeof(IntegerIndexData), 0). True, otherwise. 287 bool SetStartByteOffset(PostingListUsed* posting_list_used, 288 uint32_t offset) const; 289 290 // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out 291 // or all data in the posting list if the posting list contains less than 292 // limit number of data. out can be NULL. 293 // 294 // NOTE: If called with limit=1, pop=true on a posting list that transitioned 295 // from NOT_FULL directly to FULL, GetDataInternal will not return the posting 296 // list to NOT_FULL. Instead it will leave it in a valid state, but it will be 297 // ALMOST_FULL. 298 // 299 // RETURNS: 300 // - OK on success 301 // - INTERNAL_ERROR if the posting list has been corrupted somehow. 302 libtextclassifier3::Status GetDataInternal( 303 const PostingListUsed* posting_list_used, uint32_t limit, bool pop, 304 std::vector<IntegerIndexData>* out) const; 305 306 // Retrieves the value stored in the index-th special data. 307 // 308 // REQUIRES: 309 // 0 <= index < kNumSpecialData. 310 // 311 // RETURNS: 312 // - A valid SpecialData<IntegerIndexData>. 313 SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used, 314 uint32_t index) const; 315 316 // Sets the value stored in the index-th special data to special_data. 317 // 318 // REQUIRES: 319 // 0 <= index < kNumSpecialData. 320 void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index, 321 const SpecialDataType& special_data) const; 322 323 // Prepends data to the memory region [offset - sizeof(IntegerIndexData), 324 // offset - 1] and returns the new beginning of the region. 325 // 326 // RETURNS: 327 // - The new beginning of the padded region, if successful. 328 // - INVALID_ARGUMENT if data will not fit (uncompressed) between 329 // [kSpecialDataSize, offset - 1] 330 libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed( 331 PostingListUsed* posting_list_used, const IntegerIndexData& data, 332 uint32_t offset) const; 333 }; 334 335 } // namespace lib 336 } // namespace icing 337 338 #endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_ 339