1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
16 #define ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
17
18 #include <cstdint>
19 #include <memory>
20 #include <utility>
21 #include <vector>
22
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/canonical_errors.h"
26 #include "icing/file/posting_list/flash-index-storage.h"
27 #include "icing/file/posting_list/index-block.h"
28 #include "icing/file/posting_list/posting-list-accessor.h"
29 #include "icing/file/posting_list/posting-list-common.h"
30 #include "icing/file/posting_list/posting-list-identifier.h"
31 #include "icing/file/posting_list/posting-list-used.h"
32 #include "icing/join/posting-list-join-data-serializer.h"
33 #include "icing/legacy/index/icing-bit-util.h"
34 #include "icing/util/status-macros.h"
35
36 namespace icing {
37 namespace lib {
38
39 // This class is used to provide a simple abstraction for adding join data to
40 // posting lists. PostingListJoinDataAccessor handles:
41 // 1) selection of properly-sized posting lists for the accumulated join index
42 // data during Finalize()
43 // 2) chaining of max-sized posting lists.
44 template <typename JoinDataType>
45 class PostingListJoinDataAccessor : public PostingListAccessor {
46 public:
47 // Creates an empty PostingListJoinDataAccessor.
48 //
49 // RETURNS:
50 // - On success, a valid instance of PostingListJoinDataAccessor
51 // - INVALID_ARGUMENT error if storage has an invalid block_size.
52 static libtextclassifier3::StatusOr<
53 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
54 Create(FlashIndexStorage* storage,
55 PostingListJoinDataSerializer<JoinDataType>* serializer);
56
57 // Creates a PostingListJoinDataAccessor with an existing posting list
58 // identified by existing_posting_list_id.
59 //
60 // RETURNS:
61 // - On success, a valid instance of PostingListJoinDataAccessor
62 // - INVALID_ARGUMENT if storage has an invalid block_size.
63 static libtextclassifier3::StatusOr<
64 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
65 CreateFromExisting(FlashIndexStorage* storage,
66 PostingListJoinDataSerializer<JoinDataType>* serializer,
67 PostingListIdentifier existing_posting_list_id);
68
GetSerializer()69 PostingListSerializer* GetSerializer() override { return serializer_; }
70
71 // Retrieves the next batch of data in the posting list chain.
72 //
73 // RETURNS:
74 // - On success, a vector of join data in the posting list chain
75 // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
76 // Create.
77 // - INTERNAL_ERROR if unable to read the next posting list in the chain or
78 // if the posting list has been corrupted somehow.
79 libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetNextDataBatch();
80
81 // Prepends one data. This may result in flushing the posting list to disk (if
82 // the PostingListJoinDataAccessor holds a max-sized posting list that is
83 // full) or freeing a pre-existing posting list if it is too small to fit all
84 // data necessary.
85 //
86 // RETURNS:
87 // - OK, on success
88 // - INVALID_ARGUMENT if !data.is_valid() or if data is greater than the
89 // previously added data.
90 // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
91 // posting list.
92 libtextclassifier3::Status PrependData(const JoinDataType& data);
93
94 private:
PostingListJoinDataAccessor(FlashIndexStorage * storage,PostingListUsed in_memory_posting_list,PostingListJoinDataSerializer<JoinDataType> * serializer)95 explicit PostingListJoinDataAccessor(
96 FlashIndexStorage* storage, PostingListUsed in_memory_posting_list,
97 PostingListJoinDataSerializer<JoinDataType>* serializer)
98 : PostingListAccessor(storage, std::move(in_memory_posting_list)),
99 serializer_(serializer) {}
100
101 PostingListJoinDataSerializer<JoinDataType>* serializer_; // Does not own.
102 };
103
104 template <typename JoinDataType>
105 /* static */ libtextclassifier3::StatusOr<
106 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
Create(FlashIndexStorage * storage,PostingListJoinDataSerializer<JoinDataType> * serializer)107 PostingListJoinDataAccessor<JoinDataType>::Create(
108 FlashIndexStorage* storage,
109 PostingListJoinDataSerializer<JoinDataType>* serializer) {
110 uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
111 storage->block_size(), serializer->GetDataTypeBytes());
112 ICING_ASSIGN_OR_RETURN(PostingListUsed in_memory_posting_list,
113 PostingListUsed::CreateFromUnitializedRegion(
114 serializer, max_posting_list_bytes));
115 return std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>(
116 new PostingListJoinDataAccessor<JoinDataType>(
117 storage, std::move(in_memory_posting_list), serializer));
118 }
119
120 template <typename JoinDataType>
121 /* static */ libtextclassifier3::StatusOr<
122 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
CreateFromExisting(FlashIndexStorage * storage,PostingListJoinDataSerializer<JoinDataType> * serializer,PostingListIdentifier existing_posting_list_id)123 PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
124 FlashIndexStorage* storage,
125 PostingListJoinDataSerializer<JoinDataType>* serializer,
126 PostingListIdentifier existing_posting_list_id) {
127 ICING_ASSIGN_OR_RETURN(
128 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
129 Create(storage, serializer));
130 ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
131 storage->GetPostingList(existing_posting_list_id));
132 pl_accessor->preexisting_posting_list_ =
133 std::make_unique<PostingListHolder>(std::move(holder));
134 return pl_accessor;
135 }
136
137 // Returns the next batch of join data for the provided posting list.
138 template <typename JoinDataType>
139 libtextclassifier3::StatusOr<std::vector<JoinDataType>>
GetNextDataBatch()140 PostingListJoinDataAccessor<JoinDataType>::GetNextDataBatch() {
141 if (preexisting_posting_list_ == nullptr) {
142 if (has_reached_posting_list_chain_end_) {
143 return std::vector<JoinDataType>();
144 }
145 return absl_ports::FailedPreconditionError(
146 "Cannot retrieve data from a PostingListJoinDataAccessor that was not "
147 "created from a preexisting posting list.");
148 }
149 ICING_ASSIGN_OR_RETURN(
150 std::vector<JoinDataType> batch,
151 serializer_->GetData(&preexisting_posting_list_->posting_list));
152 uint32_t next_block_index = kInvalidBlockIndex;
153 // Posting lists will only be chained when they are max-sized, in which case
154 // next_block_index will point to the next block for the next posting list.
155 // Otherwise, next_block_index can be kInvalidBlockIndex or be used to point
156 // to the next free list block, which is not relevant here.
157 if (preexisting_posting_list_->posting_list.size_in_bytes() ==
158 storage_->max_posting_list_bytes()) {
159 next_block_index = preexisting_posting_list_->next_block_index;
160 }
161
162 if (next_block_index != kInvalidBlockIndex) {
163 // Since we only have to deal with next block for max-sized posting list
164 // block, max_num_posting_lists is 1 and posting_list_index_bits is
165 // BitsToStore(1).
166 PostingListIdentifier next_posting_list_id(
167 next_block_index, /*posting_list_index=*/0,
168 /*posting_list_index_bits=*/BitsToStore(1));
169 ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
170 storage_->GetPostingList(next_posting_list_id));
171 preexisting_posting_list_ =
172 std::make_unique<PostingListHolder>(std::move(holder));
173 } else {
174 has_reached_posting_list_chain_end_ = true;
175 preexisting_posting_list_.reset();
176 }
177 return batch;
178 }
179
180 template <typename JoinDataType>
181 libtextclassifier3::Status
PrependData(const JoinDataType & data)182 PostingListJoinDataAccessor<JoinDataType>::PrependData(
183 const JoinDataType& data) {
184 PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr)
185 ? preexisting_posting_list_->posting_list
186 : in_memory_posting_list_;
187 libtextclassifier3::Status status =
188 serializer_->PrependData(&active_pl, data);
189 if (!absl_ports::IsResourceExhausted(status)) {
190 return status;
191 }
192 // There is no more room to add data to this current posting list! Therefore,
193 // we need to either move those data to a larger posting list or flush this
194 // posting list and create another max-sized posting list in the chain.
195 if (preexisting_posting_list_ != nullptr) {
196 ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
197 } else {
198 ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
199 }
200
201 // Re-add data. Should always fit since we just cleared
202 // in_memory_posting_list_. It's fine to explicitly reference
203 // in_memory_posting_list_ here because there's no way of reaching this line
204 // while preexisting_posting_list_ is still in use.
205 return serializer_->PrependData(&in_memory_posting_list_, data);
206 }
207
208 } // namespace lib
209 } // namespace icing
210
211 #endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
212