1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/join/posting-list-join-data-accessor.h"
16
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <utility>
21 #include <vector>
22
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "gmock/gmock.h"
25 #include "gtest/gtest.h"
26 #include "icing/file/filesystem.h"
27 #include "icing/file/posting_list/flash-index-storage.h"
28 #include "icing/file/posting_list/posting-list-accessor.h"
29 #include "icing/file/posting_list/posting-list-common.h"
30 #include "icing/file/posting_list/posting-list-identifier.h"
31 #include "icing/join/document-id-to-join-info.h"
32 #include "icing/join/posting-list-join-data-serializer.h"
33 #include "icing/store/document-id.h"
34 #include "icing/store/namespace-id-fingerprint.h"
35 #include "icing/store/namespace-id.h"
36 #include "icing/testing/common-matchers.h"
37 #include "icing/testing/tmp-directory.h"
38
39 namespace icing {
40 namespace lib {
41
42 namespace {
43
44 using ::testing::ElementsAre;
45 using ::testing::ElementsAreArray;
46 using ::testing::Eq;
47 using ::testing::Lt;
48 using ::testing::Ne;
49 using ::testing::SizeIs;
50
51 using JoinDataType = DocumentIdToJoinInfo<NamespaceIdFingerprint>;
52
53 static constexpr NamespaceId kDefaultNamespaceId = 1;
54
55 class PostingListJoinDataAccessorTest : public ::testing::Test {
56 protected:
SetUp()57 void SetUp() override {
58 test_dir_ = GetTestTempDir() + "/test_dir";
59 file_name_ = test_dir_ + "/test_file.idx.index";
60
61 ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
62 ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
63
64 serializer_ =
65 std::make_unique<PostingListJoinDataSerializer<JoinDataType>>();
66
67 ICING_ASSERT_OK_AND_ASSIGN(
68 FlashIndexStorage flash_index_storage,
69 FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
70 flash_index_storage_ =
71 std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
72 }
73
TearDown()74 void TearDown() override {
75 flash_index_storage_.reset();
76 serializer_.reset();
77 ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
78 }
79
80 Filesystem filesystem_;
81 std::string test_dir_;
82 std::string file_name_;
83 std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> serializer_;
84 std::unique_ptr<FlashIndexStorage> flash_index_storage_;
85 };
86
CreateData(int num_data,DocumentId start_document_id,NamespaceId ref_namespace_id,uint64_t start_ref_hash_uri)87 std::vector<JoinDataType> CreateData(int num_data, DocumentId start_document_id,
88 NamespaceId ref_namespace_id,
89 uint64_t start_ref_hash_uri) {
90 std::vector<JoinDataType> data;
91 data.reserve(num_data);
92 for (int i = 0; i < num_data; ++i) {
93 data.push_back(JoinDataType(
94 start_document_id,
95 NamespaceIdFingerprint(ref_namespace_id,
96 /*fingerprint=*/start_ref_hash_uri)));
97
98 ++start_document_id;
99 ++start_ref_hash_uri;
100 }
101 return data;
102 }
103
TEST_F(PostingListJoinDataAccessorTest,DataAddAndRetrieveProperly)104 TEST_F(PostingListJoinDataAccessorTest, DataAddAndRetrieveProperly) {
105 ICING_ASSERT_OK_AND_ASSIGN(
106 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
107 PostingListJoinDataAccessor<JoinDataType>::Create(
108 flash_index_storage_.get(), serializer_.get()));
109 // Add some join data
110 std::vector<JoinDataType> data_vec =
111 CreateData(/*num_data=*/5, /*start_document_id=*/0,
112 /*ref_namespace_id=*/kDefaultNamespaceId,
113 /*start_ref_hash_uri=*/819);
114 for (const JoinDataType& data : data_vec) {
115 EXPECT_THAT(pl_accessor->PrependData(data), IsOk());
116 }
117 PostingListAccessor::FinalizeResult result =
118 std::move(*pl_accessor).Finalize();
119 EXPECT_THAT(result.status, IsOk());
120 EXPECT_THAT(result.id.block_index(), Eq(1));
121 EXPECT_THAT(result.id.posting_list_index(), Eq(0));
122
123 // Retrieve some data.
124 ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
125 flash_index_storage_->GetPostingList(result.id));
126 EXPECT_THAT(
127 serializer_->GetData(&pl_holder.posting_list),
128 IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
129 EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex));
130 }
131
TEST_F(PostingListJoinDataAccessorTest,PreexistingPLKeepOnSameBlock)132 TEST_F(PostingListJoinDataAccessorTest, PreexistingPLKeepOnSameBlock) {
133 ICING_ASSERT_OK_AND_ASSIGN(
134 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
135 PostingListJoinDataAccessor<JoinDataType>::Create(
136 flash_index_storage_.get(), serializer_.get()));
137 // Add a single data. This will fit in a min-sized posting list.
138 JoinDataType data1(
139 /*document_id=*/1,
140 NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/123));
141 ICING_ASSERT_OK(pl_accessor->PrependData(data1));
142 PostingListAccessor::FinalizeResult result1 =
143 std::move(*pl_accessor).Finalize();
144 ICING_ASSERT_OK(result1.status);
145 // Should be allocated to the first block.
146 ASSERT_THAT(result1.id.block_index(), Eq(1));
147 ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
148
149 // Add one more data. The minimum size for a posting list must be able to fit
150 // two data, so this should NOT cause the previous pl to be reallocated.
151 ICING_ASSERT_OK_AND_ASSIGN(
152 pl_accessor,
153 PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
154 flash_index_storage_.get(), serializer_.get(), result1.id));
155 JoinDataType data2(
156 /*document_id=*/2,
157 NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/456));
158 ICING_ASSERT_OK(pl_accessor->PrependData(data2));
159 PostingListAccessor::FinalizeResult result2 =
160 std::move(*pl_accessor).Finalize();
161 ICING_ASSERT_OK(result2.status);
162 // Should be in the same posting list.
163 EXPECT_THAT(result2.id, Eq(result1.id));
164
165 // The posting list at result2.id should hold all of the data that have been
166 // added.
167 ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
168 flash_index_storage_->GetPostingList(result2.id));
169 EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
170 IsOkAndHolds(ElementsAre(data2, data1)));
171 }
172
TEST_F(PostingListJoinDataAccessorTest,PreexistingPLReallocateToLargerPL)173 TEST_F(PostingListJoinDataAccessorTest, PreexistingPLReallocateToLargerPL) {
174 ICING_ASSERT_OK_AND_ASSIGN(
175 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
176 PostingListJoinDataAccessor<JoinDataType>::Create(
177 flash_index_storage_.get(), serializer_.get()));
178 // Adding 3 data should cause Finalize allocating a 56-byte posting list,
179 // which can store at most 4 data.
180 std::vector<JoinDataType> data_vec1 =
181 CreateData(/*num_data=*/3, /*start_document_id=*/0,
182 /*ref_namespace_id=*/kDefaultNamespaceId,
183 /*start_ref_hash_uri=*/819);
184 for (const JoinDataType& data : data_vec1) {
185 ICING_ASSERT_OK(pl_accessor->PrependData(data));
186 }
187 PostingListAccessor::FinalizeResult result1 =
188 std::move(*pl_accessor).Finalize();
189 ICING_ASSERT_OK(result1.status);
190 // Should be allocated to the first block.
191 ASSERT_THAT(result1.id.block_index(), Eq(1));
192 ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
193
194 // Now add more data.
195 ICING_ASSERT_OK_AND_ASSIGN(
196 pl_accessor,
197 PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
198 flash_index_storage_.get(), serializer_.get(), result1.id));
199 // The current posting list can fit 1 more data. Adding 12 more data should
200 // result in these data being moved to a larger posting list. Also the total
201 // size of these data won't exceed max size posting list, so there will be
202 // only one single posting list and no chain.
203 std::vector<JoinDataType> data_vec2 = CreateData(
204 /*num_data=*/12, /*start_document_id=*/data_vec1.back().document_id() + 1,
205 /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
206
207 for (const JoinDataType& data : data_vec2) {
208 ICING_ASSERT_OK(pl_accessor->PrependData(data));
209 }
210 PostingListAccessor::FinalizeResult result2 =
211 std::move(*pl_accessor).Finalize();
212 ICING_ASSERT_OK(result2.status);
213 // Should be allocated to the second (new) block because the posting list
214 // should grow beyond the size that the first block maintains.
215 EXPECT_THAT(result2.id.block_index(), Eq(2));
216 EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
217
218 // The posting list at result2.id should hold all of the data that have been
219 // added.
220 std::vector<JoinDataType> all_data_vec;
221 all_data_vec.reserve(data_vec1.size() + data_vec2.size());
222 all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
223 all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
224 ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
225 flash_index_storage_->GetPostingList(result2.id));
226 EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
227 IsOkAndHolds(ElementsAreArray(all_data_vec.rbegin(),
228 all_data_vec.rend())));
229 }
230
TEST_F(PostingListJoinDataAccessorTest,MultiBlockChainsBlocksProperly)231 TEST_F(PostingListJoinDataAccessorTest, MultiBlockChainsBlocksProperly) {
232 ICING_ASSERT_OK_AND_ASSIGN(
233 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
234 PostingListJoinDataAccessor<JoinDataType>::Create(
235 flash_index_storage_.get(), serializer_.get()));
236 // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
237 // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
238 // Adding 292 data should cause:
239 // - 2 max size posting lists being allocated to block 1 and block 2.
240 // - Chaining: block 2 -> block 1
241 std::vector<JoinDataType> data_vec = CreateData(
242 /*num_data=*/292, /*start_document_id=*/0,
243 /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
244 for (const JoinDataType& data : data_vec) {
245 ICING_ASSERT_OK(pl_accessor->PrependData(data));
246 }
247 PostingListAccessor::FinalizeResult result1 =
248 std::move(*pl_accessor).Finalize();
249 ICING_ASSERT_OK(result1.status);
250 PostingListIdentifier second_block_id = result1.id;
251 // Should be allocated to the second block.
252 EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
253 /*block_index=*/2, /*posting_list_index=*/0,
254 /*posting_list_index_bits=*/0)));
255
256 // We should be able to retrieve all data.
257 ICING_ASSERT_OK_AND_ASSIGN(
258 PostingListHolder pl_holder,
259 flash_index_storage_->GetPostingList(second_block_id));
260 // This pl_holder will only hold a posting list with the data that didn't fit
261 // on the first block.
262 ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
263 serializer_->GetData(&pl_holder.posting_list));
264 ASSERT_THAT(second_block_data, SizeIs(Lt(data_vec.size())));
265 auto first_block_data_start = data_vec.rbegin() + second_block_data.size();
266 EXPECT_THAT(second_block_data,
267 ElementsAreArray(data_vec.rbegin(), first_block_data_start));
268
269 // Now retrieve all of the data that were on the first block.
270 uint32_t first_block_id = pl_holder.next_block_index;
271 EXPECT_THAT(first_block_id, Eq(1));
272
273 PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
274 /*posting_list_index_bits=*/0);
275 ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
276 flash_index_storage_->GetPostingList(pl_id));
277 EXPECT_THAT(
278 serializer_->GetData(&pl_holder.posting_list),
279 IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend())));
280 }
281
TEST_F(PostingListJoinDataAccessorTest,PreexistingMultiBlockReusesBlocksProperly)282 TEST_F(PostingListJoinDataAccessorTest,
283 PreexistingMultiBlockReusesBlocksProperly) {
284 ICING_ASSERT_OK_AND_ASSIGN(
285 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
286 PostingListJoinDataAccessor<JoinDataType>::Create(
287 flash_index_storage_.get(), serializer_.get()));
288 // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
289 // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
290 // Adding 292 data will cause:
291 // - 2 max size posting lists being allocated to block 1 and block 2.
292 // - Chaining: block 2 -> block 1
293 std::vector<JoinDataType> data_vec1 = CreateData(
294 /*num_data=*/292, /*start_document_id=*/0,
295 /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
296 for (const JoinDataType& data : data_vec1) {
297 ICING_ASSERT_OK(pl_accessor->PrependData(data));
298 }
299 PostingListAccessor::FinalizeResult result1 =
300 std::move(*pl_accessor).Finalize();
301 ICING_ASSERT_OK(result1.status);
302 PostingListIdentifier first_add_id = result1.id;
303 EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
304 /*block_index=*/2, /*posting_list_index=*/0,
305 /*posting_list_index_bits=*/0)));
306
307 // Now add more data. These should fit on the existing second block and not
308 // fill it up.
309 ICING_ASSERT_OK_AND_ASSIGN(
310 pl_accessor,
311 PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
312 flash_index_storage_.get(), serializer_.get(), first_add_id));
313 std::vector<JoinDataType> data_vec2 = CreateData(
314 /*num_data=*/10, /*start_document_id=*/data_vec1.back().document_id() + 1,
315 /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
316 for (const JoinDataType& data : data_vec2) {
317 ICING_ASSERT_OK(pl_accessor->PrependData(data));
318 }
319 PostingListAccessor::FinalizeResult result2 =
320 std::move(*pl_accessor).Finalize();
321 ICING_ASSERT_OK(result2.status);
322 PostingListIdentifier second_add_id = result2.id;
323 EXPECT_THAT(second_add_id, Eq(first_add_id));
324
325 // We should be able to retrieve all data.
326 std::vector<JoinDataType> all_data_vec;
327 all_data_vec.reserve(data_vec1.size() + data_vec2.size());
328 all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
329 all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
330 ICING_ASSERT_OK_AND_ASSIGN(
331 PostingListHolder pl_holder,
332 flash_index_storage_->GetPostingList(second_add_id));
333 // This pl_holder will only hold a posting list with the data that didn't fit
334 // on the first block.
335 ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
336 serializer_->GetData(&pl_holder.posting_list));
337 ASSERT_THAT(second_block_data, SizeIs(Lt(all_data_vec.size())));
338 auto first_block_data_start =
339 all_data_vec.rbegin() + second_block_data.size();
340 EXPECT_THAT(second_block_data,
341 ElementsAreArray(all_data_vec.rbegin(), first_block_data_start));
342
343 // Now retrieve all of the data that were on the first block.
344 uint32_t first_block_id = pl_holder.next_block_index;
345 EXPECT_THAT(first_block_id, Eq(1));
346
347 PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
348 /*posting_list_index_bits=*/0);
349 ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
350 flash_index_storage_->GetPostingList(pl_id));
351 EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
352 IsOkAndHolds(ElementsAreArray(first_block_data_start,
353 all_data_vec.rend())));
354 }
355
TEST_F(PostingListJoinDataAccessorTest,InvalidDataShouldReturnInvalidArgument)356 TEST_F(PostingListJoinDataAccessorTest,
357 InvalidDataShouldReturnInvalidArgument) {
358 ICING_ASSERT_OK_AND_ASSIGN(
359 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
360 PostingListJoinDataAccessor<JoinDataType>::Create(
361 flash_index_storage_.get(), serializer_.get()));
362 JoinDataType invalid_data = JoinDataType::GetInvalid();
363 EXPECT_THAT(pl_accessor->PrependData(invalid_data),
364 StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
365 }
366
TEST_F(PostingListJoinDataAccessorTest,JoinDataNonIncreasingShouldReturnInvalidArgument)367 TEST_F(PostingListJoinDataAccessorTest,
368 JoinDataNonIncreasingShouldReturnInvalidArgument) {
369 ICING_ASSERT_OK_AND_ASSIGN(
370 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
371 PostingListJoinDataAccessor<JoinDataType>::Create(
372 flash_index_storage_.get(), serializer_.get()));
373 JoinDataType data1(
374 /*document_id=*/1,
375 NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/819));
376 ICING_ASSERT_OK(pl_accessor->PrependData(data1));
377
378 JoinDataType data2(
379 /*document_id=*/1,
380 NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/818));
381 EXPECT_THAT(pl_accessor->PrependData(data2),
382 StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
383
384 JoinDataType data3(
385 /*document_id=*/1,
386 NamespaceIdFingerprint(kDefaultNamespaceId - 1, /*fingerprint=*/820));
387 EXPECT_THAT(pl_accessor->PrependData(data3),
388 StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
389
390 JoinDataType data4(
391 /*document_id=*/0,
392 NamespaceIdFingerprint(kDefaultNamespaceId + 1, /*fingerprint=*/820));
393 EXPECT_THAT(pl_accessor->PrependData(data4),
394 StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
395 }
396
TEST_F(PostingListJoinDataAccessorTest,NewPostingListNoDataAddedShouldReturnInvalidArgument)397 TEST_F(PostingListJoinDataAccessorTest,
398 NewPostingListNoDataAddedShouldReturnInvalidArgument) {
399 ICING_ASSERT_OK_AND_ASSIGN(
400 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
401 PostingListJoinDataAccessor<JoinDataType>::Create(
402 flash_index_storage_.get(), serializer_.get()));
403 PostingListAccessor::FinalizeResult result =
404 std::move(*pl_accessor).Finalize();
405 EXPECT_THAT(result.status,
406 StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
407 }
408
TEST_F(PostingListJoinDataAccessorTest,PreexistingPostingListNoDataAddedShouldSucceed)409 TEST_F(PostingListJoinDataAccessorTest,
410 PreexistingPostingListNoDataAddedShouldSucceed) {
411 ICING_ASSERT_OK_AND_ASSIGN(
412 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor1,
413 PostingListJoinDataAccessor<JoinDataType>::Create(
414 flash_index_storage_.get(), serializer_.get()));
415 JoinDataType data1(
416 /*document_id=*/1,
417 NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/819));
418 ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
419 PostingListAccessor::FinalizeResult result1 =
420 std::move(*pl_accessor1).Finalize();
421 ICING_ASSERT_OK(result1.status);
422
423 ICING_ASSERT_OK_AND_ASSIGN(
424 std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor2,
425 PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
426 flash_index_storage_.get(), serializer_.get(), result1.id));
427 PostingListAccessor::FinalizeResult result2 =
428 std::move(*pl_accessor2).Finalize();
429 EXPECT_THAT(result2.status, IsOk());
430 }
431
432 } // namespace
433
434 } // namespace lib
435 } // namespace icing
436