xref: /aosp_15_r20/external/icing/icing/index/numeric/integer-index-bucket-util.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/numeric/integer-index-bucket-util.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <iterator>
20 #include <limits>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/index/numeric/integer-index-data.h"
25 
26 namespace icing {
27 namespace lib {
28 
29 namespace integer_index_bucket_util {
30 
31 namespace {
32 
33 // Helper function to determine if data slice [start, end) forms a "full
34 // single-range bucket".
35 //
36 // Full single-range bucket: keys of all data are identical and # of them exceed
37 // num_data_threshold.
38 //
39 // REQUIRES: data slice [start, end) are sorted by key.
WouldBeFullSingleRangeBucket(const std::vector<IntegerIndexData>::iterator & start,const std::vector<IntegerIndexData>::iterator & end,int32_t num_data_threshold)40 inline bool WouldBeFullSingleRangeBucket(
41     const std::vector<IntegerIndexData>::iterator& start,
42     const std::vector<IntegerIndexData>::iterator& end,
43     int32_t num_data_threshold) {
44   return std::distance(start, end) > num_data_threshold &&
45          start->key() == (end - 1)->key();
46 }
47 
48 // Helper function to determine if a bucket is full single-range.
49 //
50 // REQUIRES:
51 //   bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper
IsFullSingleRangeBucket(const DataRangeAndBucketInfo & bucket,int32_t num_data_threshold)52 inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket,
53                                     int32_t num_data_threshold) {
54   return bucket.key_lower == bucket.key_upper &&
55          WouldBeFullSingleRangeBucket(bucket.start, bucket.end,
56                                       num_data_threshold);
57 }
58 
59 // Helper function to append new bucket(s) with corresponding data slice for
60 // range [curr_key_lower, last_key] where last_key = (it_end - 1)->key().
61 //
62 // Also it handles an edge case:
63 // If data slice [it_start, it_end) forms a "full single-range bucket" (see
64 // WouldBeFullSingleRangeBucket for definition), then we have to put them into a
65 // single range bucket [last_key, last_key] instead of [curr_key_lower,
66 // last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]:
67 // - If the previous bucket exists and it is not a "full single-range bucket",
68 //   then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e.
69 //   change the previous bucket's key_upper to (last_key - 1). Then we will end
70 //   up having:
71 //   - [prev_bucket.key_lower, last_key - 1]
72 //   - [last_key, last_key]
73 // - Otherwise, we have to create [curr_key_lower, last_key - 1] with
74 //   empty data. Then we will end up having (Note: prev_bucket.key_upper ==
75 //   curr_key_lower - 1):
76 //   - [prev_bucket.key_lower, curr_key_lower - 1]
77 //   - [curr_key_lower, last_key - 1]
78 //   - [last_key, last_key]
79 // This will avoid split bucket being called too frequently.
80 // For example, original_key_lower = 0, original_key_upper = 50. If we have
81 // (num_data_threshold + 1) data with key = 20 and another data with key = 40:
82 // - Without this part, we will split them into [[0, 20], [21, 50]]. Then when
83 //   adding data with key = 10 next round, we will invoke split again and split
84 //   [0, 20] to [[0, 10], [11, 20]].
85 // - With this part, we will split them into [[0, 19], [20, 20], [21, 50]],
86 //   which will avoid splitting in the next round for key = 20.
87 //
88 // REQUIRES: it_start < it_end
AppendNewBuckets(const std::vector<IntegerIndexData>::iterator & it_start,const std::vector<IntegerIndexData>::iterator & it_end,int64_t curr_key_lower,int32_t num_data_threshold,std::vector<DataRangeAndBucketInfo> & results)89 void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start,
90                       const std::vector<IntegerIndexData>::iterator& it_end,
91                       int64_t curr_key_lower, int32_t num_data_threshold,
92                       std::vector<DataRangeAndBucketInfo>& results) {
93   int64_t last_key = (it_end - 1)->key();
94   if (curr_key_lower < last_key &&
95       WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) {
96     if (!results.empty() &&
97         !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
98       // Previous bucket is not full single-range, so merge it to now hold the
99       // range [prev_bucket.key_lower, last_key - 1].
100       results.back().key_upper = last_key - 1;
101     } else {
102       // There is either no previous bucket or the previous bucket is full
103       // single-range. So add an empty bucket for the range [curr_key_lower,
104       // last_key - 1].
105       results.push_back(DataRangeAndBucketInfo(it_start, it_start,
106                                                curr_key_lower, last_key - 1));
107     }
108     curr_key_lower = last_key;
109   }
110   results.push_back(
111       DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key));
112 }
113 
114 }  // namespace
115 
Split(std::vector<IntegerIndexData> & data,int64_t original_key_lower,int64_t original_key_upper,int32_t num_data_threshold)116 std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
117                                           int64_t original_key_lower,
118                                           int64_t original_key_upper,
119                                           int32_t num_data_threshold) {
120   // Early return if there is no need to split.
121   if (data.size() <= num_data_threshold) {
122     return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower,
123                                    original_key_upper)};
124   }
125 
126   // Sort data by key.
127   std::sort(
128       data.begin(), data.end(),
129       [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool {
130         return lhs.key() < rhs.key();
131       });
132 
133   std::vector<DataRangeAndBucketInfo> results;
134   int64_t curr_key_lower = original_key_lower;
135   // Sliding window [it_start, it_end) to separate data into different buckets.
136   auto it_start = data.begin();
137   auto it_end = data.begin();
138   while (it_end != data.end()) {
139     // Attempt to extend it_end by 1, but we have to include all data with the
140     // same key since they cannot be separated into different buckets. Also use
141     // extend_it_end to avoid modifying it_end directly. For some edge cases,
142     // the extension in a single round is extremely large (i.e. a lot of data
143     // have the same key), and we want to separate them. For example:
144     // - key = 0: 5 data
145     // - key = 1: num_data_threshold - 1 data
146     // In the second round, # of data in the sliding window will exceed the
147     // threshold. We want to separate all data with key = 0 into a single bucket
148     // instead of putting key = 0 and key = 1 together. Therefore, using
149     // extend_it_end allow us to preserve it_end of the previous round and be
150     // able to deal with this case.
151     auto extend_it_end = it_end + 1;
152     while (extend_it_end != data.end() &&
153            it_end->key() == extend_it_end->key()) {
154       ++extend_it_end;
155     }
156 
157     if (std::distance(it_start, extend_it_end) > num_data_threshold &&
158         it_start != it_end) {
159       // Split data between [it_start, it_end) into range [curr_key_lower,
160       // (it_end - 1)->key()].
161       AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
162                        results);
163 
164       // it_end at this moment won't be data.end(), so the last element of the
165       // new bucket can't have key == INT64_MAX. Therefore, it is safe to set
166       // curr_key_lower as ((it_end - 1)->key() + 1).
167       curr_key_lower = (it_end - 1)->key() + 1;
168       it_start = it_end;
169     }
170     it_end = extend_it_end;
171   }
172 
173   // Handle the final range [curr_key_lower, original_key_upper].
174   if (curr_key_lower <= original_key_upper) {
175     if (it_start != it_end) {
176       AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
177                        results);
178 
179       // AppendNewBuckets only handles range [curr_key_lower, (it_end -
180       // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1,
181       // original_key_upper] if needed.
182       int64_t last_key = (it_end - 1)->key();
183       if (last_key != std::numeric_limits<int64_t>::max() &&
184           last_key + 1 <= original_key_upper) {
185         if (!results.empty() &&
186             !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
187           results.back().key_upper = original_key_upper;
188         } else {
189           results.push_back(DataRangeAndBucketInfo(
190               it_start, it_start, last_key + 1, original_key_upper));
191         }
192       }
193     } else {
194       results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower,
195                                                original_key_upper));
196     }
197   }
198 
199   return results;
200 }
201 
202 }  // namespace integer_index_bucket_util
203 
204 }  // namespace lib
205 }  // namespace icing
206