1 // Copyright (C) 2023 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ 16 #define ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ 17 18 #include <cstdint> 19 #include <utility> 20 #include <vector> 21 22 #include "icing/index/numeric/integer-index-data.h" 23 24 namespace icing { 25 namespace lib { 26 27 namespace integer_index_bucket_util { 28 29 // A wrapper struct that contains information of a bucket. 30 // - The bucket contains data within the iterator [start, end). 31 // - Bucket range is [key_lower, key_upper], and all data within [start, end) 32 // should have keys in the bucket range. 33 // 34 // Note: the caller should make sure the lifecycle of data vector is longer than 35 // instances of this wrapper struct. 36 struct DataRangeAndBucketInfo { 37 std::vector<IntegerIndexData>::iterator start; 38 std::vector<IntegerIndexData>::iterator end; 39 int64_t key_lower; 40 int64_t key_upper; 41 DataRangeAndBucketInfoDataRangeAndBucketInfo42 explicit DataRangeAndBucketInfo( 43 std::vector<IntegerIndexData>::iterator start_in, 44 std::vector<IntegerIndexData>::iterator end_in, int64_t key_lower_in, 45 int64_t key_upper_in) 46 : start(std::move(start_in)), 47 end(std::move(end_in)), 48 key_lower(key_lower_in), 49 key_upper(key_upper_in) {} 50 }; 51 52 // Helper function to split data (that are originally in a bucket with range 53 // [original_key_lower, original_key_upper]) into different buckets according to 54 // num_data_threshold. 55 // - The input vector `data` will be sorted by key in ascending order (unless 56 // there's no need to split in which case data is returned unmodified) 57 // - Data with the same key will be in the same bucket even if # of them exceed 58 // num_data_threshold. 59 // - Range of all buckets will be disjoint, and the range union will be 60 // [original_key_lower, original_key_upper]. 61 // - Data slice (i.e. [start, end)) can be empty. 62 // 63 // REQUIRES: 64 // - original_key_lower < original_key_upper 65 // - num_data_threshold > 0 66 // - Keys of all data are in range [original_key_lower, original_key_upper] 67 // 68 // Returns: a vector of DataRangeAndBucketInfo that contain all bucket info 69 // after splitting. Also the returned vector should contain at least one 70 // bucket, otherwise it is considered an error. 71 std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, 72 int64_t original_key_lower, 73 int64_t original_key_upper, 74 int32_t num_data_threshold); 75 76 } // namespace integer_index_bucket_util 77 78 } // namespace lib 79 } // namespace icing 80 81 #endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ 82