1*8b6cd535SAndroid Build Coastguard Worker // Copyright (C) 2022 Google LLC 2*8b6cd535SAndroid Build Coastguard Worker // 3*8b6cd535SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License"); 4*8b6cd535SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License. 5*8b6cd535SAndroid Build Coastguard Worker // You may obtain a copy of the License at 6*8b6cd535SAndroid Build Coastguard Worker // 7*8b6cd535SAndroid Build Coastguard Worker // http://www.apache.org/licenses/LICENSE-2.0 8*8b6cd535SAndroid Build Coastguard Worker // 9*8b6cd535SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software 10*8b6cd535SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS, 11*8b6cd535SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*8b6cd535SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and 13*8b6cd535SAndroid Build Coastguard Worker // limitations under the License. 14*8b6cd535SAndroid Build Coastguard Worker 15*8b6cd535SAndroid Build Coastguard Worker #ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_ 16*8b6cd535SAndroid Build Coastguard Worker #define ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_ 17*8b6cd535SAndroid Build Coastguard Worker 18*8b6cd535SAndroid Build Coastguard Worker #include <cstdint> 19*8b6cd535SAndroid Build Coastguard Worker #include <memory> 20*8b6cd535SAndroid Build Coastguard Worker #include <string> 21*8b6cd535SAndroid Build Coastguard Worker #include <string_view> 22*8b6cd535SAndroid Build Coastguard Worker #include <vector> 23*8b6cd535SAndroid Build Coastguard Worker 24*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/status.h" 25*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h" 26*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/file-backed-vector.h" 27*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/filesystem.h" 28*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/memory-mapped-file.h" 29*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/persistent-storage.h" 30*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/posting_list/flash-index-storage.h" 31*8b6cd535SAndroid Build Coastguard Worker #include "icing/file/posting_list/posting-list-identifier.h" 32*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/iterator/doc-hit-info-iterator.h" 33*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/numeric/integer-index-data.h" 34*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/numeric/posting-list-integer-index-serializer.h" 35*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/section.h" 36*8b6cd535SAndroid Build Coastguard Worker #include "icing/store/document-id.h" 37*8b6cd535SAndroid Build Coastguard Worker #include "icing/util/crc32.h" 38*8b6cd535SAndroid Build Coastguard Worker 39*8b6cd535SAndroid Build Coastguard Worker namespace icing { 40*8b6cd535SAndroid Build Coastguard Worker namespace lib { 41*8b6cd535SAndroid Build Coastguard Worker 42*8b6cd535SAndroid Build Coastguard Worker // IntegerIndexStorage: a class for indexing (persistent storage) and searching 43*8b6cd535SAndroid Build Coastguard Worker // contents of integer type sections in documents. 44*8b6cd535SAndroid Build Coastguard Worker // - Accepts new integer contents (a.k.a keys) and adds records (BasicHit, key) 45*8b6cd535SAndroid Build Coastguard Worker // into the integer index. 46*8b6cd535SAndroid Build Coastguard Worker // - Stores records (BasicHit, key) in posting lists and compresses them. 47*8b6cd535SAndroid Build Coastguard Worker // - Bucketizes these records by key to make range query more efficient and 48*8b6cd535SAndroid Build Coastguard Worker // manages them with the corresponding posting lists. 49*8b6cd535SAndroid Build Coastguard Worker // - When a posting list reaches the max size and is full, the mechanism of 50*8b6cd535SAndroid Build Coastguard Worker // PostingListAccessor is to create another new (max-size) posting list and 51*8b6cd535SAndroid Build Coastguard Worker // chain them together. 52*8b6cd535SAndroid Build Coastguard Worker // - It will be inefficient if we store all records in the same PL chain. E.g. 53*8b6cd535SAndroid Build Coastguard Worker // small range query needs to iterate through the whole PL chain but skips a 54*8b6cd535SAndroid Build Coastguard Worker // lot of non-relevant records (whose keys don't belong to the query range). 55*8b6cd535SAndroid Build Coastguard Worker // - Therefore, we implement the splitting mechanism to split a full max-size 56*8b6cd535SAndroid Build Coastguard Worker // posting list. Also adjust range of the original bucket and add new 57*8b6cd535SAndroid Build Coastguard Worker // buckets. 58*8b6cd535SAndroid Build Coastguard Worker // - Ranges of all buckets are disjoint and the union of them is [INT64_MIN, 59*8b6cd535SAndroid Build Coastguard Worker // INT64_MAX]. 60*8b6cd535SAndroid Build Coastguard Worker // - Buckets should be sorted, so we can do binary search to find the desired 61*8b6cd535SAndroid Build Coastguard Worker // bucket(s). However, we may split a bucket into several buckets, and the 62*8b6cd535SAndroid Build Coastguard Worker // cost to insert newly created buckets is high. 63*8b6cd535SAndroid Build Coastguard Worker // - Thus, we introduce an unsorted bucket array for newly created buckets, 64*8b6cd535SAndroid Build Coastguard Worker // and merge unsorted buckets into the sorted bucket array only if length of 65*8b6cd535SAndroid Build Coastguard Worker // the unsorted bucket array exceeds the threshold. This mechanism will 66*8b6cd535SAndroid Build Coastguard Worker // reduce # of merging events and amortize the overall cost for bucket order 67*8b6cd535SAndroid Build Coastguard Worker // maintenance. 68*8b6cd535SAndroid Build Coastguard Worker // Note: some tree data structures (e.g. segment tree, B+ tree) maintain the 69*8b6cd535SAndroid Build Coastguard Worker // bucket order more efficiently than the sorted/unsorted bucket array 70*8b6cd535SAndroid Build Coastguard Worker // mechanism, but the implementation is more complicated and doesn't improve 71*8b6cd535SAndroid Build Coastguard Worker // the performance too much according to our analysis, so currently we 72*8b6cd535SAndroid Build Coastguard Worker // choose sorted/unsorted bucket array. 73*8b6cd535SAndroid Build Coastguard Worker // - Then we do binary search on the sorted bucket array and sequential search 74*8b6cd535SAndroid Build Coastguard Worker // on the unsorted bucket array. 75*8b6cd535SAndroid Build Coastguard Worker class IntegerIndexStorage : public PersistentStorage { 76*8b6cd535SAndroid Build Coastguard Worker public: 77*8b6cd535SAndroid Build Coastguard Worker struct Info { 78*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kMagic = 0x6470e547; 79*8b6cd535SAndroid Build Coastguard Worker 80*8b6cd535SAndroid Build Coastguard Worker int32_t magic; 81*8b6cd535SAndroid Build Coastguard Worker int32_t num_data; 82*8b6cd535SAndroid Build Coastguard Worker GetChecksumInfo83*8b6cd535SAndroid Build Coastguard Worker Crc32 GetChecksum() const { 84*8b6cd535SAndroid Build Coastguard Worker return Crc32( 85*8b6cd535SAndroid Build Coastguard Worker std::string_view(reinterpret_cast<const char*>(this), sizeof(Info))); 86*8b6cd535SAndroid Build Coastguard Worker } 87*8b6cd535SAndroid Build Coastguard Worker } __attribute__((packed)); 88*8b6cd535SAndroid Build Coastguard Worker static_assert(sizeof(Info) == 8, ""); 89*8b6cd535SAndroid Build Coastguard Worker 90*8b6cd535SAndroid Build Coastguard Worker // Bucket 91*8b6cd535SAndroid Build Coastguard Worker class Bucket { 92*8b6cd535SAndroid Build Coastguard Worker public: 93*8b6cd535SAndroid Build Coastguard Worker // Absolute max # of buckets allowed. Since the absolute max file size of 94*8b6cd535SAndroid Build Coastguard Worker // FileBackedVector on 32-bit platform is ~2^28, we can at most have ~13.4M 95*8b6cd535SAndroid Build Coastguard Worker // buckets. To make it power of 2, round it down to 2^23. Also since we're 96*8b6cd535SAndroid Build Coastguard Worker // using FileBackedVector to store buckets, add some static_asserts to 97*8b6cd535SAndroid Build Coastguard Worker // ensure numbers here are compatible with FileBackedVector. 98*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kMaxNumBuckets = 1 << 23; 99*8b6cd535SAndroid Build Coastguard Worker 100*8b6cd535SAndroid Build Coastguard Worker explicit Bucket(int64_t key_lower, int64_t key_upper, 101*8b6cd535SAndroid Build Coastguard Worker PostingListIdentifier posting_list_identifier = 102*8b6cd535SAndroid Build Coastguard Worker PostingListIdentifier::kInvalid, 103*8b6cd535SAndroid Build Coastguard Worker int32_t num_data = 0) key_lower_(key_lower)104*8b6cd535SAndroid Build Coastguard Worker : key_lower_(key_lower), 105*8b6cd535SAndroid Build Coastguard Worker key_upper_(key_upper), 106*8b6cd535SAndroid Build Coastguard Worker posting_list_identifier_(posting_list_identifier), 107*8b6cd535SAndroid Build Coastguard Worker num_data_(num_data) {} 108*8b6cd535SAndroid Build Coastguard Worker 109*8b6cd535SAndroid Build Coastguard Worker bool operator<(const Bucket& other) const { 110*8b6cd535SAndroid Build Coastguard Worker return key_lower_ < other.key_lower_; 111*8b6cd535SAndroid Build Coastguard Worker } 112*8b6cd535SAndroid Build Coastguard Worker 113*8b6cd535SAndroid Build Coastguard Worker // For FileBackedVector 114*8b6cd535SAndroid Build Coastguard Worker bool operator==(const Bucket& other) const { 115*8b6cd535SAndroid Build Coastguard Worker return key_lower_ == other.key_lower_ && key_upper_ == other.key_upper_ && 116*8b6cd535SAndroid Build Coastguard Worker posting_list_identifier_ == other.posting_list_identifier_; 117*8b6cd535SAndroid Build Coastguard Worker } 118*8b6cd535SAndroid Build Coastguard Worker key_lower()119*8b6cd535SAndroid Build Coastguard Worker int64_t key_lower() const { return key_lower_; } 120*8b6cd535SAndroid Build Coastguard Worker key_upper()121*8b6cd535SAndroid Build Coastguard Worker int64_t key_upper() const { return key_upper_; } 122*8b6cd535SAndroid Build Coastguard Worker set_key_lower(int64_t key_lower)123*8b6cd535SAndroid Build Coastguard Worker void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; } 124*8b6cd535SAndroid Build Coastguard Worker set_key_upper(int64_t key_upper)125*8b6cd535SAndroid Build Coastguard Worker void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; } 126*8b6cd535SAndroid Build Coastguard Worker posting_list_identifier()127*8b6cd535SAndroid Build Coastguard Worker PostingListIdentifier posting_list_identifier() const { 128*8b6cd535SAndroid Build Coastguard Worker return posting_list_identifier_; 129*8b6cd535SAndroid Build Coastguard Worker } set_posting_list_identifier(PostingListIdentifier posting_list_identifier)130*8b6cd535SAndroid Build Coastguard Worker void set_posting_list_identifier( 131*8b6cd535SAndroid Build Coastguard Worker PostingListIdentifier posting_list_identifier) { 132*8b6cd535SAndroid Build Coastguard Worker posting_list_identifier_ = posting_list_identifier; 133*8b6cd535SAndroid Build Coastguard Worker } 134*8b6cd535SAndroid Build Coastguard Worker num_data()135*8b6cd535SAndroid Build Coastguard Worker int32_t num_data() const { return num_data_; } set_num_data(int32_t num_data)136*8b6cd535SAndroid Build Coastguard Worker void set_num_data(int32_t num_data) { num_data_ = num_data; } 137*8b6cd535SAndroid Build Coastguard Worker 138*8b6cd535SAndroid Build Coastguard Worker private: 139*8b6cd535SAndroid Build Coastguard Worker int64_t key_lower_; 140*8b6cd535SAndroid Build Coastguard Worker int64_t key_upper_; 141*8b6cd535SAndroid Build Coastguard Worker PostingListIdentifier posting_list_identifier_; 142*8b6cd535SAndroid Build Coastguard Worker int32_t num_data_; 143*8b6cd535SAndroid Build Coastguard Worker } __attribute__((packed)); 144*8b6cd535SAndroid Build Coastguard Worker static_assert(sizeof(Bucket) == 24, ""); 145*8b6cd535SAndroid Build Coastguard Worker static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize, 146*8b6cd535SAndroid Build Coastguard Worker "Bucket type size is inconsistent with FileBackedVector " 147*8b6cd535SAndroid Build Coastguard Worker "element type size"); 148*8b6cd535SAndroid Build Coastguard Worker static_assert(Bucket::kMaxNumBuckets <= 149*8b6cd535SAndroid Build Coastguard Worker (FileBackedVector<Bucket>::kMaxFileSize - 150*8b6cd535SAndroid Build Coastguard Worker FileBackedVector<Bucket>::Header::kHeaderSize) / 151*8b6cd535SAndroid Build Coastguard Worker FileBackedVector<Bucket>::kElementTypeSize, 152*8b6cd535SAndroid Build Coastguard Worker "Max # of buckets cannot fit into FileBackedVector"); 153*8b6cd535SAndroid Build Coastguard Worker 154*8b6cd535SAndroid Build Coastguard Worker struct Options { 155*8b6cd535SAndroid Build Coastguard Worker // - According to the benchmark result, the more # of buckets, the higher 156*8b6cd535SAndroid Build Coastguard Worker // latency for range query. Therefore, this number cannot be too small to 157*8b6cd535SAndroid Build Coastguard Worker // avoid splitting bucket too aggressively. 158*8b6cd535SAndroid Build Coastguard Worker // - We use `num_data_threshold_for_bucket_split / 2 + 5` as the cutoff 159*8b6cd535SAndroid Build Coastguard Worker // threshold after splitting. This number cannot be too small (e.g. 10) 160*8b6cd535SAndroid Build Coastguard Worker // because in this case we will have similar # of data in a single bucket 161*8b6cd535SAndroid Build Coastguard Worker // before and after splitting, which contradicts the purpose of splitting. 162*8b6cd535SAndroid Build Coastguard Worker // - For convenience, let's set 64 as the minimum value. 163*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kMinNumDataThresholdForBucketSplit = 64; 164*8b6cd535SAndroid Build Coastguard Worker OptionsOptions165*8b6cd535SAndroid Build Coastguard Worker explicit Options(int32_t num_data_threshold_for_bucket_split_in, 166*8b6cd535SAndroid Build Coastguard Worker bool pre_mapping_fbv_in) 167*8b6cd535SAndroid Build Coastguard Worker : num_data_threshold_for_bucket_split( 168*8b6cd535SAndroid Build Coastguard Worker num_data_threshold_for_bucket_split_in), 169*8b6cd535SAndroid Build Coastguard Worker pre_mapping_fbv(pre_mapping_fbv_in) {} 170*8b6cd535SAndroid Build Coastguard Worker OptionsOptions171*8b6cd535SAndroid Build Coastguard Worker explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in, 172*8b6cd535SAndroid Build Coastguard Worker std::vector<Bucket> custom_init_unsorted_buckets_in, 173*8b6cd535SAndroid Build Coastguard Worker int32_t num_data_threshold_for_bucket_split_in, 174*8b6cd535SAndroid Build Coastguard Worker bool pre_mapping_fbv_in) 175*8b6cd535SAndroid Build Coastguard Worker : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)), 176*8b6cd535SAndroid Build Coastguard Worker custom_init_unsorted_buckets( 177*8b6cd535SAndroid Build Coastguard Worker std::move(custom_init_unsorted_buckets_in)), 178*8b6cd535SAndroid Build Coastguard Worker num_data_threshold_for_bucket_split( 179*8b6cd535SAndroid Build Coastguard Worker num_data_threshold_for_bucket_split_in), 180*8b6cd535SAndroid Build Coastguard Worker pre_mapping_fbv(pre_mapping_fbv_in) {} 181*8b6cd535SAndroid Build Coastguard Worker 182*8b6cd535SAndroid Build Coastguard Worker bool IsValid() const; 183*8b6cd535SAndroid Build Coastguard Worker HasCustomInitBucketsOptions184*8b6cd535SAndroid Build Coastguard Worker bool HasCustomInitBuckets() const { 185*8b6cd535SAndroid Build Coastguard Worker return !custom_init_sorted_buckets.empty() || 186*8b6cd535SAndroid Build Coastguard Worker !custom_init_unsorted_buckets.empty(); 187*8b6cd535SAndroid Build Coastguard Worker } 188*8b6cd535SAndroid Build Coastguard Worker 189*8b6cd535SAndroid Build Coastguard Worker // Custom buckets when initializing new files. If both are empty, then the 190*8b6cd535SAndroid Build Coastguard Worker // initial bucket is (INT64_MIN, INT64_MAX). Usually we only set them in the 191*8b6cd535SAndroid Build Coastguard Worker // unit test. Note that all buckets in custom_init_sorted_buckets and 192*8b6cd535SAndroid Build Coastguard Worker // custom_init_unsorted_buckets should be disjoint and the range union 193*8b6cd535SAndroid Build Coastguard Worker // should be [INT64_MIN, INT64_MAX]. 194*8b6cd535SAndroid Build Coastguard Worker std::vector<Bucket> custom_init_sorted_buckets; 195*8b6cd535SAndroid Build Coastguard Worker std::vector<Bucket> custom_init_unsorted_buckets; 196*8b6cd535SAndroid Build Coastguard Worker 197*8b6cd535SAndroid Build Coastguard Worker // Threshold for invoking bucket splitting. If # of data in a bucket exceeds 198*8b6cd535SAndroid Build Coastguard Worker // this number after adding new data, then it will invoke bucket splitting 199*8b6cd535SAndroid Build Coastguard Worker // logic. 200*8b6cd535SAndroid Build Coastguard Worker // 201*8b6cd535SAndroid Build Coastguard Worker // Note: num_data_threshold_for_bucket_split should be >= 202*8b6cd535SAndroid Build Coastguard Worker // kMinNumDataThresholdForBucketSplit. 203*8b6cd535SAndroid Build Coastguard Worker int32_t num_data_threshold_for_bucket_split; 204*8b6cd535SAndroid Build Coastguard Worker 205*8b6cd535SAndroid Build Coastguard Worker // Flag indicating whether memory map max possible file size for underlying 206*8b6cd535SAndroid Build Coastguard Worker // FileBackedVector before growing the actual file size. 207*8b6cd535SAndroid Build Coastguard Worker bool pre_mapping_fbv; 208*8b6cd535SAndroid Build Coastguard Worker }; 209*8b6cd535SAndroid Build Coastguard Worker 210*8b6cd535SAndroid Build Coastguard Worker // Metadata file layout: <Crcs><Info> 211*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kCrcsMetadataFileOffset = 0; 212*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kInfoMetadataFileOffset = 213*8b6cd535SAndroid Build Coastguard Worker static_cast<int32_t>(sizeof(Crcs)); 214*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info); 215*8b6cd535SAndroid Build Coastguard Worker static_assert(kMetadataFileSize == 20, ""); 216*8b6cd535SAndroid Build Coastguard Worker 217*8b6cd535SAndroid Build Coastguard Worker static constexpr WorkingPathType kWorkingPathType = 218*8b6cd535SAndroid Build Coastguard Worker WorkingPathType::kDirectory; 219*8b6cd535SAndroid Build Coastguard Worker static constexpr std::string_view kFilePrefix = "integer_index_storage"; 220*8b6cd535SAndroid Build Coastguard Worker 221*8b6cd535SAndroid Build Coastguard Worker // Default # of data threshold for bucket splitting during indexing (AddKeys). 222*8b6cd535SAndroid Build Coastguard Worker // When # of data in a bucket reaches this number, we will try to split data 223*8b6cd535SAndroid Build Coastguard Worker // into multiple buckets according to their keys. 224*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 65536; 225*8b6cd535SAndroid Build Coastguard Worker 226*8b6cd535SAndroid Build Coastguard Worker // # of data threshold for bucket merging during optimization (TransferIndex) 227*8b6cd535SAndroid Build Coastguard Worker // = kNumDataThresholdRatioForBucketMerge * 228*8b6cd535SAndroid Build Coastguard Worker // options.num_data_threshold_for_bucket_split 229*8b6cd535SAndroid Build Coastguard Worker // 230*8b6cd535SAndroid Build Coastguard Worker // If total # data of adjacent buckets exceed this threshold, then flush the 231*8b6cd535SAndroid Build Coastguard Worker // accumulated data. Otherwise merge buckets and their data. 232*8b6cd535SAndroid Build Coastguard Worker static constexpr double kNumDataThresholdRatioForBucketMerge = 0.7; 233*8b6cd535SAndroid Build Coastguard Worker 234*8b6cd535SAndroid Build Coastguard Worker // Length threshold to sort and merge unsorted buckets into sorted buckets. If 235*8b6cd535SAndroid Build Coastguard Worker // the length of unsorted_buckets exceed the threshold, then call 236*8b6cd535SAndroid Build Coastguard Worker // SortBuckets(). 237*8b6cd535SAndroid Build Coastguard Worker // TODO(b/259743562): decide if removing unsorted buckets given that we 238*8b6cd535SAndroid Build Coastguard Worker // changed bucket splitting threshold and # of buckets are small now. 239*8b6cd535SAndroid Build Coastguard Worker static constexpr int32_t kUnsortedBucketsLengthThreshold = 5; 240*8b6cd535SAndroid Build Coastguard Worker 241*8b6cd535SAndroid Build Coastguard Worker // Creates a new IntegerIndexStorage instance to index integers (for a single 242*8b6cd535SAndroid Build Coastguard Worker // property). If any of the underlying file is missing, then delete the whole 243*8b6cd535SAndroid Build Coastguard Worker // working_path and (re)initialize with new ones. Otherwise initialize and 244*8b6cd535SAndroid Build Coastguard Worker // create the instance by existing files. 245*8b6cd535SAndroid Build Coastguard Worker // 246*8b6cd535SAndroid Build Coastguard Worker // filesystem: Object to make system level calls 247*8b6cd535SAndroid Build Coastguard Worker // working_path: Specifies the working path for PersistentStorage. 248*8b6cd535SAndroid Build Coastguard Worker // IntegerIndexStorage uses working path as working directory 249*8b6cd535SAndroid Build Coastguard Worker // and all related files will be stored under this directory. It 250*8b6cd535SAndroid Build Coastguard Worker // takes full ownership and of working_path_, including 251*8b6cd535SAndroid Build Coastguard Worker // creation/deletion. It is the caller's responsibility to 252*8b6cd535SAndroid Build Coastguard Worker // specify correct working path and avoid mixing different 253*8b6cd535SAndroid Build Coastguard Worker // persistent storages together under the same path. Also the 254*8b6cd535SAndroid Build Coastguard Worker // caller has the ownership for the parent directory of 255*8b6cd535SAndroid Build Coastguard Worker // working_path_, and it is responsible for parent directory 256*8b6cd535SAndroid Build Coastguard Worker // creation/deletion. See PersistentStorage for more details 257*8b6cd535SAndroid Build Coastguard Worker // about the concept of working_path. 258*8b6cd535SAndroid Build Coastguard Worker // options: Options instance. 259*8b6cd535SAndroid Build Coastguard Worker // posting_list_serializer: a PostingListIntegerIndexSerializer instance to 260*8b6cd535SAndroid Build Coastguard Worker // serialize/deserialize integer index data to/from 261*8b6cd535SAndroid Build Coastguard Worker // posting lists. 262*8b6cd535SAndroid Build Coastguard Worker // 263*8b6cd535SAndroid Build Coastguard Worker // Returns: 264*8b6cd535SAndroid Build Coastguard Worker // - INVALID_ARGUMENT_ERROR if any value in options is invalid. 265*8b6cd535SAndroid Build Coastguard Worker // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored 266*8b6cd535SAndroid Build Coastguard Worker // checksum. 267*8b6cd535SAndroid Build Coastguard Worker // - INTERNAL_ERROR on I/O errors. 268*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector/FlashIndexStorage errors. 269*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> 270*8b6cd535SAndroid Build Coastguard Worker Create(const Filesystem& filesystem, std::string working_path, 271*8b6cd535SAndroid Build Coastguard Worker Options options, 272*8b6cd535SAndroid Build Coastguard Worker PostingListIntegerIndexSerializer* posting_list_serializer); 273*8b6cd535SAndroid Build Coastguard Worker 274*8b6cd535SAndroid Build Coastguard Worker // Deletes IntegerIndexStorage under working_path. 275*8b6cd535SAndroid Build Coastguard Worker // 276*8b6cd535SAndroid Build Coastguard Worker // Returns: 277*8b6cd535SAndroid Build Coastguard Worker // - OK on success 278*8b6cd535SAndroid Build Coastguard Worker // - INTERNAL_ERROR on I/O error Discard(const Filesystem & filesystem,const std::string & working_path)279*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::Status Discard(const Filesystem& filesystem, 280*8b6cd535SAndroid Build Coastguard Worker const std::string& working_path) { 281*8b6cd535SAndroid Build Coastguard Worker return PersistentStorage::Discard(filesystem, working_path, 282*8b6cd535SAndroid Build Coastguard Worker kWorkingPathType); 283*8b6cd535SAndroid Build Coastguard Worker } 284*8b6cd535SAndroid Build Coastguard Worker 285*8b6cd535SAndroid Build Coastguard Worker // Delete copy and move constructor/assignment operator. 286*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage(const IntegerIndexStorage&) = delete; 287*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage& operator=(const IntegerIndexStorage&) = delete; 288*8b6cd535SAndroid Build Coastguard Worker 289*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage(IntegerIndexStorage&&) = delete; 290*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage& operator=(IntegerIndexStorage&&) = delete; 291*8b6cd535SAndroid Build Coastguard Worker 292*8b6cd535SAndroid Build Coastguard Worker ~IntegerIndexStorage() override; 293*8b6cd535SAndroid Build Coastguard Worker 294*8b6cd535SAndroid Build Coastguard Worker // Batch adds new keys (of the same DocumentId and SectionId) into the integer 295*8b6cd535SAndroid Build Coastguard Worker // index storage. 296*8b6cd535SAndroid Build Coastguard Worker // Note that since we separate different property names into different integer 297*8b6cd535SAndroid Build Coastguard Worker // index storages, it is impossible to have keys in a single document across 298*8b6cd535SAndroid Build Coastguard Worker // multiple sections to add into the same integer index storage. 299*8b6cd535SAndroid Build Coastguard Worker // 300*8b6cd535SAndroid Build Coastguard Worker // Returns: 301*8b6cd535SAndroid Build Coastguard Worker // - OK on success 302*8b6cd535SAndroid Build Coastguard Worker // - RESOURCE_EXHAUSTED_ERROR if # of integers in this storage exceed 303*8b6cd535SAndroid Build Coastguard Worker // INT_MAX after adding new_keys 304*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector or PostingList errors 305*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status AddKeys(DocumentId document_id, 306*8b6cd535SAndroid Build Coastguard Worker SectionId section_id, 307*8b6cd535SAndroid Build Coastguard Worker std::vector<int64_t>&& new_keys); 308*8b6cd535SAndroid Build Coastguard Worker 309*8b6cd535SAndroid Build Coastguard Worker // Returns a DocHitInfoIteratorNumeric<int64_t> (in DocHitInfoIterator 310*8b6cd535SAndroid Build Coastguard Worker // interface type format) for iterating through all docs which have the 311*8b6cd535SAndroid Build Coastguard Worker // specified (integer) property contents in range [query_key_lower, 312*8b6cd535SAndroid Build Coastguard Worker // query_key_upper]. 313*8b6cd535SAndroid Build Coastguard Worker // When iterating through all relevant doc hits, it: 314*8b6cd535SAndroid Build Coastguard Worker // - Merges multiple SectionIds of doc hits with same DocumentId into a single 315*8b6cd535SAndroid Build Coastguard Worker // SectionIdMask and constructs DocHitInfo. 316*8b6cd535SAndroid Build Coastguard Worker // - Returns DocHitInfo in descending DocumentId order. 317*8b6cd535SAndroid Build Coastguard Worker // 318*8b6cd535SAndroid Build Coastguard Worker // Returns: 319*8b6cd535SAndroid Build Coastguard Worker // - On success: a DocHitInfoIterator(Numeric) 320*8b6cd535SAndroid Build Coastguard Worker // - INVALID_ARGUMENT_ERROR if query_key_lower > query_key_upper 321*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector or PostingList errors 322*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( 323*8b6cd535SAndroid Build Coastguard Worker int64_t query_key_lower, int64_t query_key_upper) const; 324*8b6cd535SAndroid Build Coastguard Worker 325*8b6cd535SAndroid Build Coastguard Worker // Transfers integer index data from the current storage to new_storage and 326*8b6cd535SAndroid Build Coastguard Worker // optimizes buckets (for new_storage only), i.e. merging adjacent buckets if 327*8b6cd535SAndroid Build Coastguard Worker // total # of data among them are less than or equal to 328*8b6cd535SAndroid Build Coastguard Worker // kNumDataThresholdForBucketMerge. 329*8b6cd535SAndroid Build Coastguard Worker // 330*8b6cd535SAndroid Build Coastguard Worker // REQUIRES: new_storage should be a newly created storage instance, i.e. not 331*8b6cd535SAndroid Build Coastguard Worker // contain any data. Otherwise, existing data and posting lists won't be 332*8b6cd535SAndroid Build Coastguard Worker // freed and space will be wasted. 333*8b6cd535SAndroid Build Coastguard Worker // 334*8b6cd535SAndroid Build Coastguard Worker // Returns: 335*8b6cd535SAndroid Build Coastguard Worker // - OK on success 336*8b6cd535SAndroid Build Coastguard Worker // - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after 337*8b6cd535SAndroid Build Coastguard Worker // merging 338*8b6cd535SAndroid Build Coastguard Worker // - INTERNAL_ERROR on IO error 339*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status TransferIndex( 340*8b6cd535SAndroid Build Coastguard Worker const std::vector<DocumentId>& document_id_old_to_new, 341*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage* new_storage) const; 342*8b6cd535SAndroid Build Coastguard Worker num_data()343*8b6cd535SAndroid Build Coastguard Worker int32_t num_data() const { return info().num_data; } 344*8b6cd535SAndroid Build Coastguard Worker 345*8b6cd535SAndroid Build Coastguard Worker private: 346*8b6cd535SAndroid Build Coastguard Worker static constexpr int8_t kNumDataAfterSplitAdjustment = 5; 347*8b6cd535SAndroid Build Coastguard Worker IntegerIndexStorage(const Filesystem & filesystem,std::string && working_path,Options && options,PostingListIntegerIndexSerializer * posting_list_serializer,std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,std::unique_ptr<FlashIndexStorage> flash_index_storage)348*8b6cd535SAndroid Build Coastguard Worker explicit IntegerIndexStorage( 349*8b6cd535SAndroid Build Coastguard Worker const Filesystem& filesystem, std::string&& working_path, 350*8b6cd535SAndroid Build Coastguard Worker Options&& options, 351*8b6cd535SAndroid Build Coastguard Worker PostingListIntegerIndexSerializer* posting_list_serializer, 352*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, 353*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, 354*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, 355*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FlashIndexStorage> flash_index_storage) 356*8b6cd535SAndroid Build Coastguard Worker : PersistentStorage(filesystem, std::move(working_path), 357*8b6cd535SAndroid Build Coastguard Worker kWorkingPathType), 358*8b6cd535SAndroid Build Coastguard Worker options_(std::move(options)), 359*8b6cd535SAndroid Build Coastguard Worker posting_list_serializer_(posting_list_serializer), 360*8b6cd535SAndroid Build Coastguard Worker metadata_mmapped_file_(std::move(metadata_mmapped_file)), 361*8b6cd535SAndroid Build Coastguard Worker sorted_buckets_(std::move(sorted_buckets)), 362*8b6cd535SAndroid Build Coastguard Worker unsorted_buckets_(std::move(unsorted_buckets)), 363*8b6cd535SAndroid Build Coastguard Worker flash_index_storage_(std::move(flash_index_storage)), 364*8b6cd535SAndroid Build Coastguard Worker is_info_dirty_(false), 365*8b6cd535SAndroid Build Coastguard Worker is_storage_dirty_(false) {} 366*8b6cd535SAndroid Build Coastguard Worker 367*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> 368*8b6cd535SAndroid Build Coastguard Worker InitializeNewFiles( 369*8b6cd535SAndroid Build Coastguard Worker const Filesystem& filesystem, std::string&& working_path, 370*8b6cd535SAndroid Build Coastguard Worker Options&& options, 371*8b6cd535SAndroid Build Coastguard Worker PostingListIntegerIndexSerializer* posting_list_serializer); 372*8b6cd535SAndroid Build Coastguard Worker 373*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> 374*8b6cd535SAndroid Build Coastguard Worker InitializeExistingFiles( 375*8b6cd535SAndroid Build Coastguard Worker const Filesystem& filesystem, std::string&& working_path, 376*8b6cd535SAndroid Build Coastguard Worker Options&& options, 377*8b6cd535SAndroid Build Coastguard Worker PostingListIntegerIndexSerializer* posting_list_serializer); 378*8b6cd535SAndroid Build Coastguard Worker 379*8b6cd535SAndroid Build Coastguard Worker // Flushes data into posting list(s), creates a new bucket with range 380*8b6cd535SAndroid Build Coastguard Worker // [key_lower, key_upper], and appends it into sorted buckets for storage. 381*8b6cd535SAndroid Build Coastguard Worker // It is a helper function for TransferIndex. 382*8b6cd535SAndroid Build Coastguard Worker // 383*8b6cd535SAndroid Build Coastguard Worker // Returns: 384*8b6cd535SAndroid Build Coastguard Worker // - OK on success 385*8b6cd535SAndroid Build Coastguard Worker // - INTERNAL_ERROR if fails to write existing data into posting list(s) 386*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector or PostingList errors 387*8b6cd535SAndroid Build Coastguard Worker static libtextclassifier3::Status FlushDataIntoNewSortedBucket( 388*8b6cd535SAndroid Build Coastguard Worker int64_t key_lower, int64_t key_upper, 389*8b6cd535SAndroid Build Coastguard Worker std::vector<IntegerIndexData>&& data, IntegerIndexStorage* storage); 390*8b6cd535SAndroid Build Coastguard Worker 391*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status PersistStoragesToDisk() override; 392*8b6cd535SAndroid Build Coastguard Worker 393*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status PersistMetadataToDisk() override; 394*8b6cd535SAndroid Build Coastguard Worker WriteMetadata()395*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status WriteMetadata() override { 396*8b6cd535SAndroid Build Coastguard Worker // IntegerIndexStorage::Header is mmapped. Therefore, writes occur when the 397*8b6cd535SAndroid Build Coastguard Worker // metadata is modified. So just return OK. 398*8b6cd535SAndroid Build Coastguard Worker return libtextclassifier3::Status::OK; 399*8b6cd535SAndroid Build Coastguard Worker } 400*8b6cd535SAndroid Build Coastguard Worker 401*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<Crc32> UpdateStoragesChecksum() override; 402*8b6cd535SAndroid Build Coastguard Worker 403*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<Crc32> GetInfoChecksum() const override; 404*8b6cd535SAndroid Build Coastguard Worker 405*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<Crc32> GetStoragesChecksum() const override; 406*8b6cd535SAndroid Build Coastguard Worker 407*8b6cd535SAndroid Build Coastguard Worker // Helper function to add keys in range [it_start, it_end) into the given 408*8b6cd535SAndroid Build Coastguard Worker // bucket. It handles the bucket and its corresponding posting list(s) to make 409*8b6cd535SAndroid Build Coastguard Worker // searching and indexing efficient. 410*8b6cd535SAndroid Build Coastguard Worker // 411*8b6cd535SAndroid Build Coastguard Worker // When the (single) posting list of the bucket is full: 412*8b6cd535SAndroid Build Coastguard Worker // - If the size of posting list hasn't reached the max size, then just simply 413*8b6cd535SAndroid Build Coastguard Worker // add a new key into it, and PostingListAccessor mechanism will 414*8b6cd535SAndroid Build Coastguard Worker // automatically double the size of the posting list. 415*8b6cd535SAndroid Build Coastguard Worker // - Else: 416*8b6cd535SAndroid Build Coastguard Worker // - If the bucket is splittable (i.e. key_lower < key_upper), then split it 417*8b6cd535SAndroid Build Coastguard Worker // into several new buckets with new ranges, and split the data (according 418*8b6cd535SAndroid Build Coastguard Worker // to their keys and the range of new buckets) of the original posting 419*8b6cd535SAndroid Build Coastguard Worker // list into several new posting lists. 420*8b6cd535SAndroid Build Coastguard Worker // - Otherwise, just simply add a new key into it, and PostingListAccessor 421*8b6cd535SAndroid Build Coastguard Worker // mechanism will automatically create a new max size posting list and 422*8b6cd535SAndroid Build Coastguard Worker // chain them. 423*8b6cd535SAndroid Build Coastguard Worker // 424*8b6cd535SAndroid Build Coastguard Worker // Returns: 425*8b6cd535SAndroid Build Coastguard Worker // - On success: a vector of new Buckets (to add into the unsorted bucket 426*8b6cd535SAndroid Build Coastguard Worker // array later) 427*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector or PostingList errors 428*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::StatusOr<std::vector<Bucket>> 429*8b6cd535SAndroid Build Coastguard Worker AddKeysIntoBucketAndSplitIfNecessary( 430*8b6cd535SAndroid Build Coastguard Worker DocumentId document_id, SectionId section_id, 431*8b6cd535SAndroid Build Coastguard Worker const std::vector<int64_t>::const_iterator& it_start, 432*8b6cd535SAndroid Build Coastguard Worker const std::vector<int64_t>::const_iterator& it_end, 433*8b6cd535SAndroid Build Coastguard Worker FileBackedVector<Bucket>::MutableView& mutable_bucket); 434*8b6cd535SAndroid Build Coastguard Worker 435*8b6cd535SAndroid Build Coastguard Worker // Merges all unsorted buckets into sorted buckets and clears unsorted 436*8b6cd535SAndroid Build Coastguard Worker // buckets. 437*8b6cd535SAndroid Build Coastguard Worker // 438*8b6cd535SAndroid Build Coastguard Worker // Returns: 439*8b6cd535SAndroid Build Coastguard Worker // - OK on success 440*8b6cd535SAndroid Build Coastguard Worker // - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after 441*8b6cd535SAndroid Build Coastguard Worker // merging 442*8b6cd535SAndroid Build Coastguard Worker // - Any FileBackedVector errors 443*8b6cd535SAndroid Build Coastguard Worker libtextclassifier3::Status SortBuckets(); 444*8b6cd535SAndroid Build Coastguard Worker crcs()445*8b6cd535SAndroid Build Coastguard Worker Crcs& crcs() override { 446*8b6cd535SAndroid Build Coastguard Worker return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() + 447*8b6cd535SAndroid Build Coastguard Worker kCrcsMetadataFileOffset); 448*8b6cd535SAndroid Build Coastguard Worker } 449*8b6cd535SAndroid Build Coastguard Worker crcs()450*8b6cd535SAndroid Build Coastguard Worker const Crcs& crcs() const override { 451*8b6cd535SAndroid Build Coastguard Worker return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() + 452*8b6cd535SAndroid Build Coastguard Worker kCrcsMetadataFileOffset); 453*8b6cd535SAndroid Build Coastguard Worker } 454*8b6cd535SAndroid Build Coastguard Worker info()455*8b6cd535SAndroid Build Coastguard Worker Info& info() { 456*8b6cd535SAndroid Build Coastguard Worker return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() + 457*8b6cd535SAndroid Build Coastguard Worker kInfoMetadataFileOffset); 458*8b6cd535SAndroid Build Coastguard Worker } 459*8b6cd535SAndroid Build Coastguard Worker info()460*8b6cd535SAndroid Build Coastguard Worker const Info& info() const { 461*8b6cd535SAndroid Build Coastguard Worker return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() + 462*8b6cd535SAndroid Build Coastguard Worker kInfoMetadataFileOffset); 463*8b6cd535SAndroid Build Coastguard Worker } 464*8b6cd535SAndroid Build Coastguard Worker SetInfoDirty()465*8b6cd535SAndroid Build Coastguard Worker void SetInfoDirty() { is_info_dirty_ = true; } 466*8b6cd535SAndroid Build Coastguard Worker 467*8b6cd535SAndroid Build Coastguard Worker // When the storage is dirty, then the checksum in the info is invalid and 468*8b6cd535SAndroid Build Coastguard Worker // must be recalculated. Therefore, also mark the info as dirty. SetDirty()469*8b6cd535SAndroid Build Coastguard Worker void SetDirty() { 470*8b6cd535SAndroid Build Coastguard Worker SetInfoDirty(); 471*8b6cd535SAndroid Build Coastguard Worker is_storage_dirty_ = true; 472*8b6cd535SAndroid Build Coastguard Worker } 473*8b6cd535SAndroid Build Coastguard Worker is_info_dirty()474*8b6cd535SAndroid Build Coastguard Worker bool is_info_dirty() const { return is_info_dirty_; } is_storage_dirty()475*8b6cd535SAndroid Build Coastguard Worker bool is_storage_dirty() const { return is_storage_dirty_; } 476*8b6cd535SAndroid Build Coastguard Worker 477*8b6cd535SAndroid Build Coastguard Worker Options options_; 478*8b6cd535SAndroid Build Coastguard Worker 479*8b6cd535SAndroid Build Coastguard Worker PostingListIntegerIndexSerializer* posting_list_serializer_; // Does not own. 480*8b6cd535SAndroid Build Coastguard Worker 481*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_; 482*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_; 483*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets_; 484*8b6cd535SAndroid Build Coastguard Worker std::unique_ptr<FlashIndexStorage> flash_index_storage_; 485*8b6cd535SAndroid Build Coastguard Worker 486*8b6cd535SAndroid Build Coastguard Worker bool is_info_dirty_; 487*8b6cd535SAndroid Build Coastguard Worker bool is_storage_dirty_; 488*8b6cd535SAndroid Build Coastguard Worker }; 489*8b6cd535SAndroid Build Coastguard Worker 490*8b6cd535SAndroid Build Coastguard Worker } // namespace lib 491*8b6cd535SAndroid Build Coastguard Worker } // namespace icing 492*8b6cd535SAndroid Build Coastguard Worker 493*8b6cd535SAndroid Build Coastguard Worker #endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_ 494