// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
#define ICING_INDEX_MAIN_MAIN_INDEX_H_

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
#include "icing/file/posting_list/flash-index-storage.h"
#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/posting-list-hit-accessor.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/store/document-id.h"
#include "icing/store/suggestion-result-checker.h"
#include "icing/util/crc32.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

class MainIndex {
 public:
  // RETURNS:
  //  - valid instance of MainIndex, on success.
  //  - INTERNAL error if unable to create the lexicon or flash storage.
  static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
      const std::string& index_directory, const Filesystem* filesystem,
      const IcingFilesystem* icing_filesystem);

  // Reads the magic from the existing flash index storage file header. We need
  // this during the Icing initialization phase to determine the version.
  //
  // RETURNS:
  //  - On success, a valid magic.
  //  - NOT_FOUND if the flash index doesn't exist.
  //  - INTERNAL on I/O error.
  static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
      const Filesystem* filesystem, const std::string& index_directory);

  // Get a PostingListHitAccessor that holds the posting list chain for 'term'.
  //
  // RETURNS:
  //  - On success, a valid PostingListHitAccessor.
  //  - NOT_FOUND if 'term' is not present in the main index.
  libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
  GetAccessorForExactTerm(const std::string& term);

  // Get a PostingListHitAccessor for 'prefix'.
  //
  // RETURNS:
  //  - On success, a result containing a valid PostingListHitAccessor.
  //  - NOT_FOUND if neither 'prefix' nor any term for which 'prefix' is a
  //    prefix is present in the main index.
  struct GetPrefixAccessorResult {
    // A PostingListHitAccessor that holds the posting list chain for the term
    // that best represents 'prefix' in the main index.
    std::unique_ptr<PostingListHitAccessor> accessor;

    // True if the returned posting list chain is for 'prefix'; false if the
    // returned posting list chain is for a term for which 'prefix' is a
    // prefix.
    bool exact;

    explicit GetPrefixAccessorResult(
        std::unique_ptr<PostingListHitAccessor> accessor_in, bool exact_in)
        : accessor(std::move(accessor_in)), exact(exact_in) {}
  };
  libtextclassifier3::StatusOr<GetPrefixAccessorResult>
  GetAccessorForPrefixTerm(const std::string& prefix);
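
  // Illustrative usage sketch (not part of the API): retrieving hits for a
  // prefix query. It assumes the returned PostingListHitAccessor exposes a
  // batch-retrieval method such as GetNextHitsBatch() (declared in
  // posting-list-hit-accessor.h); adjust to the actual accessor interface if
  // it differs.
  //
  //   ICING_ASSIGN_OR_RETURN(MainIndex::GetPrefixAccessorResult result,
  //                          main_index->GetAccessorForPrefixTerm("foo"));
  //   ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
  //                          result.accessor->GetNextHitsBatch());
  //   // If result.exact is false, the chain belongs to a longer term (e.g.
  //   // "foot"), so callers typically only consider hits from prefix
  //   // sections when answering the query for "foo".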

  // Finds terms with the given prefix that are allowed by the given result
  // checker. The input prefix must be normalized, otherwise inaccurate
  // results may be returned. If scoring_match_type is EXACT, only exact hits
  // will be counted; if it is PREFIX, both prefix and exact hits will be
  // counted. Results are not ranked; they are returned in lexicographical
  // order.
  //
  // Returns:
  //   A list of TermMetadata on success
  //   INTERNAL_ERROR if failed to access term data.
  libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
      const std::string& prefix, TermMatchType::Code scoring_match_type,
      SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
      const SuggestionResultChecker* suggestion_result_checker);

  struct LexiconMergeOutputs {
    // Maps from the main_lexicon tvi for a new branching point to the
    // main_lexicon tvi for the posting list whose hits must be backfilled.
    std::unordered_map<uint32_t, uint32_t> backfill_map;

    // Maps from lexicon tvis to main_lexicon tvis.
    std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;

    // Maps from main lexicon tvi to the block index. Tvis with no entry do
    // not have an allocated posting list.
    std::unordered_map<uint32_t, int> main_tvi_to_block_index;

    // Maps from the lexicon tvi to the beginning position in prefix_tvis_buf
    // and the length.
    std::unordered_map<uint32_t, std::pair<int, int>>
        other_tvi_to_prefix_main_tvis;

    // Stores tvis that are mapped to by other_tvi_to_prefix_main_tvis.
    std::vector<uint32_t> prefix_tvis_buf;
  };

  // Merge the lexicon into the main lexicon and populate the data structures
  // necessary to translate lite tvis to main tvis, track backfilling, and
  // expand lite terms to prefix terms.
  //
  // RETURNS:
  //  - OK on success
  //  - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
      const IcingDynamicTrie& other_lexicon) {
    // Backfill branch points need to be added first so that the backfill_map
    // can be correctly populated.
    ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
                           AddBackfillBranchPoints(other_lexicon));
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddTerms(other_lexicon, std::move(outputs)));
    // Non-backfill branch points need to be added last so that the mapping of
    // newly added terms to prefix terms can be correctly populated (prefix
    // terms might be branch points between two new terms or between a
    // pre-existing term and a new term).
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddBranchPoints(other_lexicon, std::move(outputs)));
    return outputs;
  }
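
  // Illustrative merge-flow sketch (not part of the API): the outputs of
  // MergeLexicon feed directly into AddHits below. 'lite_lexicon',
  // 'term_id_codec', 'hits' and 'last_added_document_id' are assumed to be
  // supplied by the caller (e.g. from the lite index being merged).
  //
  //   ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
  //                          main_index->MergeLexicon(lite_lexicon));
  //   ICING_RETURN_IF_ERROR(main_index->AddHits(
  //       term_id_codec, std::move(outputs.backfill_map), std::move(hits),
  //       last_added_document_id));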

  // Add hits to the main index and backfill from existing posting lists to
  // new backfill branch points.
  //
  // The backfill_map maps from the main_lexicon tvi for a newly added
  // branching point to the main_lexicon tvi for the posting list whose hits
  // must be backfilled. backfill_map should be populated as part of
  // LexiconMergeOutputs in MergeLexicon and be passed to this function
  // unmodified.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if one of the elements in the lite index has a
  //    term_id that exceeds the max TermId, is not valid, or is not less than
  //    pre-existing hits in the main index.
  //  - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
  //  - RESOURCE_EXHAUSTED error if unable to grow the index
  libtextclassifier3::Status AddHits(
      const TermIdCodec& term_id_codec,
      std::unordered_map<uint32_t, uint32_t>&& backfill_map,
      std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);

  libtextclassifier3::Status PersistToDisk() {
    if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
      return libtextclassifier3::Status::OK;
    }
    return absl_ports::InternalError("Unable to sync main index components.");
  }

  // Updates and returns the checksums of the components in the MainIndex.
  Crc32 UpdateChecksum() { return main_lexicon_->UpdateCrc(); }

  // Calculates and returns the checksums of the components in the MainIndex.
  Crc32 GetChecksum() const { return main_lexicon_->GetCrc(); }

  DocumentId last_added_document_id() const {
    return flash_index_storage_->get_last_indexed_docid();
  }

  libtextclassifier3::Status Reset() {
    ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
    main_lexicon_->Clear();
    return libtextclassifier3::Status::OK;
  }

  void Warm() { main_lexicon_->Warm(); }

  // Returns:
  //  - elements size of the lexicon and index, on success
  //  - INTERNAL on IO error
  libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;

  // Takes the provided storage_info, populates the fields related to the main
  // index and returns that storage_info.
  //
  // If an IO error occurs while trying to calculate the value for a field,
  // then that field will be set to -1.
  IndexStorageInfoProto GetStorageInfo(
      IndexStorageInfoProto storage_info) const;

  // Returns debug information for the main index.
  //   verbosity = BASIC, simplest debug information - just the lexicon.
  //   verbosity = DETAILED, more detailed debug information including raw
  //   posting lists.
  std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const;

  // Reduces internal file sizes by reclaiming the space of deleted documents.
  //
  // This method will update the last_added_docid of the index to the largest
  // document id that still appears in the index after compaction.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on IO error. This indicates that the index may be in an
  //   invalid state and should be cleared.
  libtextclassifier3::Status Optimize(
      const std::vector<DocumentId>& document_id_old_to_new);
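
  // Illustrative persistence sketch (not part of the API): one way a caller
  // might flush the index and later verify it. Comparing checksums via
  // Crc32::Get() is an assumption about the interface in util/crc32.h.
  //
  //   ICING_RETURN_IF_ERROR(main_index->PersistToDisk());
  //   Crc32 expected = main_index->UpdateChecksum();
  //   ...
  //   // On a later initialization, a mismatch suggests corruption:
  //   if (main_index->GetChecksum().Get() != expected.Get()) {
  //     // Rebuild or clear the index.
  //   }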

 private:
  explicit MainIndex(const std::string& index_directory,
                     const Filesystem* filesystem,
                     const IcingFilesystem* icing_filesystem);

  libtextclassifier3::Status Init();

  // Helpers for merging the lexicon.
  // Add all 'backfill' branch points. Backfill branch points are prefix
  // branch points that are a prefix of terms that existed in the lexicon
  // prior to the merge.
  //
  // For example, if the main lexicon only contains "foot" and is then merged
  // with a lite lexicon containing only "fool", then a backfill branch point
  // for "foo" will be added to contain prefix hits from both the pre-existing
  // posting list for "foot" and the new posting list for "fool".
  //
  // Populates LexiconMergeOutputs.backfill_map
  //
  // RETURNS:
  //  - OK on success
  //  - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
      const IcingDynamicTrie& other_lexicon);

  // Add all terms from the lexicon.
  //
  // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
  //
  // RETURNS:
  //  - OK on success
  //  - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Add all branch points for terms added from the lexicon.
  // For example, if the main lexicon is empty and is then merged with a
  // lexicon containing only "foot" and "fool", then a branch point for "foo"
  // will be added to contain prefix hits from both "foot" and "fool".
  //
  // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
  // LexiconMergeOutputs.prefix_tvis_buf.
  //
  // RETURNS:
  //  - OK on success
  //  - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Copies all properties from other_tvi in the other lexicon to new_main_tvi
  // in the main lexicon.
  // Returns true on success, false if an IO error is encountered.
  bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
                      const IcingDynamicTrie& other_lexicon,
                      uint32_t other_tvi, uint32_t new_main_tvi);

  // Add all hits between [hit_elements, hit_elements + len) to the main
  // index, updating the entry in the main lexicon at trie_value_index to
  // point to the resulting posting list. Hits are sorted in descending
  // document id order, so they should be added to posting lists in reverse
  // (starting at hit_elements + len - 1) and working backwards. Therefore,
  // hit_elements must be in sorted order.
  //
  // trie_value_index may point to a valid posting list id if there is a
  // pre-existing posting list to append to.
  //
  // If backfill_posting_list_id is valid, then the hits from the posting list
  // identified by backfill_posting_list_id should be added to the new posting
  // list before the hits in hit_elements.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if the posting_list_id stored at trie_value_index is
  //    valid but points out of bounds in the IndexBlock referred to by
  //    id.block_index(), if one of the hits from
  //    [hit_elements, hit_elements + len) is not valid, or if one of the hits
  //    from [hit_elements, hit_elements + len) is not less than the
  //    previously added hits.
  //  - INTERNAL_ERROR if the posting_list_id stored at trie_value_index is
  //    valid but points to an invalid block index, or if unable to mmap the
  //    IndexBlock.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //    posting list.
  libtextclassifier3::Status AddHitsForTerm(
      uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
      const TermIdHitPair* hit_elements, size_t len);
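
  // Worked example of the ordering contract above (illustrative only): if
  // hit_elements holds hits for one term with document ids {5, 3, 2}, they
  // arrive in descending document id order, so AddHitsForTerm consumes them
  // starting from hit_elements[len - 1] (document id 2) and works backwards
  // toward hit_elements[0] (document id 5).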

  // Adds all prefix hits or hits from prefix sections present on the posting
  // list identified by backfill_posting_list_id to hit_accum.
  //
  // RETURNS:
  //  - OK, on success
  //  - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in
  //    the IndexBlock referred to by id.block_index()
  //  - INTERNAL_ERROR if unable to mmap the block identified by
  //    backfill_posting_list_id or if the posting list identified by
  //    backfill_posting_list_id has been corrupted.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //    posting list.
  libtextclassifier3::Status AddPrefixBackfillHits(
      PostingListIdentifier backfill_posting_list_id,
      PostingListHitAccessor* hit_accum);

  // Transfer hits from old_pl_accessor to new_index for term.
  //
  // Returns:
  //   largest document id added to the translated posting list, on success
  //   INTERNAL_ERROR on IO error
  static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits(
      const std::vector<DocumentId>& document_id_old_to_new,
      std::string_view term, PostingListHitAccessor& old_pl_accessor,
      MainIndex* new_index);

  // Transfer hits from the current main index to new_index.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on IO error
  libtextclassifier3::Status TransferIndex(
      const std::vector<DocumentId>& document_id_old_to_new,
      MainIndex* new_index);

  std::string base_dir_;
  const Filesystem* filesystem_;
  const IcingFilesystem* icing_filesystem_;
  std::unique_ptr<PostingListHitSerializer> posting_list_hit_serializer_;
  std::unique_ptr<FlashIndexStorage> flash_index_storage_;
  std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_INDEX_MAIN_MAIN_INDEX_H_