xref: /aosp_15_r20/external/icing/icing/index/main/main-index.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "icing/index/main/main-index.h"
15 
16 #include <algorithm>
17 #include <cstdint>
18 #include <cstring>
19 #include <iterator>
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/file/destructible-directory.h"
31 #include "icing/file/filesystem.h"
32 #include "icing/file/posting_list/flash-index-storage.h"
33 #include "icing/file/posting_list/posting-list-accessor.h"
34 #include "icing/file/posting_list/posting-list-common.h"
35 #include "icing/file/posting_list/posting-list-identifier.h"
36 #include "icing/index/hit/hit.h"
37 #include "icing/index/lite/term-id-hit-pair.h"
38 #include "icing/index/main/posting-list-hit-accessor.h"
39 #include "icing/index/main/posting-list-hit-serializer.h"
40 #include "icing/index/term-id-codec.h"
41 #include "icing/index/term-metadata.h"
42 #include "icing/index/term-property-id.h"
43 #include "icing/legacy/core/icing-string-util.h"
44 #include "icing/legacy/index/icing-dynamic-trie.h"
45 #include "icing/legacy/index/icing-filesystem.h"
46 #include "icing/proto/debug.pb.h"
47 #include "icing/proto/storage.pb.h"
48 #include "icing/proto/term.pb.h"
49 #include "icing/store/document-id.h"
50 #include "icing/store/namespace-id.h"
51 #include "icing/store/suggestion-result-checker.h"
52 #include "icing/util/logging.h"
53 #include "icing/util/status-macros.h"
54 
55 namespace icing {
56 namespace lib {
57 
58 namespace {
59 
// Finds the shortest, valid prefix term with prefix hits in lexicon for which
// "prefix" is a prefix.
// Returns a valid FindTermResult with found=true if either:
//   1. prefix exists as a term in lexicon.
//   2. the shortest, valid prefix in the lexicon exists and contains prefix
//      hits.
// Returns a FindTermResult with found=false and undefined values of tvi and
// exact if no term was found.
struct FindTermResult {
  // TVI (trie value index) of the term that was found. Undefined if
  // found=false.
  uint32_t tvi;
  // Whether or not a valid term with prefix hits was found.
  bool found;
  // Whether or not that term is equal to 'prefix' (an exact match rather
  // than a longer term that merely starts with 'prefix').
  bool exact;
};
FindShortestValidTermWithPrefixHits(const IcingDynamicTrie * lexicon,const std::string & prefix)76 FindTermResult FindShortestValidTermWithPrefixHits(
77     const IcingDynamicTrie* lexicon, const std::string& prefix) {
78   // For prefix indexing: when we are doing a prefix match for "prefix", find
79   // the tvi to the equivalent posting list. prefix's own posting list might not
80   // exist but one of its children acts as a proxy.
81   IcingDynamicTrie::PropertyReader hits_in_prefix_section(
82       *lexicon, GetHasHitsInPrefixSectionPropertyId());
83   uint32_t tvi = 0;
84   bool found = false;
85   bool exact = false;
86   for (IcingDynamicTrie::Iterator it(*lexicon, prefix); it.IsValid();
87        it.Advance()) {
88     PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
89     memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
90 
91     // Posting list id might be invalid if this is also a backfill term.
92     // Suppose that the main index has two pre-existing prefix hits "foot" and
93     // "fool" - it will have a branch point posting list for "foo". Then, let's
94     // suppose that the other index adds hits for "foul", "four" and "far". This
95     // will result in branch points for "fo" and "f".
96     // If "fo" was added before "f", then the iterator would first give us "fo".
97     // "fo" will have an invalid posting_list_id because it hasn't been
98     // backfilled yet, so we need to continue iterating to "foo".
99     if (posting_list_id.is_valid()) {
100       exact = (prefix.size() == it.GetKey().size());
101       tvi = it.GetValueIndex();
102       // Found it. Does it have prefix hits?
103       found = exact || hits_in_prefix_section.HasProperty(tvi);
104       break;
105     }
106   }
107   FindTermResult result = {tvi, found, exact};
108   return result;
109 }
110 
// Returns the path of the flash index file stored under `base_dir`.
std::string MakeFlashIndexFilename(const std::string& base_dir) {
  std::string filename = base_dir;
  filename.append("/main_index");
  return filename;
}
114 
115 }  // namespace
116 
// Private constructor: stores the index directory and (non-owned) filesystem
// pointers and creates the hit serializer. Callers must go through Create(),
// which invokes Init() before handing the instance out. Both filesystem
// pointers must outlive this instance.
MainIndex::MainIndex(const std::string& index_directory,
                     const Filesystem* filesystem,
                     const IcingFilesystem* icing_filesystem)
    : base_dir_(index_directory),
      filesystem_(filesystem),
      icing_filesystem_(icing_filesystem),
      posting_list_hit_serializer_(
          std::make_unique<PostingListHitSerializer>()) {}
125 
Create(const std::string & index_directory,const Filesystem * filesystem,const IcingFilesystem * icing_filesystem)126 libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
127     const std::string& index_directory, const Filesystem* filesystem,
128     const IcingFilesystem* icing_filesystem) {
129   ICING_RETURN_ERROR_IF_NULL(filesystem);
130   ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
131   std::unique_ptr<MainIndex> main_index(
132       new MainIndex(index_directory, filesystem, icing_filesystem));
133   ICING_RETURN_IF_ERROR(main_index->Init());
134   return main_index;
135 }
136 
ReadFlashIndexMagic(const Filesystem * filesystem,const std::string & index_directory)137 /* static */ libtextclassifier3::StatusOr<int> MainIndex::ReadFlashIndexMagic(
138     const Filesystem* filesystem, const std::string& index_directory) {
139   return FlashIndexStorage::ReadHeaderMagic(
140       filesystem, MakeFlashIndexFilename(index_directory));
141 }
142 
143 // TODO(b/139087650) : Migrate off of IcingFilesystem.
Init()144 libtextclassifier3::Status MainIndex::Init() {
145   if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) {
146     return absl_ports::InternalError("Unable to create main index directory.");
147   }
148   std::string flash_index_file = MakeFlashIndexFilename(base_dir_);
149   ICING_ASSIGN_OR_RETURN(
150       FlashIndexStorage flash_index,
151       FlashIndexStorage::Create(flash_index_file, filesystem_,
152                                 posting_list_hit_serializer_.get()));
153   flash_index_storage_ =
154       std::make_unique<FlashIndexStorage>(std::move(flash_index));
155 
156   std::string lexicon_file = base_dir_ + "/main-lexicon";
157   IcingDynamicTrie::RuntimeOptions runtime_options;
158   main_lexicon_ = std::make_unique<IcingDynamicTrie>(
159       lexicon_file, runtime_options, icing_filesystem_);
160   IcingDynamicTrie::Options lexicon_options;
161   if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
162       !main_lexicon_->Init()) {
163     return absl_ports::InternalError("Failed to initialize lexicon trie");
164   }
165   return libtextclassifier3::Status::OK;
166 }
167 
GetElementsSize() const168 libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
169   IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
170   if (storage_info.main_index_storage_size() == -1 ||
171       storage_info.main_index_lexicon_size() == -1) {
172     return absl_ports::AbortedError(
173         "Failed to get size of MainIndex's members.");
174   }
175   return storage_info.main_index_storage_size() +
176          storage_info.main_index_lexicon_size();
177 }
178 
// Populates the main-index fields of `storage_info` and returns it.
// File sizes are sanitized (negative/unknown sizes become -1 per the
// Sanitize helpers); block statistics come straight from flash storage.
IndexStorageInfoProto MainIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  storage_info.set_main_index_lexicon_size(
      IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
  storage_info.set_main_index_storage_size(
      Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
  storage_info.set_main_index_block_size(flash_index_storage_->block_size());
  storage_info.set_num_blocks(flash_index_storage_->num_blocks());
  storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
  return storage_info;
}
190 
191 libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
GetAccessorForExactTerm(const std::string & term)192 MainIndex::GetAccessorForExactTerm(const std::string& term) {
193   PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
194   if (!main_lexicon_->Find(term, &posting_list_id)) {
195     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
196         "Term %s is not present in main lexicon.", term.c_str()));
197   }
198   return PostingListHitAccessor::CreateFromExisting(
199       flash_index_storage_.get(), posting_list_hit_serializer_.get(),
200       posting_list_id);
201 }
202 
203 libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
GetAccessorForPrefixTerm(const std::string & prefix)204 MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
205   bool exact = false;
206   // For prefix indexing: when we are doing a prefix match for
207   // "prefix", find the tvi to the equivalent posting list. prefix's
208   // own posting list might not exist but its shortest child acts as a proxy.
209   //
210   // For example, if there are only two hits in the index are prefix hits for
211   // "bar" and "bat", then both will appear on a posting list for "ba". "b"
212   // won't have a posting list, but "ba" will suffice.
213   IcingDynamicTrie::PropertyReader hits_in_prefix_section(
214       *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
215   IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix);
216   if (!main_itr.IsValid()) {
217     return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
218         "Term: %s is not present in the main lexicon.", prefix.c_str()));
219   }
220   exact = (prefix.size() == main_itr.GetKey().size());
221 
222   if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
223     // Found it, but it doesn't have prefix hits. Exit early. No need to
224     // retrieve the posting list because there's nothing there for us.
225     return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
226   }
227   PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
228   memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
229   ICING_ASSIGN_OR_RETURN(
230       std::unique_ptr<PostingListHitAccessor> pl_accessor,
231       PostingListHitAccessor::CreateFromExisting(
232           flash_index_storage_.get(), posting_list_hit_serializer_.get(),
233           posting_list_id));
234   return GetPrefixAccessorResult(std::move(pl_accessor), exact);
235 }
236 
237 // TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
IsTermInNamespaces(const IcingDynamicTrie::PropertyReadersAll & property_reader,uint32_t value_index,const std::vector<NamespaceId> & namespace_ids)238 bool IsTermInNamespaces(
239     const IcingDynamicTrie::PropertyReadersAll& property_reader,
240     uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
241   if (namespace_ids.empty()) {
242     return true;
243   }
244   for (NamespaceId namespace_id : namespace_ids) {
245     if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
246                                     value_index)) {
247       return true;
248     }
249   }
250 
251   return false;
252 }
253 
254 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindTermsByPrefix(const std::string & prefix,TermMatchType::Code scoring_match_type,SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,const SuggestionResultChecker * suggestion_result_checker)255 MainIndex::FindTermsByPrefix(
256     const std::string& prefix, TermMatchType::Code scoring_match_type,
257     SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
258     const SuggestionResultChecker* suggestion_result_checker) {
259   // Finds all the terms that start with the given prefix in the lexicon.
260   IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix);
261 
262   std::vector<TermMetadata> term_metadata_list;
263   while (term_iterator.IsValid()) {
264     int score = 0;
265     DocumentId last_document_id = kInvalidDocumentId;
266     bool is_last_document_in_desired = false;
267 
268     PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
269     memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
270     ICING_ASSIGN_OR_RETURN(
271         std::unique_ptr<PostingListHitAccessor> pl_accessor,
272         PostingListHitAccessor::CreateFromExisting(
273             flash_index_storage_.get(), posting_list_hit_serializer_.get(),
274             posting_list_id));
275     ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
276                            pl_accessor->GetNextHitsBatch());
277     while (!hits.empty()) {
278       for (const Hit& hit : hits) {
279         // Check whether this Hit is desired.
280         DocumentId document_id = hit.document_id();
281         bool is_new_document = document_id != last_document_id;
282         if (is_new_document) {
283           last_document_id = document_id;
284           is_last_document_in_desired =
285               suggestion_result_checker->BelongsToTargetResults(
286                   document_id, hit.section_id());
287         }
288         if (!is_last_document_in_desired) {
289           // The document is removed or expired or not belongs to target
290           // namespaces.
291           continue;
292         }
293         if (scoring_match_type == TermMatchType::EXACT_ONLY &&
294             hit.is_prefix_hit()) {
295           continue;
296         }
297 
298         // Score the hit by the strategy
299         if (score_by ==
300             SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE) {
301           // Give 1 to all match terms and return them in arbitrary order
302           score = 1;
303           break;
304         } else if (score_by == SuggestionScoringSpecProto::
305                                    SuggestionRankingStrategy::DOCUMENT_COUNT &&
306                    is_new_document) {
307           ++score;
308         } else if (score_by == SuggestionScoringSpecProto::
309                                    SuggestionRankingStrategy::TERM_FREQUENCY) {
310           if (hit.has_term_frequency()) {
311             score += hit.term_frequency();
312           } else {
313             ++score;
314           }
315         }
316       }
317       if (score_by ==
318               SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE &&
319           score == 1) {
320         // The term is desired and no need to be scored.
321         break;
322       }
323       ICING_ASSIGN_OR_RETURN(hits, pl_accessor->GetNextHitsBatch());
324     }
325     if (score > 0) {
326       term_metadata_list.push_back(
327           TermMetadata(std::string(term_iterator.GetKey()), score));
328     }
329 
330     term_iterator.Advance();
331   }
332   return term_metadata_list;
333 }
334 
335 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddBackfillBranchPoints(const IcingDynamicTrie & other_lexicon)336 MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
337   // Maps new branching points in main lexicon to the term such that
338   // branching_point_term is a prefix of term and there are no terms smaller
339   // than term and greater than branching_point_term.
340   std::string prefix;
341   LexiconMergeOutputs outputs;
342   for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
343        other_term_itr.IsValid(); other_term_itr.Advance()) {
344     // If term were inserted in the main lexicon, what new branching would it
345     // create? (It always creates at most one.)
346     int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
347         other_term_itr.GetKey(), /*utf8=*/true);
348     if (prefix_len <= 0) {
349       continue;
350     }
351     prefix.assign(other_term_itr.GetKey().substr(0, prefix_len));
352 
353     // Figure out backfill tvi. Might not exist since all children terms could
354     // only contain hits from non-prefix sections.
355     //
356     // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
357     // we're adding "foul". The new branching prefix will be "fo". The backfill
358     // prefix will be "foo" - all hits in prefix section on "foo" will need to
359     // be added to the new "fo" posting list later.
360     FindTermResult result =
361         FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
362     if (!result.found || result.exact) {
363       continue;
364     }
365 
366     // This is a new prefix that will need backfilling from its next-in-line
367     // posting list. This new prefix will have to have a posting list eventually
368     // so insert a default PostingListIdentifier as a placeholder.
369     uint32_t branching_prefix_tvi;
370     bool new_key;
371     PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
372     libtextclassifier3::Status status =
373         main_lexicon_->Insert(prefix, &posting_list_id, &branching_prefix_tvi,
374                               /*replace=*/false, &new_key);
375     if (!status.ok()) {
376       ICING_LOG(DBG) << "Could not insert branching prefix\n"
377                      << status.error_message();
378       return status;
379     }
380 
381     // Backfills only contain prefix hits by default. So set these here but
382     // could be overridden when adding hits from the other index later.
383     if (!main_lexicon_->SetProperty(branching_prefix_tvi,
384                                     GetHasNoExactHitsPropertyId()) ||
385         !main_lexicon_->SetProperty(branching_prefix_tvi,
386                                     GetHasHitsInPrefixSectionPropertyId())) {
387       return absl_ports::InternalError("Setting prefix prop failed");
388     }
389 
390     outputs.backfill_map[branching_prefix_tvi] = result.tvi;
391   }
392   return outputs;
393 }
394 
395 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddTerms(const IcingDynamicTrie & other_lexicon,LexiconMergeOutputs && outputs)396 MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
397                     LexiconMergeOutputs&& outputs) {
398   IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
399   for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
400        other_term_itr.IsValid(); other_term_itr.Advance()) {
401     uint32_t new_main_tvi;
402     PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
403     libtextclassifier3::Status status = main_lexicon_->Insert(
404         other_term_itr.GetKey(), &posting_list_id, &new_main_tvi,
405         /*replace=*/false);
406     if (!status.ok()) {
407       ICING_LOG(DBG) << "Could not insert term: " << other_term_itr.GetKey()
408                      << "\n"
409                      << status.error_message();
410       return status;
411     }
412 
413     // Copy the properties from the other lexicon over to the main lexicon.
414     uint32_t other_tvi = other_term_itr.GetValueIndex();
415     if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
416                         new_main_tvi)) {
417       return absl_ports::InternalError(absl_ports::StrCat(
418           "Could not insert term: ", other_term_itr.GetKey()));
419     }
420 
421     // Add other to main mapping.
422     outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
423 
424     memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(new_main_tvi),
425            sizeof(posting_list_id));
426     if (posting_list_id.block_index() != kInvalidBlockIndex) {
427       outputs.main_tvi_to_block_index[new_main_tvi] =
428           posting_list_id.block_index();
429     }
430   }
431   return std::move(outputs);
432 }
433 
434 libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
AddBranchPoints(const IcingDynamicTrie & other_lexicon,LexiconMergeOutputs && outputs)435 MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
436                            LexiconMergeOutputs&& outputs) {
437   IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
438       other_lexicon, GetHasHitsInPrefixSectionPropertyId());
439   if (!has_prefix_prop_reader.Exists()) {
440     return std::move(outputs);
441   }
442   std::string prefix;
443   for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
444        other_term_itr.IsValid(); other_term_itr.Advance()) {
445     // Only expand terms that have hits in prefix sections.
446     if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
447       continue;
448     }
449 
450     // Get prefixes where there is already a branching point in the main
451     // lexicon. We skip prefixes which don't already have a branching point.
452     std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
453         other_term_itr.GetKey(), /*utf8=*/true);
454 
455     int buf_start = outputs.prefix_tvis_buf.size();
456     // Add prefixes.
457     for (int prefix_length : prefix_lengths) {
458       if (prefix_length <= 0) {
459         continue;
460       }
461 
462       prefix.assign(other_term_itr.GetKey().substr(0, prefix_length));
463       uint32_t prefix_tvi;
464       bool new_key;
465       PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
466       libtextclassifier3::Status status =
467           main_lexicon_->Insert(prefix, &posting_list_id, &prefix_tvi,
468                                 /*replace=*/false, &new_key);
469       if (!status.ok()) {
470         ICING_LOG(DBG) << "Could not insert prefix: " << prefix << "\n"
471                        << status.error_message();
472         return status;
473       }
474 
475       // Prefix tvi will have hits in prefix section.
476       if (!main_lexicon_->SetProperty(prefix_tvi,
477                                       GetHasHitsInPrefixSectionPropertyId())) {
478         return absl_ports::InternalError(
479             "Setting has hits in prefix section prop failed");
480       }
481 
482       // If it hasn't been added by non-prefix term insertions in
483       // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
484       if (new_key && !main_lexicon_->SetProperty(
485                          prefix_tvi, GetHasNoExactHitsPropertyId())) {
486         return absl_ports::InternalError("Setting no exact hits prop failed");
487       }
488 
489       outputs.prefix_tvis_buf.push_back(prefix_tvi);
490 
491       memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(prefix_tvi),
492              sizeof(posting_list_id));
493       if (posting_list_id.block_index() != kInvalidBlockIndex) {
494         outputs.main_tvi_to_block_index[prefix_tvi] =
495             posting_list_id.block_index();
496       }
497     }
498 
499     // Any prefixes added? Then add to map.
500     if (buf_start < outputs.prefix_tvis_buf.size()) {
501       outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
502           buf_start, outputs.prefix_tvis_buf.size() - buf_start};
503     }
504   }
505   return std::move(outputs);
506 }
507 
CopyProperties(const IcingDynamicTrie::PropertyReadersAll & prop_reader,const IcingDynamicTrie & other_lexicon,uint32_t other_tvi,uint32_t new_main_tvi)508 bool MainIndex::CopyProperties(
509     const IcingDynamicTrie::PropertyReadersAll& prop_reader,
510     const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
511     uint32_t new_main_tvi) {
512   for (uint32_t property_id = 0; property_id < prop_reader.size();
513        ++property_id) {
514     if (property_id == GetHasNoExactHitsPropertyId()) {
515       // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
516       // for this term, then HasNoExactHits needs to be set to false in
517       // main_lexicon. If other_lexicon has no exact hits for this term, then
518       // HasNoExactHits in the main_lexicon should not be modified.
519       if (!prop_reader.HasProperty(property_id, other_tvi) &&
520           !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
521         ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed";
522         return false;
523       }
524     } else {
525       // If other_lexicon has this property set for this term, then that
526       // property needs to be set for the main_lexicon. If other_lexicon
527       // doesn't have this property set, then the property in the main lexicon
528       // should not be modified.
529       if (prop_reader.HasProperty(property_id, other_tvi) &&
530           !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
531         return false;
532       }
533     }
534   }
535   return true;
536 }
537 
AddHits(const TermIdCodec & term_id_codec,std::unordered_map<uint32_t,uint32_t> && backfill_map,std::vector<TermIdHitPair> && hits,DocumentId last_added_document_id)538 libtextclassifier3::Status MainIndex::AddHits(
539     const TermIdCodec& term_id_codec,
540     std::unordered_map<uint32_t, uint32_t>&& backfill_map,
541     std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id) {
542   if (hits.empty()) {
543     flash_index_storage_->set_last_indexed_docid(last_added_document_id);
544     return libtextclassifier3::Status::OK;
545   }
546   uint32_t cur_term_id = hits[0].term_id();
547   ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
548                          term_id_codec.DecodeTermInfo(cur_term_id));
549   // Iterate through all hits. If these hits are for a term that also needs
550   // backfill, then backfill first and then add the new hits.
551   size_t k_start = 0;
552   size_t k_end = 0;
553   while (k_start < hits.size()) {
554     uint32_t term_id = hits[k_end].term_id();
555     while (term_id == cur_term_id && ++k_end < hits.size()) {
556       term_id = hits[k_end].term_id();
557     }
558 
559     // Look for backfill.
560     PostingListIdentifier backfill_posting_list_id =
561         PostingListIdentifier::kInvalid;
562     auto itr = backfill_map.find(cur_decoded_term.tvi);
563     if (itr != backfill_map.end()) {
564       const void* value = main_lexicon_->GetValueAtIndex(itr->second);
565       memcpy(&backfill_posting_list_id, value,
566              sizeof(backfill_posting_list_id));
567       backfill_map.erase(itr);
568     }
569     ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi,
570                                          backfill_posting_list_id,
571                                          &hits[k_start], k_end - k_start));
572     cur_term_id = term_id;
573     ICING_ASSIGN_OR_RETURN(cur_decoded_term,
574                            term_id_codec.DecodeTermInfo(cur_term_id));
575     k_start = k_end;
576   }
577 
578   // Now copy remaining backfills.
579   ICING_VLOG(1) << "Remaining backfills " << backfill_map.size();
580   for (auto other_tvi_main_tvi_pair : backfill_map) {
581     PostingListIdentifier backfill_posting_list_id =
582         PostingListIdentifier::kInvalid;
583     memcpy(&backfill_posting_list_id,
584            main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second),
585            sizeof(backfill_posting_list_id));
586     ICING_ASSIGN_OR_RETURN(
587         std::unique_ptr<PostingListHitAccessor> hit_accum,
588         PostingListHitAccessor::Create(flash_index_storage_.get(),
589                                        posting_list_hit_serializer_.get()));
590     ICING_RETURN_IF_ERROR(
591         AddPrefixBackfillHits(backfill_posting_list_id, hit_accum.get()));
592     PostingListAccessor::FinalizeResult result =
593         std::move(*hit_accum).Finalize();
594     if (result.id.is_valid()) {
595       main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id);
596     }
597   }
598   flash_index_storage_->set_last_indexed_docid(last_added_document_id);
599   return libtextclassifier3::Status::OK;
600 }
601 
// Writes `len` hits (from `hit_elements`) onto the posting list of the term
// at `tvi`, optionally first backfilling prefix hits from
// `backfill_posting_list_id`. On success the term's lexicon value is updated
// with the finalized posting list identifier.
libtextclassifier3::Status MainIndex::AddHitsForTerm(
    uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
    const TermIdHitPair* hit_elements, size_t len) {
  // 1. Create a PostingListHitAccessor - either from the pre-existing block, if
  // one exists, or from scratch.
  PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
  memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi),
         sizeof(posting_list_id));
  std::unique_ptr<PostingListHitAccessor> pl_accessor;
  if (posting_list_id.is_valid()) {
    if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
      ICING_LOG(ERROR) << "Index dropped hits. Invalid block index "
                       << posting_list_id.block_index()
                       << " >= " << flash_index_storage_->num_blocks();
      // TODO(b/159918304) : Consider revising the checksumming strategy in the
      // main index. Providing some mechanism to check for corruption - either
      // during initialization or some later time would allow us to avoid
      // whack-a-mole with odd corruption issues like this one (b/62820689).
      return absl_ports::InternalError(
          "Valid posting list has an invalid block index!");
    }
    ICING_ASSIGN_OR_RETURN(
        pl_accessor, PostingListHitAccessor::CreateFromExisting(
                         flash_index_storage_.get(),
                         posting_list_hit_serializer_.get(), posting_list_id));
  } else {
    // New posting list.
    ICING_ASSIGN_OR_RETURN(
        pl_accessor,
        PostingListHitAccessor::Create(flash_index_storage_.get(),
                                       posting_list_hit_serializer_.get()));
  }

  // 2. Backfill any hits if necessary.
  if (backfill_posting_list_id.is_valid()) {
    ICING_RETURN_IF_ERROR(
        AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get()));
  }

  // 3. Add all the new hits. Iterate in reverse because PrependHit expects
  // hits in decreasing order while hit_elements holds them increasing.
  for (int i = len - 1; i >= 0; --i) {
    Hit hit = hit_elements[i].hit();
    ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit));
  }

  // 4. Finalize this posting list and put its identifier in the lexicon.
  PostingListAccessor::FinalizeResult result =
      std::move(*pl_accessor).Finalize();
  if (result.id.is_valid()) {
    main_lexicon_->SetValueAtIndex(tvi, &result.id);
  }
  return libtextclassifier3::Status::OK;
}
655 
// Copies all prefix-section hits from the posting list identified by
// `backfill_posting_list_id` into `hit_accum`, rewriting each as a prefix
// hit and dropping duplicates that only differed by the is_prefix flag.
libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
    PostingListIdentifier backfill_posting_list_id,
    PostingListHitAccessor* hit_accum) {
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<PostingListHitAccessor> backfill_accessor,
      PostingListHitAccessor::CreateFromExisting(
          flash_index_storage_.get(), posting_list_hit_serializer_.get(),
          backfill_posting_list_id));
  // Drain the source posting list completely into backfill_hits.
  std::vector<Hit> backfill_hits;
  ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
                         backfill_accessor->GetNextHitsBatch());
  while (!tmp.empty()) {
    std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits));
    ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch());
  }

  Hit::EqualsValueAndFlags hit_equals_value_and_flags_comparator;
  Hit last_added_hit(Hit::kInvalidValue);
  // The hits in backfill_hits are in the reverse order of how they were added.
  // Iterate in reverse to add them to this new posting list in the correct
  // order.
  for (auto itr = backfill_hits.rbegin(); itr != backfill_hits.rend(); ++itr) {
    const Hit& hit = *itr;
    // Skip hits from non-prefix-enabled sections.
    if (!hit.is_in_prefix_section()) {
      continue;
    }

    // A backfill hit is a prefix hit in a prefix section.
    const Hit backfill_hit(hit.section_id(), hit.document_id(),
                           hit.term_frequency(),
                           /*is_in_prefix_section=*/true,
                           /*is_prefix_hit=*/true, /*is_stemmed_hit=*/false);
    if (hit_equals_value_and_flags_comparator(backfill_hit, last_added_hit)) {
      // Skip duplicate values due to overriding of the is_prefix flag.
      continue;
    }
    last_added_hit = backfill_hit;
    ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit));
  }
  return libtextclassifier3::Status::OK;
}
698 
GetDebugInfo(DebugInfoVerbosity::Code verbosity) const699 std::string MainIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
700   std::string res;
701 
702   // Lexicon.
703   std::string lexicon_info;
704   main_lexicon_->GetDebugInfo(verbosity, &lexicon_info);
705 
706   IcingStringUtil::SStringAppendF(&res, 0,
707                                   "last_added_document_id: %u\n"
708                                   "\n"
709                                   "main_lexicon_info:\n%s\n",
710                                   last_added_document_id(),
711                                   lexicon_info.c_str());
712 
713   if (verbosity == DebugInfoVerbosity::BASIC) {
714     return res;
715   }
716 
717   std::string flash_index_storage_info;
718   flash_index_storage_->GetDebugInfo(verbosity, &flash_index_storage_info);
719   IcingStringUtil::SStringAppendF(&res, 0, "flash_index_storage_info:\n%s\n",
720                                   flash_index_storage_info.c_str());
721   return res;
722 }
723 
Optimize(const std::vector<DocumentId> & document_id_old_to_new)724 libtextclassifier3::Status MainIndex::Optimize(
725     const std::vector<DocumentId>& document_id_old_to_new) {
726   std::string temporary_index_dir_path = base_dir_ + "_temp";
727   if (!filesystem_->DeleteDirectoryRecursively(
728           temporary_index_dir_path.c_str())) {
729     ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_dir_path;
730     return absl_ports::InternalError(
731         "Unable to delete temp directory to prepare to build new index.");
732   }
733 
734   DestructibleDirectory temporary_index_dir(
735       filesystem_, std::move(temporary_index_dir_path));
736   if (!temporary_index_dir.is_valid()) {
737     return absl_ports::InternalError(
738         "Unable to create temp directory to build new index.");
739   }
740 
741   ICING_ASSIGN_OR_RETURN(std::unique_ptr<MainIndex> new_index,
742                          MainIndex::Create(temporary_index_dir.dir(),
743                                            filesystem_, icing_filesystem_));
744   ICING_RETURN_IF_ERROR(TransferIndex(document_id_old_to_new, new_index.get()));
745   ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
746   new_index = nullptr;
747   flash_index_storage_ = nullptr;
748   main_lexicon_ = nullptr;
749 
750   if (!filesystem_->SwapFiles(temporary_index_dir.dir().c_str(),
751                               base_dir_.c_str())) {
752     return absl_ports::InternalError(
753         "Unable to apply new index due to failed swap!");
754   }
755 
756   // Reinitialize the index so that flash_index_storage_ and main_lexicon_ are
757   // properly updated.
758   return Init();
759 }
760 
TransferAndAddHits(const std::vector<DocumentId> & document_id_old_to_new,std::string_view term,PostingListHitAccessor & old_pl_accessor,MainIndex * new_index)761 libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits(
762     const std::vector<DocumentId>& document_id_old_to_new,
763     std::string_view term, PostingListHitAccessor& old_pl_accessor,
764     MainIndex* new_index) {
765   std::vector<Hit> new_hits;
766   bool has_no_exact_hits = true;
767   bool has_hits_in_prefix_section = false;
768   // The largest document id after translating hits.
769   DocumentId largest_document_id = kInvalidDocumentId;
770   ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
771                          old_pl_accessor.GetNextHitsBatch());
772   while (!tmp.empty()) {
773     for (const Hit& hit : tmp) {
774       // A safety check to add robustness to the codebase, so to make sure that
775       // we never access invalid memory, in case that hit from the posting list
776       // is corrupted.
777       if (hit.document_id() < 0 ||
778           hit.document_id() >= document_id_old_to_new.size()) {
779         continue;
780       }
781       DocumentId new_document_id = document_id_old_to_new[hit.document_id()];
782       // Transfer the document id of the hit, if the document is not deleted
783       // or outdated.
784       if (new_document_id != kInvalidDocumentId) {
785         if (hit.is_in_prefix_section()) {
786           has_hits_in_prefix_section = true;
787         }
788         if (!hit.is_prefix_hit()) {
789           has_no_exact_hits = false;
790         }
791         if (largest_document_id == kInvalidDocumentId ||
792             new_document_id > largest_document_id) {
793           largest_document_id = new_document_id;
794         }
795         new_hits.push_back(Hit::TranslateHit(hit, new_document_id));
796       }
797     }
798     ICING_ASSIGN_OR_RETURN(tmp, old_pl_accessor.GetNextHitsBatch());
799   }
800   // A term without exact hits indicates that it is a purely backfill term. If
801   // the term is not branching in the new trie, it means backfilling is no
802   // longer necessary, so that we can skip.
803   if (new_hits.empty() ||
804       (has_no_exact_hits && !new_index->main_lexicon_->IsBranchingTerm(term))) {
805     return largest_document_id;
806   }
807 
808   ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> hit_accum,
809                          PostingListHitAccessor::Create(
810                              new_index->flash_index_storage_.get(),
811                              new_index->posting_list_hit_serializer_.get()));
812   for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) {
813     ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*itr));
814   }
815   PostingListAccessor::FinalizeResult result = std::move(*hit_accum).Finalize();
816   if (!result.id.is_valid()) {
817     return absl_ports::InternalError(
818         absl_ports::StrCat("Failed to add translated hits for term: ", term));
819   }
820   uint32_t tvi;
821   libtextclassifier3::Status status =
822       new_index->main_lexicon_->Insert(term, &result.id, &tvi,
823                                        /*replace=*/false);
824   if (!status.ok()) {
825     ICING_LOG(DBG) << "Could not transfer main index for term: " << term << "\n"
826                    << status.error_message();
827     return status;
828   }
829   if (has_no_exact_hits && !new_index->main_lexicon_->SetProperty(
830                                tvi, GetHasNoExactHitsPropertyId())) {
831     return absl_ports::InternalError("Setting prefix prop failed");
832   }
833   if (has_hits_in_prefix_section &&
834       !new_index->main_lexicon_->SetProperty(
835           tvi, GetHasHitsInPrefixSectionPropertyId())) {
836     return absl_ports::InternalError("Setting prefix prop failed");
837   }
838   return largest_document_id;
839 }
840 
TransferIndex(const std::vector<DocumentId> & document_id_old_to_new,MainIndex * new_index)841 libtextclassifier3::Status MainIndex::TransferIndex(
842     const std::vector<DocumentId>& document_id_old_to_new,
843     MainIndex* new_index) {
844   DocumentId largest_document_id = kInvalidDocumentId;
845   for (IcingDynamicTrie::Iterator term_itr(*main_lexicon_, /*prefix=*/"",
846                                            /*reverse=*/true);
847        term_itr.IsValid(); term_itr.Advance()) {
848     PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
849     memcpy(&posting_list_id, term_itr.GetValue(), sizeof(posting_list_id));
850     if (posting_list_id == PostingListIdentifier::kInvalid) {
851       // Why?
852       ICING_LOG(ERROR)
853           << "Got invalid posting_list_id from previous main index";
854       continue;
855     }
856     ICING_ASSIGN_OR_RETURN(
857         std::unique_ptr<PostingListHitAccessor> pl_accessor,
858         PostingListHitAccessor::CreateFromExisting(
859             flash_index_storage_.get(), posting_list_hit_serializer_.get(),
860             posting_list_id));
861     ICING_ASSIGN_OR_RETURN(
862         DocumentId curr_largest_document_id,
863         TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(),
864                            *pl_accessor, new_index));
865     if (curr_largest_document_id == kInvalidDocumentId) {
866       continue;
867     }
868     if (largest_document_id == kInvalidDocumentId ||
869         curr_largest_document_id > largest_document_id) {
870       largest_document_id = curr_largest_document_id;
871     }
872   }
873   new_index->flash_index_storage_->set_last_indexed_docid(largest_document_id);
874   return libtextclassifier3::Status::OK;
875 }
876 
877 }  // namespace lib
878 }  // namespace icing
879