// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
#define ICING_INDEX_MAIN_MAIN_INDEX_H_

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
#include "icing/file/posting_list/flash-index-storage.h"
#include "icing/file/posting_list/posting-list-identifier.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/posting-list-hit-accessor.h"
#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/store/document-id.h"
#include "icing/store/suggestion-result-checker.h"
#include "icing/util/crc32.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

class MainIndex {
 public:
  // RETURNS:
  //  - valid instance of MainIndex, on success.
  //  - INTERNAL error if unable to create the lexicon or flash storage.
  static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
      const std::string& index_directory, const Filesystem* filesystem,
      const IcingFilesystem* icing_filesystem);
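  // Example usage (illustrative sketch only; the directory path and the
  // filesystem instances below are placeholders, not part of this API):
  //
  //   Filesystem filesystem;
  //   IcingFilesystem icing_filesystem;
  //   ICING_ASSIGN_OR_RETURN(
  //       std::unique_ptr<MainIndex> main_index,
  //       MainIndex::Create("/path/to/index/main", &filesystem,
  //                         &icing_filesystem));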

  // Reads magic from existing flash index storage file header. We need this
  // during Icing initialization phase to determine the version.
  //
  // RETURNS:
  //   - On success, a valid magic.
  //   - NOT_FOUND if the flash index doesn't exist.
  //   - INTERNAL on I/O error.
  static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
      const Filesystem* filesystem, const std::string& index_directory);
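  // Illustrative sketch of checking the on-disk format during initialization
  // (filesystem and index_dir are placeholders; the version comparison itself
  // is up to the caller and not defined here):
  //
  //   auto magic_or = MainIndex::ReadFlashIndexMagic(&filesystem, index_dir);
  //   if (!magic_or.ok()) {
  //     // NOT_FOUND: no flash index exists yet. INTERNAL: I/O error.
  //   } else {
  //     int magic = magic_or.ValueOrDie();
  //     // Compare 'magic' against the value expected by the current version.
  //   }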

  // Get a PostingListHitAccessor that holds the posting list chain for 'term'.
  //
  // RETURNS:
  //  - On success, a valid PostingListHitAccessor
  //  - NOT_FOUND if term is not present in the main index.
  libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
  GetAccessorForExactTerm(const std::string& term);
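  // Illustrative sketch of reading hits for an exact term (main_index is a
  // placeholder instance; assumes PostingListHitAccessor::GetNextHitsBatch()
  // from posting-list-hit-accessor.h; error handling elided):
  //
  //   ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> accessor,
  //                          main_index->GetAccessorForExactTerm("foo"));
  //   ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
  //                          accessor->GetNextHitsBatch());
  //   // An empty batch means the posting list chain has been exhausted.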

  // Get a PostingListHitAccessor for 'prefix'.
  //
  // RETURNS:
  //  - On success, a result containing a valid PostingListHitAccessor.
  //  - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
  //    prefix are present in the main index.
  struct GetPrefixAccessorResult {
    // A PostingListHitAccessor that holds the posting list chain for the term
    // that best represents 'prefix' in the main index.
    std::unique_ptr<PostingListHitAccessor> accessor;
    // True if the returned posting list chain is for 'prefix' or false if the
    // returned posting list chain is for a term for which 'prefix' is a prefix.
    bool exact;

    explicit GetPrefixAccessorResult(
        std::unique_ptr<PostingListHitAccessor> accessor_in, bool exact_in)
        : accessor(std::move(accessor_in)), exact(exact_in) {}
  };
  libtextclassifier3::StatusOr<GetPrefixAccessorResult>
  GetAccessorForPrefixTerm(const std::string& prefix);
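  // Illustrative sketch of handling the result (main_index is a placeholder
  // instance; shown only to demonstrate the 'exact' flag):
  //
  //   ICING_ASSIGN_OR_RETURN(GetPrefixAccessorResult result,
  //                          main_index->GetAccessorForPrefixTerm("fo"));
  //   if (result.exact) {
  //     // The chain holds the posting list for the term "fo" itself.
  //   } else {
  //     // The chain belongs to some term for which "fo" is a prefix.
  //   }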

  // Finds terms with the given prefix, filtered by the given result checker.
  // The input prefix must be normalized; otherwise inaccurate results may be
  // returned. If scoring_match_type is EXACT, only exact hits will be counted;
  // if it is PREFIX, both prefix and exact hits will be counted. Results are
  // not specially sorted and are returned in lexicographical order. The number
  // of results is no more than 'num_to_return'.
  //
  // Returns:
  //   A list of TermMetadata on success
  //   INTERNAL_ERROR if failed to access term data.
  libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
      const std::string& prefix, TermMatchType::Code scoring_match_type,
      SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
      const SuggestionResultChecker* suggestion_result_checker);
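  // Illustrative sketch of a suggestion lookup (main_index and result_checker
  // are placeholders; see suggestion-result-checker.h):
  //
  //   ICING_ASSIGN_OR_RETURN(
  //       std::vector<TermMetadata> terms,
  //       main_index->FindTermsByPrefix(
  //           /*prefix=*/"fo", TermMatchType::PREFIX,
  //           SuggestionScoringSpecProto::SuggestionRankingStrategy::
  //               DOCUMENT_COUNT,
  //           &result_checker));
  //   // Each TermMetadata pairs a suggested term with its score (see
  //   // term-metadata.h).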

  struct LexiconMergeOutputs {
    // Maps from main_lexicon tvi for new branching point to the main_lexicon
    // tvi for posting list whose hits must be backfilled.
    std::unordered_map<uint32_t, uint32_t> backfill_map;

    // Maps from lexicon tvis to main_lexicon tvis.
    std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;

    // Maps from main lexicon tvi to the block index. Tvis with no entry do not
    // have an allocated posting list.
    std::unordered_map<uint32_t, int> main_tvi_to_block_index;

    // Maps from the lexicon tvi to the beginning position in
    // prefix_tvis_buf and the length.
    std::unordered_map<uint32_t, std::pair<int, int>>
        other_tvi_to_prefix_main_tvis;

    // Stores tvis that are mapped to by other_tvi_to_prefix_tvis.
    std::vector<uint32_t> prefix_tvis_buf;
  };

  // Merge the lexicon into the main lexicon and populate the data
  // structures necessary to translate lite tvis to main tvis, track backfilling
  // and expanding lite terms to prefix terms.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
      const IcingDynamicTrie& other_lexicon) {
    // Backfill branch points need to be added first so that the backfill_map
    // can be correctly populated.
    ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
                           AddBackfillBranchPoints(other_lexicon));
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddTerms(other_lexicon, std::move(outputs)));
    // Non-backfill branch points need to be added last so that the mapping of
    // newly added terms to prefix terms can be correctly populated (prefix
    // terms might be branch points between two new terms or between a
    // pre-existing term and a new term).
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddBranchPoints(other_lexicon, std::move(outputs)));
    return outputs;
  }

  // Add hits to the main index and backfill from existing posting lists to new
  // backfill branch points.
  //
  // The backfill_map maps from main_lexicon tvi for a newly added branching
  // point to the main_lexicon tvi for the posting list whose hits must be
  // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
  // in MergeLexicon and be blindly passed to this function.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
  //  that exceeds the max TermId, is not valid, or is not less than
  //  pre-existing hits in the main index.
  //  - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
  //  - RESOURCE_EXHAUSTED error if unable to grow the index
  libtextclassifier3::Status AddHits(
      const TermIdCodec& term_id_codec,
      std::unordered_map<uint32_t, uint32_t>&& backfill_map,
      std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);
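  // Illustrative sketch of the lite-to-main merge flow this method is part of
  // (lite_lexicon, term_id_codec, hits, and last_docid are placeholders; hits
  // are assumed to already reference main-lexicon term ids, translated via
  // outputs.other_tvi_to_main_tvi):
  //
  //   ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
  //                          main_index->MergeLexicon(lite_lexicon));
  //   ICING_RETURN_IF_ERROR(main_index->AddHits(
  //       term_id_codec, std::move(outputs.backfill_map), std::move(hits),
  //       /*last_added_document_id=*/last_docid));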

  libtextclassifier3::Status PersistToDisk() {
    if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
      return libtextclassifier3::Status::OK;
    }
    return absl_ports::InternalError("Unable to sync main index components.");
  }

  // Updates and returns the checksums of the components in the MainIndex.
  Crc32 UpdateChecksum() { return main_lexicon_->UpdateCrc(); }

  // Calculates and returns the checksums of the components in the MainIndex.
  Crc32 GetChecksum() const { return main_lexicon_->GetCrc(); }
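  // Illustrative sketch of a persist-and-verify sequence (main_index is a
  // placeholder; how the checksum is stored and compared is up to the caller):
  //
  //   ICING_RETURN_IF_ERROR(main_index->PersistToDisk());
  //   Crc32 expected_crc = main_index->UpdateChecksum();
  //   // Later, GetChecksum() recalculates the checksum without updating any
  //   // cached value and can be compared against expected_crc.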

  DocumentId last_added_document_id() const {
    return flash_index_storage_->get_last_indexed_docid();
  }

  libtextclassifier3::Status Reset() {
    ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
    main_lexicon_->Clear();
    return libtextclassifier3::Status::OK;
  }

  void Warm() { main_lexicon_->Warm(); }

  // Returns:
  //  - elements size of lexicon and index, on success
  //  - INTERNAL on IO error
  libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;

  // Takes the provided storage_info, populates the fields related to the main
  // index and returns that storage_info.
  //
  // If an IO error occurs while trying to calculate the value for a field, then
  // that field will be set to -1.
  IndexStorageInfoProto GetStorageInfo(
      IndexStorageInfoProto storage_info) const;

  // Returns debug information for the main index.
  // verbosity = BASIC, simplest debug information - just the lexicon
  // verbosity = DETAILED, more detailed debug information including raw
  // posting lists.
  std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const;

  // Reduces internal file sizes by reclaiming space of deleted documents.
  //
  // This method will update the last_added_docid of the index to the largest
  // document id that still appears in the index after compaction.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on IO error; this indicates that the index may be in an
  //                               invalid state and should be cleared.
  libtextclassifier3::Status Optimize(
      const std::vector<DocumentId>& document_id_old_to_new);
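  // Illustrative sketch of the old-to-new mapping (assumes, as elsewhere in
  // Icing, that deleted documents map to kInvalidDocumentId from
  // document-id.h; main_index is a placeholder):
  //
  //   // Documents 0 and 2 survive compaction; document 1 was deleted.
  //   std::vector<DocumentId> document_id_old_to_new = {0, kInvalidDocumentId,
  //                                                     1};
  //   ICING_RETURN_IF_ERROR(main_index->Optimize(document_id_old_to_new));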

 private:
  explicit MainIndex(const std::string& index_directory,
                     const Filesystem* filesystem,
                     const IcingFilesystem* icing_filesystem);

  libtextclassifier3::Status Init();

  // Helpers for merging the lexicon.
  // Add all 'backfill' branch points. Backfill branch points are prefix
  // branch points that are a prefix of terms that existed in the lexicon
  // prior to the merge.
  //
  // For example, if the main lexicon only contains "foot" and is then merged
  // with a lite lexicon containing only "fool", then a backfill branch point
  // for "foo" will be added to contain prefix hits from both the pre-existing
  // posting list for "foot" and the new posting list for "fool".
  //
  // Populates LexiconMergeOutputs.backfill_map
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
      const IcingDynamicTrie& other_lexicon);

  // Add all terms from the lexicon.
  //
  // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Add all branch points for terms added from the lexicon.
  // For example, if the main lexicon is empty and is then merged with a
  // lexicon containing only "foot" and "fool", then a branch point for "foo"
  // will be added to contain prefix hits from both "foot" and "fool".
  //
  // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
  // LexiconMergeOutputs.prefix_tvis_buf;
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Copies all properties from old_tvi in the other lexicon to the new_tvi in
  // the main lexicon.
  // Returns true on success, false if an IO error is encountered.
  bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
                      const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
                      uint32_t new_main_tvi);

  // Add all hits between [hit_elements, hit_elements + len) to main_index,
  // updating the entry in the main lexicon at trie_value_index to point to the
  // resulting posting list. Hits are sorted in descending document id order, so
  // they should be added to posting lists in reverse (starting at
  // hit_elements + len - 1) and working backwards. Therefore, hit_elements must
  // be in sorted order.
  //
  // trie_value_index may point to a valid posting list id if there is a
  // pre-existing posting list to append to.
  //
  // If backfill_posting_list_id is valid, then the hits from the posting list
  // identified by backfill_posting_list_id should be added to the new posting
  // list before the hits in hit_elements.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid
  //  but points out of bounds in the IndexBlock referred to by
  //  id.block_index(), if one of the hits from [hit_elements,hit_elements+len)
  //  is not valid, or if one of the hits from [hit_elements,hit_elements+len)
  //  is not less than the previously added hits.
  //  - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid
  //  but points to an invalid block index or if unable to mmap the IndexBlock.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddHitsForTerm(
      uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
      const TermIdHitPair* hit_elements, size_t len);
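  // For illustration, a valid 'hit_elements' ordering (document ids
  // non-increasing); elements are consumed starting from index len - 1 and
  // working backwards:
  //
  //   hit_elements[0] -> hit for document id 9
  //   hit_elements[1] -> hit for document id 5
  //   hit_elements[2] -> hit for document id 2   // added to the list first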

  // Adds all prefix hits or hits from prefix sections present on the posting
  // list identified by backfill_posting_list_id to hit_accum.
  //
  // RETURNS:
  //  - OK, on success
  //  - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the
  //  IndexBlock referred to by id.block_index()
  //  - INTERNAL_ERROR if unable to mmap the block identified by
  //  backfill_posting_list_id or if the posting list identified by
  //  backfill_posting_list_id has been corrupted.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddPrefixBackfillHits(
      PostingListIdentifier backfill_posting_list_id,
      PostingListHitAccessor* hit_accum);

  // Transfer hits from old_pl_accessor to new_index for term.
  //
  // Returns:
  //   largest document id added to the translated posting list, on success
  //   INTERNAL_ERROR on IO error
  static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits(
      const std::vector<DocumentId>& document_id_old_to_new,
      std::string_view term, PostingListHitAccessor& old_pl_accessor,
      MainIndex* new_index);

  // Transfer hits from the current main index to new_index.
  //
  // Returns:
  //   OK on success
  //   INTERNAL_ERROR on IO error
  libtextclassifier3::Status TransferIndex(
      const std::vector<DocumentId>& document_id_old_to_new,
      MainIndex* new_index);

  std::string base_dir_;
  const Filesystem* filesystem_;
  const IcingFilesystem* icing_filesystem_;
  std::unique_ptr<PostingListHitSerializer> posting_list_hit_serializer_;
  std::unique_ptr<FlashIndexStorage> flash_index_storage_;
  std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_INDEX_MAIN_MAIN_INDEX_H_