xref: /aosp_15_r20/external/icing/icing/scoring/bm25f-calculator.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCORING_BM25F_CALCULATOR_H_
16 #define ICING_SCORING_BM25F_CALCULATOR_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 
24 #include "icing/index/hit/doc-hit-info.h"
25 #include "icing/index/iterator/doc-hit-info-iterator.h"
26 #include "icing/legacy/index/icing-bit-util.h"
27 #include "icing/scoring/section-weights.h"
28 #include "icing/store/corpus-id.h"
29 #include "icing/store/document-associated-score-data.h"
30 #include "icing/store/document-filter-data.h"
31 #include "icing/store/document-id.h"
32 #include "icing/store/document-store.h"
33 
34 namespace icing {
35 namespace lib {
36 
37 // Bm25fCalculator encapsulates the logic to compute BM25F term-weight based
38 // ranking function.
39 //
40 // The formula to compute BM25F is as follows:
41 //
42 // BM25F = \sum_i IDF(q_i) * tf(q_i, D)
43 //
44 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
45 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
46 // normalized term frequency of query term q_i in the document D.
47 //
48 // IDF(q_i) is computed as follows:
49 //
50 //                     N - n(q_i) + 0.5
51 // IDF(q_i) = log(1 + ------------------)
52 //                       n(q_i) + 0.5
53 //
54 // where N is the number of documents in the corpus, and n(q_i) is the number
55 // of documents in the corpus containing the query term q_i.
56 //
57 // Lastly, tf(q_i, D) is computed as follows:
58 //
59 //                            f(q_i, D) * (k1 + 1)
60 // Normalized TF = --------------------------------------------
61 //                 f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
62 //
63 // where f(q_i, D) is the frequency of query term q_i in document D,
64 // |D| is the #tokens in D, avgdl is the average document length in the corpus,
65 // k1 and b are smoothing parameters.
66 //
67 // see: go/icing-bm25f
68 // see: glossary/bm25
69 class Bm25fCalculator {
70  public:
71   explicit Bm25fCalculator(const DocumentStore *document_store,
72                            SectionWeights *section_weights,
73                            int64_t current_time_ms);
74 
75   // Precompute and cache statistics relevant to BM25F.
76   // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other
77   // results.
78   // The query_term_iterators map is used to build the
79   // std::unordered_map<std::string_view, TermId> term_id_map_. It must
80   // outlive the bm25f-calculator otherwise the string_view key in term_id_map_,
81   // used later to compute a document score, will be meaningless.
82   void PrepareToScore(
83       std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
84           *query_term_iterators);
85 
86   // Compute the BM25F relevance score for the given hit, represented by
87   // DocHitInfo.
88   // The default score will be returned only when the scorer fails to find or
89   // calculate a score for the document.
90   float ComputeScore(const DocHitInfoIterator *query_it,
91                      const DocHitInfo &hit_info, double default_score);
92 
93  private:
94   // Compact ID for each query term.
95   using TermId = uint16_t;
96 
97   // Compact representation of <CorpusId, TermId> for use as a key in a
98   // hash_map.
99   struct CorpusTermInfo {
100     // Layout bits: 16 bit padding + 32 bit CorpusId + 16 bit TermId
101     using Value = uint64_t;
102 
103     Value value;
104 
105     static constexpr int kCorpusIdBits = sizeof(CorpusId) * 8;
106     static constexpr int kTermIdBits = sizeof(TermId) * 8;
107 
CorpusTermInfoCorpusTermInfo108     explicit CorpusTermInfo(CorpusId corpus_id, TermId term_id) : value(0) {
109       BITFIELD_OR(value, kTermIdBits, kCorpusIdBits,
110                   static_cast<uint64_t>(corpus_id));
111       BITFIELD_OR(value, 0, kTermIdBits, term_id);
112     }
113 
114     bool operator==(const CorpusTermInfo &other) const {
115       return value == other.value;
116     }
117   };
118 
119   // Returns idf weight for the term and provided corpus.
120   float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id);
121 
122   // Returns the average document length for the corpus. The average is
123   // calculated as the sum of tokens in the corpus' documents over the total
124   // number of documents plus one.
125   float GetCorpusAvgDocLength(CorpusId corpus_id);
126 
127   // Returns the normalized term frequency for the term match and document hit.
128   // This normalizes the term frequency by applying smoothing parameters and
129   // factoring document length.
130   float ComputedNormalizedTermFrequency(
131       const TermMatchInfo &term_match_info, const DocHitInfo &hit_info,
132       const DocumentAssociatedScoreData &data);
133 
134   // Returns the weighted term frequency for the term match and document. For
135   // each section the term is present, we scale the term frequency by its
136   // section weight. We return the sum of the weighted term frequencies over all
137   // sections.
138   float ComputeTermFrequencyForMatchedSections(
139       CorpusId corpus_id, const TermMatchInfo &term_match_info,
140       DocumentId document_id) const;
141 
142   // Returns the schema type id for the document by retrieving it from the
143   // DocumentFilterData.
144   SchemaTypeId GetSchemaTypeId(DocumentId document_id) const;
145 
146   // Clears cached scoring data and prepares the calculator for a new scoring
147   // run.
148   void Clear();
149 
150   const DocumentStore *document_store_;  // Does not own.
151 
152   // Used for accessing normalized section weights when computing the weighted
153   // term frequency.
154   SectionWeights &section_weights_;
155 
156   // Map from query term to compact term ID.
157   // Necessary as a key to the other maps.
158   // The use of the string_view as key here means that the query_term_iterators
159   // map must outlive the bm25f
160   std::unordered_map<std::string_view, TermId> term_id_map_;
161 
162   // Map from corpus ID to average document length (avgdl).
163   // Necessary to calculate the normalized term frequency.
164   // This information is cached in the DocumentStore::CorpusScoreCache
165   std::unordered_map<CorpusId, float> corpus_avgdl_map_;
166   // Map from <corpus ID, term ID> to number of documents containing term q_i,
167   // called n(q_i).
168   // Necessary to calculate IDF(q_i) (inverse document frequency).
169   // This information must be calculated by iterating through the hits for these
170   // terms.
171   std::unordered_map<CorpusTermInfo::Value, uint32_t> corpus_nqi_map_;
172 
173   // Map from <corpus ID, term ID> to IDF(q_i) (inverse document frequency).
174   std::unordered_map<CorpusTermInfo::Value, float> corpus_idf_map_;
175 
176   int64_t current_time_ms_;
177 };
178 
179 }  // namespace lib
180 }  // namespace icing
181 
182 #endif  // ICING_SCORING_BM25F_CALCULATOR_H_
183