xref: /aosp_15_r20/external/icing/icing/scoring/bm25f-calculator.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/scoring/bm25f-calculator.h"
16 
17 #include <cmath>
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <vector>
24 
25 #include "icing/index/hit/doc-hit-info.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/index/iterator/doc-hit-info-iterator.h"
28 #include "icing/schema/section.h"
29 #include "icing/scoring/section-weights.h"
30 #include "icing/store/corpus-associated-scoring-data.h"
31 #include "icing/store/corpus-id.h"
32 #include "icing/store/document-associated-score-data.h"
33 #include "icing/store/document-filter-data.h"
34 #include "icing/store/document-id.h"
35 #include "icing/store/document-store.h"
36 #include "icing/util/logging.h"
37 
38 namespace icing {
39 namespace lib {
40 
// Smoothing parameter k1: determines the relevance of higher term frequency
// documents. The higher k1, the higher their relevance (term frequency
// saturates more slowly in the normalized-TF formula below). 1.2 is the
// default value in the BM25F literature and works well in most corpora.
constexpr float k1_ = 1.2f;
// Smoothing parameter b: determines the weight of the document length on the
// final score. The higher b, the higher the influence of the document length
// relative to the corpus average. 0.7 is the default value in the BM25F
// literature and works well in most corpora.
constexpr float b_ = 0.7f;
50 
51 // TODO(b/158603900): add tests for Bm25fCalculator
// Constructs a calculator over the given document store and section weights.
// `section_weights` must be non-null: it is dereferenced here and held as a
// reference for the calculator's lifetime. `document_store` is likewise
// borrowed, not owned; both must outlive this instance. `current_time_ms` is
// used for document-aliveness checks when resolving schema types.
Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store,
                                 SectionWeights* section_weights,
                                 int64_t current_time_ms)
    : document_store_(document_store),
      section_weights_(*section_weights),
      current_time_ms_(current_time_ms) {}
58 
59 // During initialization, Bm25fCalculator iterates through
60 // hit-iterators for each query term to pre-compute n(q_i) for each corpus under
61 // consideration.
PrepareToScore(std::unordered_map<std::string,std::unique_ptr<DocHitInfoIterator>> * query_term_iterators)62 void Bm25fCalculator::PrepareToScore(
63     std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
64         query_term_iterators) {
65   Clear();
66   TermId term_id = 0;
67   for (auto& iter : *query_term_iterators) {
68     const std::string& term = iter.first;
69     if (term_id_map_.find(term) != term_id_map_.end()) {
70       continue;
71     }
72     term_id_map_[term] = ++term_id;
73     DocHitInfoIterator* term_it = iter.second.get();
74 
75     while (term_it->Advance().ok()) {
76       auto status_or = document_store_->GetDocumentAssociatedScoreData(
77           term_it->doc_hit_info().document_id());
78       if (!status_or.ok()) {
79         ICING_LOG(ERROR) << "No document score data";
80         continue;
81       }
82       DocumentAssociatedScoreData data = status_or.ValueOrDie();
83       CorpusId corpus_id = data.corpus_id();
84       CorpusTermInfo corpus_term_info(corpus_id, term_id);
85       corpus_nqi_map_[corpus_term_info.value]++;
86     }
87   }
88 }
89 
Clear()90 void Bm25fCalculator::Clear() {
91   term_id_map_.clear();
92   corpus_avgdl_map_.clear();
93   corpus_nqi_map_.clear();
94   corpus_idf_map_.clear();
95 }
96 
97 // Computes BM25F relevance score for query terms matched in document D.
98 //
99 // BM25F = \sum_i IDF(q_i) * tf(q_i, D)
100 //
101 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
102 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
103 // normalized term frequency of query term q_i in the document D.
ComputeScore(const DocHitInfoIterator * query_it,const DocHitInfo & hit_info,double default_score)104 float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it,
105                                     const DocHitInfo& hit_info,
106                                     double default_score) {
107   auto status_or =
108       document_store_->GetDocumentAssociatedScoreData(hit_info.document_id());
109   if (!status_or.ok()) {
110     ICING_LOG(ERROR) << "No document score data";
111     return default_score;
112   }
113   DocumentAssociatedScoreData data = status_or.ValueOrDie();
114   std::vector<TermMatchInfo> matched_terms_stats;
115   query_it->PopulateMatchedTermsStats(&matched_terms_stats);
116 
117   float score = 0;
118   for (const TermMatchInfo& term_match_info : matched_terms_stats) {
119     float idf_weight =
120         GetCorpusIdfWeightForTerm(term_match_info.term, data.corpus_id());
121     float normalized_tf =
122         ComputedNormalizedTermFrequency(term_match_info, hit_info, data);
123     score += idf_weight * normalized_tf;
124   }
125 
126   ICING_VLOG(1) << "BM25F: corpus_id:" << data.corpus_id()
127                 << " docid:" << hit_info.document_id() << " score:" << score;
128   return score;
129 }
130 
131 // Compute inverse document frequency (IDF) weight for query term in the given
132 // corpus, and cache it in the map.
133 //
134 //                    N - n(q_i) + 0.5
135 // IDF(q_i) = ln(1 + ------------------)
136 //                      n(q_i) + 0.5
137 //
138 // where N is the number of documents in the corpus, and n(q_i) is the number
139 // of documents in the corpus containing the query term q_i.
GetCorpusIdfWeightForTerm(std::string_view term,CorpusId corpus_id)140 float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
141                                                  CorpusId corpus_id) {
142   TermId term_id = term_id_map_[term];
143 
144   CorpusTermInfo corpus_term_info(corpus_id, term_id);
145   auto iter = corpus_idf_map_.find(corpus_term_info.value);
146   if (iter != corpus_idf_map_.end()) {
147     return iter->second;
148   }
149 
150   // First, figure out corpus scoring data.
151   auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
152   if (!status_or.ok()) {
153     ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
154     return 0;
155   }
156   CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
157 
158   uint32_t num_docs = csdata.num_docs();
159   uint32_t nqi = corpus_nqi_map_[corpus_term_info.value];
160   if (nqi > num_docs) {
161     ICING_LOG(ERROR) << "nqi > num_docs when calculating idf for corpus "
162                      << corpus_id << " term " << term;
163     return 0;
164   }
165   float idf =
166       nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
167   corpus_idf_map_.insert({corpus_term_info.value, idf});
168   ICING_VLOG(1) << "corpus_id:" << corpus_id << " term:" << term
169                 << " N:" << num_docs << "nqi:" << nqi << " idf:" << idf;
170   return idf;
171 }
172 
173 // Get per corpus average document length and cache the result in the map.
174 // The average doc length is calculated as:
175 //
176 //                    total_tokens_in_corpus
177 // Avg Doc Length =  -------------------------
178 //                    num_docs_in_corpus + 1
GetCorpusAvgDocLength(CorpusId corpus_id)179 float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) {
180   auto iter = corpus_avgdl_map_.find(corpus_id);
181   if (iter != corpus_avgdl_map_.end()) {
182     return iter->second;
183   }
184 
185   // First, figure out corpus scoring data.
186   auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
187   if (!status_or.ok()) {
188     ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
189     return 0;
190   }
191   CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
192 
193   corpus_avgdl_map_[corpus_id] = csdata.average_doc_length_in_tokens();
194   return csdata.average_doc_length_in_tokens();
195 }
196 
197 // Computes normalized term frequency for query term q_i in document D.
198 //
199 //                            f(q_i, D) * (k1 + 1)
200 // Normalized TF = --------------------------------------------
201 //                 f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
202 //
203 // where f(q_i, D) is the frequency of query term q_i in document D,
204 // |D| is the #tokens in D, avgdl is the average document length in the corpus,
205 // k1 and b are smoothing parameters.
ComputedNormalizedTermFrequency(const TermMatchInfo & term_match_info,const DocHitInfo & hit_info,const DocumentAssociatedScoreData & data)206 float Bm25fCalculator::ComputedNormalizedTermFrequency(
207     const TermMatchInfo& term_match_info, const DocHitInfo& hit_info,
208     const DocumentAssociatedScoreData& data) {
209   uint32_t dl = data.length_in_tokens();
210   float avgdl = GetCorpusAvgDocLength(data.corpus_id());
211   float f_q = ComputeTermFrequencyForMatchedSections(
212       data.corpus_id(), term_match_info, hit_info.document_id());
213   float normalized_tf =
214       f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl));
215 
216   ICING_VLOG(1) << "corpus_id:" << data.corpus_id()
217                 << " docid:" << hit_info.document_id() << " dl:" << dl
218                 << " avgdl:" << avgdl << " f_q:" << f_q
219                 << " norm_tf:" << normalized_tf;
220   return normalized_tf;
221 }
222 
ComputeTermFrequencyForMatchedSections(CorpusId corpus_id,const TermMatchInfo & term_match_info,DocumentId document_id) const223 float Bm25fCalculator::ComputeTermFrequencyForMatchedSections(
224     CorpusId corpus_id, const TermMatchInfo& term_match_info,
225     DocumentId document_id) const {
226   float sum = 0.0f;
227   SectionIdMask sections = term_match_info.section_ids_mask;
228   SchemaTypeId schema_type_id = GetSchemaTypeId(document_id);
229 
230   while (sections != 0) {
231     SectionId section_id = __builtin_ctzll(sections);
232     sections &= ~(UINT64_C(1) << section_id);
233 
234     Hit::TermFrequency tf = term_match_info.term_frequencies[section_id];
235     double weighted_tf = tf * section_weights_.GetNormalizedSectionWeight(
236                                   schema_type_id, section_id);
237     if (tf != Hit::kNoTermFrequency) {
238       sum += weighted_tf;
239     }
240   }
241   return sum;
242 }
243 
GetSchemaTypeId(DocumentId document_id) const244 SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const {
245   auto filter_data_optional = document_store_->GetAliveDocumentFilterData(
246       document_id, current_time_ms_);
247   if (!filter_data_optional) {
248     // This should never happen. The only failure case for
249     // GetAliveDocumentFilterData is if the document_id is outside of the range
250     // of allocated document_ids, which shouldn't be possible since we're
251     // getting this document_id from the posting lists.
252     ICING_LOG(WARNING) << "No document filter data for document ["
253                        << document_id << "]";
254     return kInvalidSchemaTypeId;
255   }
256   return filter_data_optional.value().schema_type_id();
257 }
258 
259 }  // namespace lib
260 }  // namespace icing
261