1 // Copyright (C) 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/scoring/bm25f-calculator.h"
16
17 #include <cmath>
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <vector>
24
25 #include "icing/index/hit/doc-hit-info.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/index/iterator/doc-hit-info-iterator.h"
28 #include "icing/schema/section.h"
29 #include "icing/scoring/section-weights.h"
30 #include "icing/store/corpus-associated-scoring-data.h"
31 #include "icing/store/corpus-id.h"
32 #include "icing/store/document-associated-score-data.h"
33 #include "icing/store/document-filter-data.h"
34 #include "icing/store/document-id.h"
35 #include "icing/store/document-store.h"
36 #include "icing/util/logging.h"
37
38 namespace icing {
39 namespace lib {
40
// BM25F smoothing parameter k1: controls how quickly the contribution of
// repeated term occurrences saturates. Larger k1 gives higher-frequency
// documents more relevance. 1.2 is the standard default from the BM25F
// literature and performs well on most corpora.
constexpr float k1_ = 1.2f;
// BM25F smoothing parameter b: controls how strongly document length
// normalizes the score. Larger b penalizes long documents more. 0.7 is the
// standard default from the BM25F literature and performs well on most
// corpora.
constexpr float b_ = 0.7f;
50
51 // TODO(b/158603900): add tests for Bm25fCalculator
Bm25fCalculator(const DocumentStore * document_store,SectionWeights * section_weights,int64_t current_time_ms)52 Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store,
53 SectionWeights* section_weights,
54 int64_t current_time_ms)
55 : document_store_(document_store),
56 section_weights_(*section_weights),
57 current_time_ms_(current_time_ms) {}
58
59 // During initialization, Bm25fCalculator iterates through
60 // hit-iterators for each query term to pre-compute n(q_i) for each corpus under
61 // consideration.
PrepareToScore(std::unordered_map<std::string,std::unique_ptr<DocHitInfoIterator>> * query_term_iterators)62 void Bm25fCalculator::PrepareToScore(
63 std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
64 query_term_iterators) {
65 Clear();
66 TermId term_id = 0;
67 for (auto& iter : *query_term_iterators) {
68 const std::string& term = iter.first;
69 if (term_id_map_.find(term) != term_id_map_.end()) {
70 continue;
71 }
72 term_id_map_[term] = ++term_id;
73 DocHitInfoIterator* term_it = iter.second.get();
74
75 while (term_it->Advance().ok()) {
76 auto status_or = document_store_->GetDocumentAssociatedScoreData(
77 term_it->doc_hit_info().document_id());
78 if (!status_or.ok()) {
79 ICING_LOG(ERROR) << "No document score data";
80 continue;
81 }
82 DocumentAssociatedScoreData data = status_or.ValueOrDie();
83 CorpusId corpus_id = data.corpus_id();
84 CorpusTermInfo corpus_term_info(corpus_id, term_id);
85 corpus_nqi_map_[corpus_term_info.value]++;
86 }
87 }
88 }
89
Clear()90 void Bm25fCalculator::Clear() {
91 term_id_map_.clear();
92 corpus_avgdl_map_.clear();
93 corpus_nqi_map_.clear();
94 corpus_idf_map_.clear();
95 }
96
97 // Computes BM25F relevance score for query terms matched in document D.
98 //
99 // BM25F = \sum_i IDF(q_i) * tf(q_i, D)
100 //
101 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
102 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
103 // normalized term frequency of query term q_i in the document D.
ComputeScore(const DocHitInfoIterator * query_it,const DocHitInfo & hit_info,double default_score)104 float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it,
105 const DocHitInfo& hit_info,
106 double default_score) {
107 auto status_or =
108 document_store_->GetDocumentAssociatedScoreData(hit_info.document_id());
109 if (!status_or.ok()) {
110 ICING_LOG(ERROR) << "No document score data";
111 return default_score;
112 }
113 DocumentAssociatedScoreData data = status_or.ValueOrDie();
114 std::vector<TermMatchInfo> matched_terms_stats;
115 query_it->PopulateMatchedTermsStats(&matched_terms_stats);
116
117 float score = 0;
118 for (const TermMatchInfo& term_match_info : matched_terms_stats) {
119 float idf_weight =
120 GetCorpusIdfWeightForTerm(term_match_info.term, data.corpus_id());
121 float normalized_tf =
122 ComputedNormalizedTermFrequency(term_match_info, hit_info, data);
123 score += idf_weight * normalized_tf;
124 }
125
126 ICING_VLOG(1) << "BM25F: corpus_id:" << data.corpus_id()
127 << " docid:" << hit_info.document_id() << " score:" << score;
128 return score;
129 }
130
131 // Compute inverse document frequency (IDF) weight for query term in the given
132 // corpus, and cache it in the map.
133 //
134 // N - n(q_i) + 0.5
135 // IDF(q_i) = ln(1 + ------------------)
136 // n(q_i) + 0.5
137 //
138 // where N is the number of documents in the corpus, and n(q_i) is the number
139 // of documents in the corpus containing the query term q_i.
GetCorpusIdfWeightForTerm(std::string_view term,CorpusId corpus_id)140 float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
141 CorpusId corpus_id) {
142 TermId term_id = term_id_map_[term];
143
144 CorpusTermInfo corpus_term_info(corpus_id, term_id);
145 auto iter = corpus_idf_map_.find(corpus_term_info.value);
146 if (iter != corpus_idf_map_.end()) {
147 return iter->second;
148 }
149
150 // First, figure out corpus scoring data.
151 auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
152 if (!status_or.ok()) {
153 ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
154 return 0;
155 }
156 CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
157
158 uint32_t num_docs = csdata.num_docs();
159 uint32_t nqi = corpus_nqi_map_[corpus_term_info.value];
160 if (nqi > num_docs) {
161 ICING_LOG(ERROR) << "nqi > num_docs when calculating idf for corpus "
162 << corpus_id << " term " << term;
163 return 0;
164 }
165 float idf =
166 nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
167 corpus_idf_map_.insert({corpus_term_info.value, idf});
168 ICING_VLOG(1) << "corpus_id:" << corpus_id << " term:" << term
169 << " N:" << num_docs << "nqi:" << nqi << " idf:" << idf;
170 return idf;
171 }
172
173 // Get per corpus average document length and cache the result in the map.
174 // The average doc length is calculated as:
175 //
176 // total_tokens_in_corpus
177 // Avg Doc Length = -------------------------
178 // num_docs_in_corpus + 1
GetCorpusAvgDocLength(CorpusId corpus_id)179 float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) {
180 auto iter = corpus_avgdl_map_.find(corpus_id);
181 if (iter != corpus_avgdl_map_.end()) {
182 return iter->second;
183 }
184
185 // First, figure out corpus scoring data.
186 auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
187 if (!status_or.ok()) {
188 ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
189 return 0;
190 }
191 CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
192
193 corpus_avgdl_map_[corpus_id] = csdata.average_doc_length_in_tokens();
194 return csdata.average_doc_length_in_tokens();
195 }
196
197 // Computes normalized term frequency for query term q_i in document D.
198 //
199 // f(q_i, D) * (k1 + 1)
200 // Normalized TF = --------------------------------------------
201 // f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
202 //
203 // where f(q_i, D) is the frequency of query term q_i in document D,
204 // |D| is the #tokens in D, avgdl is the average document length in the corpus,
205 // k1 and b are smoothing parameters.
ComputedNormalizedTermFrequency(const TermMatchInfo & term_match_info,const DocHitInfo & hit_info,const DocumentAssociatedScoreData & data)206 float Bm25fCalculator::ComputedNormalizedTermFrequency(
207 const TermMatchInfo& term_match_info, const DocHitInfo& hit_info,
208 const DocumentAssociatedScoreData& data) {
209 uint32_t dl = data.length_in_tokens();
210 float avgdl = GetCorpusAvgDocLength(data.corpus_id());
211 float f_q = ComputeTermFrequencyForMatchedSections(
212 data.corpus_id(), term_match_info, hit_info.document_id());
213 float normalized_tf =
214 f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl));
215
216 ICING_VLOG(1) << "corpus_id:" << data.corpus_id()
217 << " docid:" << hit_info.document_id() << " dl:" << dl
218 << " avgdl:" << avgdl << " f_q:" << f_q
219 << " norm_tf:" << normalized_tf;
220 return normalized_tf;
221 }
222
ComputeTermFrequencyForMatchedSections(CorpusId corpus_id,const TermMatchInfo & term_match_info,DocumentId document_id) const223 float Bm25fCalculator::ComputeTermFrequencyForMatchedSections(
224 CorpusId corpus_id, const TermMatchInfo& term_match_info,
225 DocumentId document_id) const {
226 float sum = 0.0f;
227 SectionIdMask sections = term_match_info.section_ids_mask;
228 SchemaTypeId schema_type_id = GetSchemaTypeId(document_id);
229
230 while (sections != 0) {
231 SectionId section_id = __builtin_ctzll(sections);
232 sections &= ~(UINT64_C(1) << section_id);
233
234 Hit::TermFrequency tf = term_match_info.term_frequencies[section_id];
235 double weighted_tf = tf * section_weights_.GetNormalizedSectionWeight(
236 schema_type_id, section_id);
237 if (tf != Hit::kNoTermFrequency) {
238 sum += weighted_tf;
239 }
240 }
241 return sum;
242 }
243
GetSchemaTypeId(DocumentId document_id) const244 SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const {
245 auto filter_data_optional = document_store_->GetAliveDocumentFilterData(
246 document_id, current_time_ms_);
247 if (!filter_data_optional) {
248 // This should never happen. The only failure case for
249 // GetAliveDocumentFilterData is if the document_id is outside of the range
250 // of allocated document_ids, which shouldn't be possible since we're
251 // getting this document_id from the posting lists.
252 ICING_LOG(WARNING) << "No document filter data for document ["
253 << document_id << "]";
254 return kInvalidSchemaTypeId;
255 }
256 return filter_data_optional.value().schema_type_id();
257 }
258
259 } // namespace lib
260 } // namespace icing
261