1 // Copyright (C) 2021 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCORING_BM25F_CALCULATOR_H_ 16 #define ICING_SCORING_BM25F_CALCULATOR_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <unordered_map> 23 24 #include "icing/index/hit/doc-hit-info.h" 25 #include "icing/index/iterator/doc-hit-info-iterator.h" 26 #include "icing/legacy/index/icing-bit-util.h" 27 #include "icing/scoring/section-weights.h" 28 #include "icing/store/corpus-id.h" 29 #include "icing/store/document-associated-score-data.h" 30 #include "icing/store/document-filter-data.h" 31 #include "icing/store/document-id.h" 32 #include "icing/store/document-store.h" 33 34 namespace icing { 35 namespace lib { 36 37 // Bm25fCalculator encapsulates the logic to compute BM25F term-weight based 38 // ranking function. 39 // 40 // The formula to compute BM25F is as follows: 41 // 42 // BM25F = \sum_i IDF(q_i) * tf(q_i, D) 43 // 44 // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query 45 // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and 46 // normalized term frequency of query term q_i in the document D. 47 // 48 // IDF(q_i) is computed as follows: 49 // 50 // N - n(q_i) + 0.5 51 // IDF(q_i) = log(1 + ------------------) 52 // n(q_i) + 0.5 53 // 54 // where N is the number of documents in the corpus, and n(q_i) is the number 55 // of documents in the corpus containing the query term q_i. 56 // 57 // Lastly, tf(q_i, D) is computed as follows: 58 // 59 // f(q_i, D) * (k1 + 1) 60 // Normalized TF = -------------------------------------------- 61 // f(q_i, D) + k1 * (1 - b + b * |D| / avgdl) 62 // 63 // where f(q_i, D) is the frequency of query term q_i in document D, 64 // |D| is the #tokens in D, avgdl is the average document length in the corpus, 65 // k1 and b are smoothing parameters. 66 // 67 // see: go/icing-bm25f 68 // see: glossary/bm25 69 class Bm25fCalculator { 70 public: 71 explicit Bm25fCalculator(const DocumentStore *document_store, 72 SectionWeights *section_weights, 73 int64_t current_time_ms); 74 75 // Precompute and cache statistics relevant to BM25F. 76 // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other 77 // results. 78 // The query_term_iterators map is used to build the 79 // std::unordered_map<std::string_view, TermId> term_id_map_. It must 80 // outlive the bm25f-calculator otherwise the string_view key in term_id_map_, 81 // used later to compute a document score, will be meaningless. 82 void PrepareToScore( 83 std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> 84 *query_term_iterators); 85 86 // Compute the BM25F relevance score for the given hit, represented by 87 // DocHitInfo. 88 // The default score will be returned only when the scorer fails to find or 89 // calculate a score for the document. 90 float ComputeScore(const DocHitInfoIterator *query_it, 91 const DocHitInfo &hit_info, double default_score); 92 93 private: 94 // Compact ID for each query term. 95 using TermId = uint16_t; 96 97 // Compact representation of <CorpusId, TermId> for use as a key in a 98 // hash_map. 99 struct CorpusTermInfo { 100 // Layout bits: 16 bit padding + 32 bit CorpusId + 16 bit TermId 101 using Value = uint64_t; 102 103 Value value; 104 105 static constexpr int kCorpusIdBits = sizeof(CorpusId) * 8; 106 static constexpr int kTermIdBits = sizeof(TermId) * 8; 107 CorpusTermInfoCorpusTermInfo108 explicit CorpusTermInfo(CorpusId corpus_id, TermId term_id) : value(0) { 109 BITFIELD_OR(value, kTermIdBits, kCorpusIdBits, 110 static_cast<uint64_t>(corpus_id)); 111 BITFIELD_OR(value, 0, kTermIdBits, term_id); 112 } 113 114 bool operator==(const CorpusTermInfo &other) const { 115 return value == other.value; 116 } 117 }; 118 119 // Returns idf weight for the term and provided corpus. 120 float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id); 121 122 // Returns the average document length for the corpus. The average is 123 // calculated as the sum of tokens in the corpus' documents over the total 124 // number of documents plus one. 125 float GetCorpusAvgDocLength(CorpusId corpus_id); 126 127 // Returns the normalized term frequency for the term match and document hit. 128 // This normalizes the term frequency by applying smoothing parameters and 129 // factoring document length. 130 float ComputedNormalizedTermFrequency( 131 const TermMatchInfo &term_match_info, const DocHitInfo &hit_info, 132 const DocumentAssociatedScoreData &data); 133 134 // Returns the weighted term frequency for the term match and document. For 135 // each section the term is present, we scale the term frequency by its 136 // section weight. We return the sum of the weighted term frequencies over all 137 // sections. 138 float ComputeTermFrequencyForMatchedSections( 139 CorpusId corpus_id, const TermMatchInfo &term_match_info, 140 DocumentId document_id) const; 141 142 // Returns the schema type id for the document by retrieving it from the 143 // DocumentFilterData. 144 SchemaTypeId GetSchemaTypeId(DocumentId document_id) const; 145 146 // Clears cached scoring data and prepares the calculator for a new scoring 147 // run. 148 void Clear(); 149 150 const DocumentStore *document_store_; // Does not own. 151 152 // Used for accessing normalized section weights when computing the weighted 153 // term frequency. 154 SectionWeights §ion_weights_; 155 156 // Map from query term to compact term ID. 157 // Necessary as a key to the other maps. 158 // The use of the string_view as key here means that the query_term_iterators 159 // map must outlive the bm25f 160 std::unordered_map<std::string_view, TermId> term_id_map_; 161 162 // Map from corpus ID to average document length (avgdl). 163 // Necessary to calculate the normalized term frequency. 164 // This information is cached in the DocumentStore::CorpusScoreCache 165 std::unordered_map<CorpusId, float> corpus_avgdl_map_; 166 // Map from <corpus ID, term ID> to number of documents containing term q_i, 167 // called n(q_i). 168 // Necessary to calculate IDF(q_i) (inverse document frequency). 169 // This information must be calculated by iterating through the hits for these 170 // terms. 171 std::unordered_map<CorpusTermInfo::Value, uint32_t> corpus_nqi_map_; 172 173 // Map from <corpus ID, term ID> to IDF(q_i) (inverse document frequency). 174 std::unordered_map<CorpusTermInfo::Value, float> corpus_idf_map_; 175 176 int64_t current_time_ms_; 177 }; 178 179 } // namespace lib 180 } // namespace icing 181 182 #endif // ICING_SCORING_BM25F_CALCULATOR_H_ 183