1// Copyright 2020 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package classifier 16 17type frequencyTable struct { 18 counts map[tokenID]int // key: token ID, value: number of instances of that token 19} 20 21func newFrequencyTable() *frequencyTable { 22 return &frequencyTable{ 23 counts: make(map[tokenID]int), 24 } 25} 26 27func (f *frequencyTable) update(d *indexedDocument) { 28 for _, tok := range d.Tokens { 29 f.counts[tok.ID]++ 30 } 31} 32 33func (d *indexedDocument) generateFrequencies() { 34 d.f = newFrequencyTable() 35 d.f.update(d) 36} 37 38// TokenSimilarity returns a confidence score of how well d contains 39// the tokens of o. This is used as a fast similarity metric to 40// avoid running more expensive classifiers. 41func (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 { 42 hits := 0 43 // For each token in the source document, see if the target has "enough" instances 44 // of that token to possibly be a match to the target. 45 // We count up all the matches, and divide by the total number of unique source 46 // tokens to get a similarity metric. 1.0 means that all the tokens in the target 47 // are present in the source in appropriate quantities. If the value here is lower 48 // than the desired matching threshold, the target can't possibly match the source. 49 // Profiling indicates a significant amount of time is spent here. 50 // Avoiding checking (or storing) "uninteresting" tokens (common English words) 51 // could help. 52 for t, c := range o.f.counts { 53 if d.f.counts[t] >= c { 54 hits++ 55 } 56 } 57 58 return float64(hits) / float64(len(o.f.counts)) 59} 60