1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouchepackage classifier 16*46c4c49dSIbrahim Kanouche 17*46c4c49dSIbrahim Kanouchetype frequencyTable struct { 18*46c4c49dSIbrahim Kanouche counts map[tokenID]int // key: token ID, value: number of instances of that token 19*46c4c49dSIbrahim Kanouche} 20*46c4c49dSIbrahim Kanouche 21*46c4c49dSIbrahim Kanouchefunc newFrequencyTable() *frequencyTable { 22*46c4c49dSIbrahim Kanouche return &frequencyTable{ 23*46c4c49dSIbrahim Kanouche counts: make(map[tokenID]int), 24*46c4c49dSIbrahim Kanouche } 25*46c4c49dSIbrahim Kanouche} 26*46c4c49dSIbrahim Kanouche 27*46c4c49dSIbrahim Kanouchefunc (f *frequencyTable) update(d *indexedDocument) { 28*46c4c49dSIbrahim Kanouche for _, tok := range d.Tokens { 29*46c4c49dSIbrahim Kanouche f.counts[tok.ID]++ 30*46c4c49dSIbrahim Kanouche } 31*46c4c49dSIbrahim Kanouche} 32*46c4c49dSIbrahim Kanouche 33*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) generateFrequencies() { 34*46c4c49dSIbrahim Kanouche d.f = newFrequencyTable() 35*46c4c49dSIbrahim Kanouche d.f.update(d) 36*46c4c49dSIbrahim Kanouche} 37*46c4c49dSIbrahim Kanouche 38*46c4c49dSIbrahim Kanouche// TokenSimilarity returns a confidence score of how well d contains 39*46c4c49dSIbrahim Kanouche// the tokens of o. This is used as a fast similarity metric to 40*46c4c49dSIbrahim Kanouche// avoid running more expensive classifiers. 41*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 { 42*46c4c49dSIbrahim Kanouche hits := 0 43*46c4c49dSIbrahim Kanouche // For each token in the source document, see if the target has "enough" instances 44*46c4c49dSIbrahim Kanouche // of that token to possibly be a match to the target. 45*46c4c49dSIbrahim Kanouche // We count up all the matches, and divide by the total number of unique source 46*46c4c49dSIbrahim Kanouche // tokens to get a similarity metric. 1.0 means that all the tokens in the target 47*46c4c49dSIbrahim Kanouche // are present in the source in appropriate quantities. If the value here is lower 48*46c4c49dSIbrahim Kanouche // than the desired matching threshold, the target can't possibly match the source. 49*46c4c49dSIbrahim Kanouche // Profiling indicates a significant amount of time is spent here. 50*46c4c49dSIbrahim Kanouche // Avoiding checking (or storing) "uninteresting" tokens (common English words) 51*46c4c49dSIbrahim Kanouche // could help. 52*46c4c49dSIbrahim Kanouche for t, c := range o.f.counts { 53*46c4c49dSIbrahim Kanouche if d.f.counts[t] >= c { 54*46c4c49dSIbrahim Kanouche hits++ 55*46c4c49dSIbrahim Kanouche } 56*46c4c49dSIbrahim Kanouche } 57*46c4c49dSIbrahim Kanouche 58*46c4c49dSIbrahim Kanouche return float64(hits) / float64(len(o.f.counts)) 59*46c4c49dSIbrahim Kanouche} 60