xref: /aosp_15_r20/external/licenseclassifier/v2/frequencies.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouchepackage classifier
16*46c4c49dSIbrahim Kanouche
17*46c4c49dSIbrahim Kanouchetype frequencyTable struct {
18*46c4c49dSIbrahim Kanouche	counts map[tokenID]int // key: token ID, value: number of instances of that token
19*46c4c49dSIbrahim Kanouche}
20*46c4c49dSIbrahim Kanouche
21*46c4c49dSIbrahim Kanouchefunc newFrequencyTable() *frequencyTable {
22*46c4c49dSIbrahim Kanouche	return &frequencyTable{
23*46c4c49dSIbrahim Kanouche		counts: make(map[tokenID]int),
24*46c4c49dSIbrahim Kanouche	}
25*46c4c49dSIbrahim Kanouche}
26*46c4c49dSIbrahim Kanouche
27*46c4c49dSIbrahim Kanouchefunc (f *frequencyTable) update(d *indexedDocument) {
28*46c4c49dSIbrahim Kanouche	for _, tok := range d.Tokens {
29*46c4c49dSIbrahim Kanouche		f.counts[tok.ID]++
30*46c4c49dSIbrahim Kanouche	}
31*46c4c49dSIbrahim Kanouche}
32*46c4c49dSIbrahim Kanouche
33*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) generateFrequencies() {
34*46c4c49dSIbrahim Kanouche	d.f = newFrequencyTable()
35*46c4c49dSIbrahim Kanouche	d.f.update(d)
36*46c4c49dSIbrahim Kanouche}
37*46c4c49dSIbrahim Kanouche
38*46c4c49dSIbrahim Kanouche// TokenSimilarity returns a confidence score of how well d contains
39*46c4c49dSIbrahim Kanouche// the tokens of o. This is used as a fast similarity metric to
40*46c4c49dSIbrahim Kanouche// avoid running more expensive classifiers.
41*46c4c49dSIbrahim Kanouchefunc (d *indexedDocument) tokenSimilarity(o *indexedDocument) float64 {
42*46c4c49dSIbrahim Kanouche	hits := 0
43*46c4c49dSIbrahim Kanouche	// For each token in the source document, see if the target has "enough" instances
44*46c4c49dSIbrahim Kanouche	// of that token to possibly be a match to the target.
45*46c4c49dSIbrahim Kanouche	// We count up all the matches, and divide by the total number of unique source
46*46c4c49dSIbrahim Kanouche	// tokens to get a similarity metric. 1.0 means that all the tokens in the target
47*46c4c49dSIbrahim Kanouche	// are present in the source in appropriate quantities. If the value here is lower
48*46c4c49dSIbrahim Kanouche	// than the desired matching threshold, the target can't possibly match the source.
49*46c4c49dSIbrahim Kanouche	// Profiling indicates a significant amount of time is spent here.
50*46c4c49dSIbrahim Kanouche	// Avoiding checking (or storing) "uninteresting" tokens (common English words)
51*46c4c49dSIbrahim Kanouche	// could help.
52*46c4c49dSIbrahim Kanouche	for t, c := range o.f.counts {
53*46c4c49dSIbrahim Kanouche		if d.f.counts[t] >= c {
54*46c4c49dSIbrahim Kanouche			hits++
55*46c4c49dSIbrahim Kanouche		}
56*46c4c49dSIbrahim Kanouche	}
57*46c4c49dSIbrahim Kanouche
58*46c4c49dSIbrahim Kanouche	return float64(hits) / float64(len(o.f.counts))
59*46c4c49dSIbrahim Kanouche}
60