// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"hash/crc32"
	"math"
	"sort"

	"github.com/davecgh/go-spew/spew"
)

// searchSet is a set of q-grams that have hashes associated with them,
// making it fast to search for potential matches.
type searchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens []indexedToken
	// Hashes is a map of checksums to a range of tokens.
	Hashes hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenRanges
	origin         string // A debugging identifier to label what this searchset is associated with

	nodes []*node
	q     int // The length of q-grams in this searchset.
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}

// newSearchSet creates a new searchSet object. A searchset generates all
// possible q-grams of tokens. These q-grams of tokens can be correlated to
// determine where a section of text from one source may appear in another
// source.
func newSearchSet(s *indexedDocument, q int) *searchSet {
	// Start generating hash values for all q-grams within the text.
	h := make(hash)
	if len(s.Tokens) < q {
		// We can't have a q larger than the number of tokens.
		q = len(s.Tokens)
	}
	checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict)
	sset := &searchSet{
		Tokens:         s.Tokens,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
		q:              q,
	}
	sset.generateNodeList()
	return sset
}
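
// A minimal usage sketch (hypothetical; the indexedDocument would be built
// elsewhere by the classifier's tokenizer, which is not part of this file):
//
//	ss := newSearchSet(doc, 4) // index every 4-gram of doc's tokens
//	fmt.Printf("%d q-grams indexed\n", len(ss.nodes))
//
// Passing a q larger than len(doc.Tokens) is safe: newSearchSet clamps q to
// the token count before hashing.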

// tokenRange indicates the range of tokens that map to a particular checksum.
type tokenRange struct {
	Start int
	End   int
}

func (t *tokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// tokenRanges is a slice of *tokenRange.
type tokenRanges []*tokenRange

// generateHashes computes a hash using CRC-32 for each q-gram encountered in the provided tokens.
func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) {
	if q == 0 {
		return nil, nil
	}
	var css []uint32
	var tr tokenRanges
	crc := crc32.NewIEEE()
	for offset := 0; offset+q <= len(toks); offset++ {
		crc.Reset()
		for i := 0; i < q; i++ {
			crc.Write([]byte(dict.getWord(toks[offset+i].ID)))
			crc.Write([]byte{' '})
		}
		cs := crc.Sum32()
		css = append(css, cs)
		tr = append(tr, &tokenRange{offset, offset + q})
		h.add(cs, offset, offset+q)
	}

	return css, tr
}
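
// As an illustration (made-up tokens), with toks = ["apache", "license",
// "version"] and q = 2, the loop above hashes the byte sequences
// "apache license " and "license version ", recording checksums for the token
// ranges [0, 2) and [1, 3). The space written after each word keeps token
// boundaries distinct in the hashed bytes.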

// generateNodeList creates a node list out of the search set.
func (s *searchSet) generateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// matchRange is the range within the source text that is a match to the range
// in the target text.
type matchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
	// TokensClaimed tracks the number of positively matched tokens in this
	// range. For an initially created matchRange, this equals the extent of
	// the range. However, as matchRanges get merged together and error is
	// potentially introduced, this tracks the precise number of matched
	// tokens in the range.
	TokensClaimed int
}

// in returns true if m's target range is entirely enclosed within the other
// match range's target range.
func (m *matchRange) in(other *matchRange) bool {
	return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd
}

func (m *matchRange) String() string {
	return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed)
}
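
// For example, a match of source tokens [5, 9) found at target tokens
// [25, 29) with 4 tokens claimed prints as "S: [5, 9)-> T: [25, 29) => 20 [4]";
// the 20 is the alignment offset (TargetStart - SrcStart).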

// matchRanges is a list of "matchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text. They are sorted to support greedy matching with the
// longest runs of q-grams appearing first in the sort.
type matchRanges []*matchRange

func (m matchRanges) Len() int      { return len(m) }
func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchRanges) Less(i, j int) bool {
	if m[i].TokensClaimed != m[j].TokensClaimed {
		return m[i].TokensClaimed > m[j].TokensClaimed
	}

	if m[i].TargetStart != m[j].TargetStart {
		return m[i].TargetStart < m[j].TargetStart
	}
	return m[i].SrcStart < m[j].SrcStart
}
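
// For example, candidates claiming 12, 12, and 5 tokens sort with both
// 12-token ranges first (the tie broken by the smaller TargetStart, then the
// smaller SrcStart), so greedy matching consumes the strongest runs first.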

// findPotentialMatches returns the ranges in the target (unknown) text that
// are the best potential matches to the source (known) text.
func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
	matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
	}
	if len(matchedRanges) == 0 {
		return nil
	}

	// After computing all potential matches, we only output ranges that contain
	// enough tokens to clear the confidence threshold. As noted, this number can
	// be too high, yielding false positives, but cannot yield false negatives.
	threshold := int(confidence * float64(len(src.Tokens)))
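	// For example, at confidence 0.9 with a 200-token source, threshold is
	// 180: the loop below drops any candidate claiming fewer than 180 tokens,
	// relying on the sort order to truncate in one pass.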

	for i, m := range matchedRanges {
		if m.TokensClaimed < threshold {
			matchedRanges = matchedRanges[:i]
			break
		}
	}

	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("finalized matchedRanges for %s: %d = %s", src.origin, len(src.Tokens), spew.Sdump(matchedRanges))
	}
	return matchedRanges
}
// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error, to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives) for faster performance.
func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
	var claimed matchRanges
	errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))
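	// For example (illustrative numbers), a 1000-token source at confidence
	// 0.95 yields errorMargin = 50: hits whose alignment offsets differ by 50
	// or more token positions will not be fused into the same claim.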

	filter := make([]bool, targetSize)
	for _, m := range runs {
		for i := m.SrcStart; i < m.SrcEnd; i++ {
			// Only consider these offsets if they fit into the target document, since
			// the target may be smaller than the source being analyzed.
			if i < targetSize {
				filter[i] = true
			}
		}
	}

	filterDrops := 0
	filterPasses := 0

	// For each hit detected, compare it against all previous hits to see if it
	// can be part of a match or represents a group that is eligible for
	// matching and having other hits contribute to it.
	for _, m := range matched {
		off := m.TargetStart - m.SrcStart

		// If the offset is negative, but within error margins, we associate it
		// with the first index to see if it could contribute to a run. If the
		// absolute offset is larger than the error margin, it can't possibly
		// contribute and will be dropped. This puts more potential error into the zero
		// index, but that just slightly increases the rate of false positives. In
		// practice, this would only be an issue if there are major substrings of a
		// source in a target that aren't part of a real hit. We see many small
		// references (the name of a license) but not large chunks of the license.
		if off < 0 {
			if -off <= errorMargin {
				off = 0
			} else {
				continue
			}
		}

		// If the filter is false, there was not sufficient token density in that
		// part of the target document for a viable match, so this match is a
		// spurious hit and can be discarded.
		if !filter[off] {
			filterDrops++
			continue
		}

		filterPasses++
		unclaimed := true

		for _, c := range claimed {
			moff := m.TargetStart - m.SrcStart
			coff := c.TargetStart - c.SrcStart
			sampleError := int(math.Round(math.Abs(float64(moff - coff))))
			withinError := sampleError < errorMargin

			// The contribution needs to add more value than the accumulated error.
			// This guards against spurious matches of a license reference
			// incorrectly overextending the match range.
			if withinError && m.TokensClaimed > sampleError {
				if m.in(c) {
					// This can cause too many tokens to be claimed, but that's fine since we want to avoid
					// undercounting and missing content.
					c.TokensClaimed += m.TokensClaimed
					unclaimed = false
				} else {
					// See if the claims can be merged. If the error tolerances allow for it,
					// we merge the claims and the ranges. This doesn't accumulate error
					// accurately, so it's possible that repeated merges introduce too much
					// error to actually make a match, but we won't get false negatives from
					// this approach. Even when the error tolerances allow a merge, we only
					// merge if it increases the range of tokens being covered. If this match
					// is already handled by an existing (stronger by definition) claim, we
					// don't merge this one, but treat it as a new claim. This allows for the
					// case where a highly fragmented text is matched by a long series of
					// low-score matches.
					if m.TargetStart < c.TargetStart && m.SrcStart < c.SrcStart {
						c.TargetStart = m.TargetStart
						c.SrcStart = m.SrcStart
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					} else if m.TargetEnd > c.TargetEnd && m.SrcEnd > c.SrcEnd {
						c.TargetEnd = m.TargetEnd
						c.SrcEnd = m.SrcEnd
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					}
					// This claim does not extend any existing block, and it may be claimed in its own
					// right.
				}
			}
			if !unclaimed {
				break
			}
		}
		// Only create a claim if this claim is likely relevant. If we have
		// already seen significantly higher-quality hits, this one is likely
		// spurious noise; if we haven't, we'll keep it around.
		if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
			claimed = append(claimed, m)
		}
	}
	sort.Sort(claimed)
	if c.tc.traceSearchset(origin) {
		c.tc.trace("filterPasses = %+v", filterPasses)
		c.tc.trace("filterDrops = %+v", filterDrops)
		c.tc.trace("claimed = %s", spew.Sdump(claimed))
	}
	return claimed
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
	shouldTrace := c.tc.traceSearchset(src.origin)

	if shouldTrace {
		c.tc.trace("src.origin = %+v", src.origin)
	}
	// Assemble a list of all the matched q-grams without any consideration to
	// error tolerances.
	matched := targetMatchedRanges(src, target)
	if shouldTrace {
		c.tc.trace("matched = %s", spew.Sdump(matched))
	}
	if len(matched) == 0 {
		return nil
	}

	// Perform neighborhood matching to figure out which clusters of q-grams
	// have a sufficient likelihood of being a potential match to the source.
	// For a confidence threshold of X (X <= 1.0), a sequence of N target
	// tokens must contain N*X ordered source tokens to be a viable match.
	//
	// It's much easier to compute potential ranges in the target if we disregard
	// proper ordering of source tokens initially, and see which source q-grams
	// appear in sufficient quantities to be a potential match. We can then
	// disregard any q-gram that falls outside of that range. This helps
	// significantly since processing token matches is an N^2 (or worse)
	// operation, so reducing N is a big win.

	runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)

	if shouldTrace {
		c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
	}

	// If there are no target runs of source tokens, we're done.
	if len(runs) == 0 {
		return nil
	}

	// Using the runs as a filter to ignore irrelevant matches, fuse the source
	// match ranges into larger matches (with possible errors) to see if we can
	// produce large enough runs that pass the confidence threshold.

	fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
	if shouldTrace {
		c.tc.trace("fr = %s", spew.Sdump(fr))
	}
	return fr
}

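// detectRuns slides a window of subsetLength tokens across the target's hit
// mask and records every window start whose hit count meets the confidence
// target. A small worked example (made-up values): with subsetLength = 4,
// threshold = 0.75 (so target = 3) and hits = [T T F T T F], the windows at
// offsets 0 and 1 each cover three hits and are recorded, while the window at
// offset 2 covers only two and is not; consecutive recorded offsets are then
// merged into a single matchRange.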
func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
	shouldTrace := c.tc.traceSearchset(origin)
	hits := make([]bool, targetLength)
	for _, m := range matched {
		for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
			hits[idx] = true
		}
	}

	if len(hits) == 0 {
		return nil
	}
	var out []int

	total := 0
	target := int(float64(subsetLength) * threshold)
	if shouldTrace {
		c.tc.trace("target = %+v", target)
		c.tc.trace("targetLength = %+v", targetLength)
		c.tc.trace("subsetLength = %+v", subsetLength)
	}

	// If we don't have at least one full window (i.e. the target is shorter
	// than the source), just analyze what we have.
	if len(hits) < subsetLength {
		if shouldTrace {
			c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
		}
		subsetLength = len(hits)
	}
	// Initialize our sliding window value.
	for i := 0; i < subsetLength; i++ {
		if hits[i] {
			total++
		}
	}

	if total >= target {
		out = append(out, 0)
	}

	// Now move through the window adjusting the total by subtracting out the
	// last bit and adding in the new bit.
	for i := 1; i < len(hits); i++ {
		if hits[i-1] {
			total--
		}
		end := i + subsetLength - 1
		if end < len(hits) && hits[end] {
			total++
		}
		if total >= target {
			out = append(out, i)
		}
	}
	if len(out) == 0 {
		return nil
	}

	final := []matchRange{
		{
			SrcStart: out[0],
			SrcEnd:   out[0] + q,
		},
	}
	for i := 1; i < len(out); i++ {
		if out[i] != 1+out[i-1] {
			final = append(final, matchRange{
				SrcStart: out[i],
				SrcEnd:   out[i] + q,
			})
		} else {
			final[len(final)-1].SrcEnd = out[i] + q
		}
	}

	return final
}

func targetMatchedRanges(src, target *searchSet) matchRanges {
	offsetMappings := make(map[int][]*matchRange)

	var matched matchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok {
			continue
		}

		tv := tgtNode.tokens
		for _, sv := range sr {
			offset := tv.Start - sv.Start
			if om, ok := offsetMappings[offset]; ok {
				// See if this q-gram extends the most recent existing mapping at
				// this offset (the previous q-gram's range ends one token before
				// this one's does).
				lastIdx := len(om) - 1
				if om[lastIdx].TargetEnd == tv.End-1 {
					// The new value extends the run. Update the value in place.
					om[lastIdx].SrcEnd = sv.End
					om[lastIdx].TargetEnd = tv.End
					continue
				}
			}
			offsetMappings[offset] = append(offsetMappings[offset], &matchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			})
		}
	}

	// Compute the number of tokens claimed in each run and flatten into a single slice.
	for _, mr := range offsetMappings {
		for _, m := range mr {
			m.TokensClaimed = m.TargetEnd - m.TargetStart
		}
		matched = append(matched, mr...)
	}
	sort.Sort(matched)
	return matched
}
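
// As an illustration (made-up values) of the grouping above: if source
// q-grams [0, 4) and [1, 5) are found at target positions [10, 14) and
// [11, 15), both hits share alignment offset 10, so the second extends the
// first into a single matchRange {SrcStart: 0, SrcEnd: 5, TargetStart: 10,
// TargetEnd: 15} claiming 5 tokens.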

type hash map[uint32]tokenRanges

func (h hash) add(checksum uint32, start, end int) {
	h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end})
}
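
// exampleHashAdd is an illustrative sketch, not part of the original API: it
// shows how the index accumulates every token range observed for a given
// checksum, including ranges from repeated q-grams.
func exampleHashAdd() {
	h := make(hash)
	h.add(0x1234abcd, 0, 4)  // first occurrence of a q-gram
	h.add(0x1234abcd, 9, 13) // the same q-gram repeating later in the text
	for _, r := range h[0x1234abcd] {
		fmt.Printf("checksum 0x1234abcd covers tokens %s\n", r)
	}
}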