// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"fmt"
	"hash/crc32"
	"math"
	"sort"

	"github.com/davecgh/go-spew/spew"
)

// searchSet is a set of q-grams that have hashes associated with them,
// making it fast to search for potential matches.
type searchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens []indexedToken
	// Hashes is a map of checksums to a range of tokens.
	Hashes hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenRanges

	origin string // A debugging identifier labeling what this searchSet is associated with.

	nodes []*node
	q     int // The length of q-grams in this searchSet.
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}
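
// To make the fields above concrete, here is a small hypothetical sketch (not
// drawn from real data) of a searchSet over the three-token text "a b c" with
// q = 2. generateHashes below feeds each word plus a trailing space into the
// CRC, so the two q-gram checksums cover "a b " and "b c ":
//
//	Tokens:         [a b c]
//	Hashes:         {crc32("a b "): {[0, 2)}, crc32("b c "): {[1, 3)}}
//	Checksums:      [crc32("a b "), crc32("b c ")]
//	ChecksumRanges: [[0, 2), [1, 3)]
//	nodes:          [{crc32("a b "), [0, 2)}, {crc32("b c "), [1, 3)}]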

// newSearchSet creates a new searchSet object. A searchSet generates all
// possible q-grams of tokens. These q-grams of tokens can be correlated to
// determine where a section of text from one source may appear in another
// source.
func newSearchSet(s *indexedDocument, q int) *searchSet {
	// Start generating hash values for all q-grams within the text.
	h := make(hash)
	if len(s.Tokens) < q {
		// q can't be larger than the number of tokens, so shrink it to fit
		// the input.
		q = len(s.Tokens)
	}
	checksums, tokenRanges := generateHashes(h, q, s.Tokens, s.dict)
	sset := &searchSet{
		Tokens:         s.Tokens,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
		q:              q,
	}
	sset.generateNodeList()
	return sset
}

// tokenRange indicates the range of tokens that map to a particular checksum.
type tokenRange struct {
	Start int
	End   int
}

func (t *tokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// tokenRanges is a sortable slice of *tokenRange.
type tokenRanges []*tokenRange

// generateHashes computes a CRC-32 checksum for each q-gram encountered in
// the provided tokens, recording each checksum and its token range in h and
// returning them in generation order.
func generateHashes(h hash, q int, toks []indexedToken, dict *dictionary) ([]uint32, tokenRanges) {
	if q == 0 {
		return nil, nil
	}
	var css []uint32
	var tr tokenRanges
	crc := crc32.NewIEEE()
	for offset := 0; offset+q <= len(toks); offset++ {
		crc.Reset()
		for i := 0; i < q; i++ {
			crc.Write([]byte(dict.getWord(toks[offset+i].ID)))
			crc.Write([]byte{' '})
		}
		cs := crc.Sum32()
		css = append(css, cs)
		tr = append(tr, &tokenRange{offset, offset + q})
		h.add(cs, offset, offset+q)
	}

	return css, tr
}
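
// A minimal usage sketch of newSearchSet, assuming d is an *indexedDocument
// produced elsewhere in this package (d and someChecksum are illustrative
// placeholders, not real identifiers):
//
//	ss := newSearchSet(d, 4)
//	ranges := ss.Hashes[someChecksum]
//	// ranges holds every [start, end) window of 4 tokens in d whose
//	// CRC-32 checksum equals someChecksum.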

// generateNodeList creates a node list out of the search set.
func (s *searchSet) generateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// matchRange is the range within the source text that is a match to the range
// in the target text.
type matchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
	// TokensClaimed tracks the number of positively matched tokens in this
	// range. For freshly created matchRanges, this is equal to the extent of
	// the range. However, as matchRanges get merged together and error is
	// potentially introduced, this tracks the number of tokens actually
	// claimed as matching, which may be smaller than the extent of the range.
	TokensClaimed int
}

// in returns true if m's target range is entirely enclosed within the target
// range of other.
func (m *matchRange) in(other *matchRange) bool {
	return m.TargetStart >= other.TargetStart && m.TargetEnd <= other.TargetEnd
}

func (m *matchRange) String() string {
	return fmt.Sprintf("S: [%v, %v)-> T: [%v, %v) => %v [%v]", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd, m.TargetStart-m.SrcStart, m.TokensClaimed)
}
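
// For illustration, with hypothetical numbers: a matchRange printed as
// "S: [10, 14)-> T: [25, 29) => 15 [4]" says that source tokens 10..13 align
// with target tokens 25..28 at offset 25-10 == 15, claiming four tokens:
//
//	m := &matchRange{SrcStart: 10, SrcEnd: 14, TargetStart: 25, TargetEnd: 29, TokensClaimed: 4}
//	outer := &matchRange{TargetStart: 20, TargetEnd: 30}
//	m.in(outer) // true: [25, 29) lies within [20, 30)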

// matchRanges is a list of matchRange values. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text. They are sorted to support greedy matching, with
// the longest runs of q-grams appearing first in the sort.
type matchRanges []*matchRange

func (m matchRanges) Len() int      { return len(m) }
func (m matchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m matchRanges) Less(i, j int) bool {
	if m[i].TokensClaimed != m[j].TokensClaimed {
		return m[i].TokensClaimed > m[j].TokensClaimed
	}

	if m[i].TargetStart != m[j].TargetStart {
		return m[i].TargetStart < m[j].TargetStart
	}
	return m[i].SrcStart < m[j].SrcStart
}

// findPotentialMatches returns the ranges in the target (unknown) text that
// are the best potential matches to the source (known) text.
func (c *Classifier) findPotentialMatches(src, target *searchSet, confidence float64) matchRanges {
	matchedRanges := c.getMatchedRanges(src, target, confidence, src.q)
	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("matchedRanges = %s", spew.Sdump(matchedRanges))
	}
	if len(matchedRanges) == 0 {
		return nil
	}

	// After computing all potential matches, we only output ranges that contain
	// enough tokens to clear the confidence threshold. As noted, this number can
	// be too high, yielding false positives, but cannot yield false negatives.
	threshold := int(confidence * float64(len(src.Tokens)))

	for i, m := range matchedRanges {
		if m.TokensClaimed < threshold {
			matchedRanges = matchedRanges[:i]
			break
		}
	}

	if c.tc.traceSearchset(src.origin) {
		c.tc.trace("finalized matchedRanges for %s: %d = %s", src.origin, len(src.Tokens), spew.Sdump(matchedRanges))
	}
	return matchedRanges
}
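
// A worked example of the threshold above, using hypothetical numbers: at
// confidence 0.9 against a 200-token source,
//
//	threshold := int(0.9 * float64(200)) // 180
//
// so any candidate range claiming fewer than 180 tokens is dropped before the
// exact-match phase ever sees it.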

// fuseRanges analyzes the source matches, attempting to combine hits without
// errors into larger hits with tolerable amounts of error to produce matches
// that contain enough tokens to be considered for exact matching against a
// target document. This routine intentionally does not accurately track error
// contributions from merging runs, trading false positives (but not false
// negatives) for faster performance.
func (c *Classifier) fuseRanges(origin string, matched matchRanges, confidence float64, size int, runs []matchRange, targetSize int) matchRanges {
	var claimed matchRanges
	errorMargin := int(math.Round(float64(size) * (1.0 - confidence)))

	filter := make([]bool, targetSize)
	for _, m := range runs {
		for i := m.SrcStart; i < m.SrcEnd; i++ {
			// Only consider these offsets if they fit into the target document,
			// since the target may be smaller than the source being analyzed.
			if i < targetSize {
				filter[i] = true
			}
		}
	}

	filterDrops := 0
	filterPasses := 0

	// For each hit detected, compare it against all previous hits to see if it
	// can be part of a match or represents a group that is eligible for
	// matching and having other hits contribute to it.
	for _, m := range matched {
		off := m.TargetStart - m.SrcStart

		// If the offset is negative but within error margins, we associate it
		// with the first index to see if it could contribute to a run. If the
		// absolute offset is larger than the error margin, it can't possibly
		// contribute and will be dropped. This puts more potential error into
		// the zero index, but that just slightly increases the rate of false
		// positives. In practice, this would only be an issue if there are
		// major substrings of a source in a target that aren't part of a real
		// hit. We see many small references (the name of a license) but not
		// large chunks of the license.
		if off < 0 {
			if -off <= errorMargin {
				off = 0
			} else {
				continue
			}
		}
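
		// Worked example with hypothetical numbers: for a 100-token source at
		// confidence 0.9, errorMargin == int(math.Round(100 * 0.1)) == 10, so
		// a hit whose target copy starts 7 tokens before its source position
		// (off == -7) is folded into offset 0 and may still join a run, while
		// a hit with off == -15 exceeds the margin and is dropped above.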

		// If the filter is false, there was not sufficient token density in
		// that part of the target document for a viable match, so this match
		// is a spurious hit and can be discarded.
		if !filter[off] {
			filterDrops++
			continue
		}

		filterPasses++
		unclaimed := true

		for _, c := range claimed {
			moff := m.TargetStart - m.SrcStart
			coff := c.TargetStart - c.SrcStart
			sampleError := int(math.Round(math.Abs(float64(moff - coff))))
			withinError := sampleError < errorMargin

			// The contribution needs to add more value than the accumulated
			// error. This protects against spurious matches of a reference to
			// a license incorrectly overextending the match range.
			if withinError && m.TokensClaimed > sampleError {
				if m.in(c) {
					// This can cause too many tokens to be claimed, but that's
					// fine since we want to avoid undercounting and missing
					// content.
					c.TokensClaimed += m.TokensClaimed
					unclaimed = false
				} else {
					// See if the claims can be merged. If the error tolerances
					// allow for it, we merge the claims and the ranges. This
					// doesn't accumulate error accurately, so it's possible
					// that repeated merges introduce too much error to
					// actually make a match, but we won't get false negatives
					// from this approach. Even when the error tolerances allow
					// a merge, we only merge if it increases the range of
					// tokens being covered. If this match is already handled
					// by an existing (stronger by definition) claim, we don't
					// merge this one, but treat it as a new claim. This allows
					// for the case where a highly fragmented text is matched
					// by a long series of low-score matches.
					if m.TargetStart < c.TargetStart && m.SrcStart < c.SrcStart {
						c.TargetStart = m.TargetStart
						c.SrcStart = m.SrcStart
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					} else if m.TargetEnd > c.TargetEnd && m.SrcEnd > c.SrcEnd {
						c.TargetEnd = m.TargetEnd
						c.SrcEnd = m.SrcEnd
						c.TokensClaimed += m.TokensClaimed
						unclaimed = false
					}
					// This claim does not extend any existing block, and it
					// may be claimed in its own right.
				}
			}
			if !unclaimed {
				break
			}
		}
		// Only create a claim if this claim is likely relevant. If we already
		// had some higher-quality hits, this one is likely spurious noise. If
		// we haven't had any significantly better hits, we keep it around.
		if unclaimed && m.TokensClaimed*10 > matched[0].TokensClaimed {
			claimed = append(claimed, m)
		}
	}
	sort.Sort(claimed)
	if c.tc.traceSearchset(origin) {
		c.tc.trace("filterPasses = %+v", filterPasses)
		c.tc.trace("filterDrops = %+v", filterDrops)
		c.tc.trace("claimed = %s", spew.Sdump(claimed))
	}
	return claimed
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. The ranges returned are ordered from the entries with the most matched
// tokens to the least.
func (c *Classifier) getMatchedRanges(src, target *searchSet, confidence float64, q int) matchRanges {
	shouldTrace := c.tc.traceSearchset(src.origin)

	if shouldTrace {
		c.tc.trace("src.origin = %+v", src.origin)
	}
	// Assemble a list of all the matched q-grams without any consideration to
	// error tolerances.
	matched := targetMatchedRanges(src, target)
	if shouldTrace {
		c.tc.trace("matched = %s", spew.Sdump(matched))
	}
	if len(matched) == 0 {
		return nil
	}

	// Perform neighborhood matching to figure out which clusters of q-grams
	// have sufficient likelihood to be a potential match to the source. For a
	// confidence threshold of X (X <= 1.0), we require that a sequence of N
	// target tokens contain N*X ordered source tokens in order to be a viable
	// match.
	//
	// It's much easier to compute potential ranges in the target if we
	// initially disregard proper ordering of the source tokens and see which
	// source q-grams appear in sufficient quantities to be a potential match.
	// We can then disregard any q-gram that falls outside of that range. This
	// helps significantly, since processing token matches is an N^2 (or worse)
	// operation, so reducing N is a big win.
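	//
	// Concretely (hypothetical numbers): with confidence 0.8 and a 50-token
	// source, a 50-token window in the target must contain at least
	// int(50 * 0.8) == 40 hit positions for detectRuns to report it as a
	// candidate region; q-grams outside the reported runs are filtered out
	// before the quadratic fusing step.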

	runs := c.detectRuns(src.origin, matched, len(target.Tokens), len(src.Tokens), confidence, q)

	if shouldTrace {
		c.tc.trace("runs = %d: %s", len(runs), spew.Sdump(runs))
	}

	// If there are no target runs of source tokens, we're done.
	if len(runs) == 0 {
		return nil
	}

	// Using the runs as a filter to ignore irrelevant matches, fuse the source
	// match ranges into larger matches (with possible errors) to see if we can
	// produce large enough runs that pass the confidence threshold.

	fr := c.fuseRanges(src.origin, matched, confidence, len(src.Tokens), runs, len(target.Tokens))
	if shouldTrace {
		c.tc.trace("fr = %s", spew.Sdump(fr))
	}
	return fr
}

// detectRuns finds runs of contiguous positions in the target where the
// density of matched source tokens clears the confidence threshold, using a
// sliding window of subsetLength tokens.
func (c *Classifier) detectRuns(origin string, matched matchRanges, targetLength, subsetLength int, threshold float64, q int) []matchRange {
	shouldTrace := c.tc.traceSearchset(origin)
	hits := make([]bool, targetLength)
	for _, m := range matched {
		for idx := m.TargetStart; idx < m.TargetEnd; idx++ {
			hits[idx] = true
		}
	}

	if len(hits) == 0 {
		return nil
	}
	var out []int

	total := 0
	target := int(float64(subsetLength) * threshold)
	if shouldTrace {
		c.tc.trace("target = %+v", target)
		c.tc.trace("targetLength = %+v", targetLength)
		c.tc.trace("subsetLength = %+v", subsetLength)
	}

	// If we don't have at least one subset (i.e. the target is shorter than
	// the source), just analyze what we have.
	if len(hits) < subsetLength {
		if shouldTrace {
			c.tc.trace("trimmed search length from %d to %d", subsetLength, len(hits))
		}
		subsetLength = len(hits)
	}
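
	// Sliding-window sketch with hypothetical values: with hits = [T T F T],
	// subsetLength = 3, and target = 2, the initial window "T T F" totals 2,
	// so offset 0 is recorded; sliding one position subtracts hits[0] and adds
	// hits[3], giving "T F T" (total 2), so offset 1 is recorded as well. The
	// adjacent offsets are coalesced into a single run below.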

	// Initialize our sliding window value.
	for i := 0; i < subsetLength; i++ {
		if hits[i] {
			total++
		}
	}

	if total >= target {
		out = append(out, 0)
	}

	// Now move through the window, adjusting the total by subtracting out the
	// last bit and adding in the new bit.
	for i := 1; i < len(hits); i++ {
		if hits[i-1] {
			total--
		}
		end := i + subsetLength - 1
		if end < len(hits) && hits[end] {
			total++
		}
		if total >= target {
			out = append(out, i)
		}
	}
	if len(out) == 0 {
		return nil
	}

	final := []matchRange{
		{
			SrcStart: out[0],
			SrcEnd:   out[0] + q,
		},
	}
	for i := 1; i < len(out); i++ {
		if out[i] != 1+out[i-1] {
			final = append(final, matchRange{
				SrcStart: out[i],
				SrcEnd:   out[i] + q,
			})
		} else {
			final[len(final)-1].SrcEnd = out[i] + q
		}
	}

	return final
}

// targetMatchedRanges returns the q-gram matches between src and target,
// extending matches that are contiguous at the same source/target offset, and
// flattening the result into a single sorted slice.
func targetMatchedRanges(src, target *searchSet) matchRanges {
	offsetMappings := make(map[int][]*matchRange)

	var matched matchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok {
			continue
		}

		tv := tgtNode.tokens
		for _, sv := range sr {
			offset := tv.Start - sv.Start
			if om, ok := offsetMappings[offset]; ok {
				// See if this extends the most recent existing mapping.
				lastIdx := len(om) - 1
				if om[lastIdx].TargetEnd == tv.End-1 {
					// This new value extends. Update the value in place.
					om[lastIdx].SrcEnd = sv.End
					om[lastIdx].TargetEnd = tv.End
					continue
				}
			}
			offsetMappings[offset] = append(offsetMappings[offset], &matchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			})
		}
	}

	// Compute the number of tokens claimed in each run and flatten the map
	// into a single slice.
	for _, mr := range offsetMappings {
		for _, m := range mr {
			m.TokensClaimed = m.TargetEnd - m.TargetStart
		}
		matched = append(matched, mr...)
	}
	sort.Sort(matched)
	return matched
}

// hash maps a CRC-32 checksum to the token ranges whose content produced it.
type hash map[uint32]tokenRanges

func (h hash) add(checksum uint32, start, end int) {
	h[checksum] = append(h[checksum], &tokenRange{Start: start, End: end})
}
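
// A minimal sketch of how the hash type accumulates ranges, using a made-up
// checksum value:
//
//	h := make(hash)
//	h.add(0xAB12CD34, 0, 4)
//	h.add(0xAB12CD34, 7, 11)
//	// h[0xAB12CD34] now holds [0, 4) and [7, 11): the same q-gram content
//	// appears at two positions in the token stream.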