1*46c4c49dSIbrahim Kanouche// Copyright 2020 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouchepackage classifier 16*46c4c49dSIbrahim Kanouche 17*46c4c49dSIbrahim Kanoucheimport ( 18*46c4c49dSIbrahim Kanouche "fmt" 19*46c4c49dSIbrahim Kanouche "reflect" 20*46c4c49dSIbrahim Kanouche "strings" 21*46c4c49dSIbrahim Kanouche "testing" 22*46c4c49dSIbrahim Kanouche 23*46c4c49dSIbrahim Kanouche "github.com/davecgh/go-spew/spew" 24*46c4c49dSIbrahim Kanouche "github.com/google/go-cmp/cmp" 25*46c4c49dSIbrahim Kanouche) 26*46c4c49dSIbrahim Kanouche 27*46c4c49dSIbrahim Kanouche// hundredLicenseText is a baseline for debugging precise ordering issues to demonstrate that the token-run assembly process can identify maximally fragmented entries successfully. 28*46c4c49dSIbrahim Kanouchevar hundredLicenseText = ` 29*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` 30*46c4c49dSIbrahim Kanouche 31*46c4c49dSIbrahim Kanouche// prefixMissingText exercises the error margin detection case at the beginning of a body of text. 32*46c4c49dSIbrahim Kanouchevar prefixMissingText = ` 33*46c4c49dSIbrahim Kanouche21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` 34*46c4c49dSIbrahim Kanouche 35*46c4c49dSIbrahim Kanouche// suffixMissingText exercises the error margin detection case at the beginning of a body of text. 36*46c4c49dSIbrahim Kanouchevar suffixMissingText = ` 37*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80` 38*46c4c49dSIbrahim Kanouche 39*46c4c49dSIbrahim Kanouche// fragmentedText is worst-case fragmentation that requires maximum reassembly with full error tolerance. 40*46c4c49dSIbrahim Kanouchevar fragmentedText = ` 41*46c4c49dSIbrahim Kanouche1 2 3 4 X 6 7 8 9 X 11 12 13 14 X 16 17 18 19 X 21 22 23 24 X 26 27 28 29 X 31 32 33 34 X 36 37 38 39 X 41 42 43 44 X 46 47 48 49 X 51 52 53 54 X 56 57 58 59 X 61 62 63 64 X 66 67 68 69 X 71 72 73 74 X 76 77 78 79 X 81 82 83 84 X 86 87 88 89 X 91 92 93 94 X 96 97 98 99 X` 42*46c4c49dSIbrahim Kanouche 43*46c4c49dSIbrahim Kanouche// bigChunkText has a gap of maximal length to ensure that reassembly works. 44*46c4c49dSIbrahim Kanouchevar bigChunkText = ` 45*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 X X X X X X X X X X X X X X X X X X X X 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100` 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanouche// The 50 license text leverages repeated ordered tokens to exercise reassembly options of repeated text in a worst-case situation. 48*46c4c49dSIbrahim Kanouchevar fiftyLicenseText = ` 49*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50` 50*46c4c49dSIbrahim Kanouche 51*46c4c49dSIbrahim Kanouchevar repeatedWithBreakagesText = ` 52*46c4c49dSIbrahim Kanouche1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 X X X X X X X X X X X X X X X X X 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 X X X` 53*46c4c49dSIbrahim Kanouche 54*46c4c49dSIbrahim Kanouchefunc TestSearchSet_New(t *testing.T) { 55*46c4c49dSIbrahim Kanouche tests := []struct { 56*46c4c49dSIbrahim Kanouche description string 57*46c4c49dSIbrahim Kanouche text string 58*46c4c49dSIbrahim Kanouche q int 59*46c4c49dSIbrahim Kanouche want *searchSet 60*46c4c49dSIbrahim Kanouche }{ 61*46c4c49dSIbrahim Kanouche { 62*46c4c49dSIbrahim Kanouche description: "Empty string", 63*46c4c49dSIbrahim Kanouche text: "", 64*46c4c49dSIbrahim Kanouche q: 4, 65*46c4c49dSIbrahim Kanouche want: &searchSet{ 66*46c4c49dSIbrahim Kanouche Tokens: nil, 67*46c4c49dSIbrahim Kanouche Hashes: make(hash), 68*46c4c49dSIbrahim Kanouche Checksums: nil, 69*46c4c49dSIbrahim Kanouche ChecksumRanges: nil, 70*46c4c49dSIbrahim Kanouche }, 71*46c4c49dSIbrahim Kanouche }, 72*46c4c49dSIbrahim Kanouche { 73*46c4c49dSIbrahim Kanouche description: "Small string", 74*46c4c49dSIbrahim Kanouche text: "Hello world", 75*46c4c49dSIbrahim Kanouche q: 4, 76*46c4c49dSIbrahim Kanouche want: &searchSet{ 77*46c4c49dSIbrahim Kanouche Tokens: []indexedToken{ 78*46c4c49dSIbrahim Kanouche {Line: 1, ID: 1}, 79*46c4c49dSIbrahim Kanouche {Line: 1, ID: 2}, 80*46c4c49dSIbrahim Kanouche }, 81*46c4c49dSIbrahim Kanouche Hashes: hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}}, 82*46c4c49dSIbrahim Kanouche Checksums: []uint32{1957950203}, 83*46c4c49dSIbrahim Kanouche ChecksumRanges: tokenRanges{&tokenRange{Start: 0, End: 2}}, 84*46c4c49dSIbrahim Kanouche nodes: []*node{{1957950203, &tokenRange{Start: 0, End: 2}}}, 85*46c4c49dSIbrahim Kanouche q: 2, 86*46c4c49dSIbrahim Kanouche }, 87*46c4c49dSIbrahim Kanouche }, 88*46c4c49dSIbrahim Kanouche } 89*46c4c49dSIbrahim Kanouche 90*46c4c49dSIbrahim Kanouche for _, tt := range tests { 91*46c4c49dSIbrahim Kanouche var trace strings.Builder 92*46c4c49dSIbrahim Kanouche c := NewClassifier(.8) // This value doesn't affect the test. 93*46c4c49dSIbrahim Kanouche c.SetTraceConfiguration(&TraceConfiguration{ 94*46c4c49dSIbrahim Kanouche TraceLicenses: "*", 95*46c4c49dSIbrahim Kanouche TracePhases: "*", 96*46c4c49dSIbrahim Kanouche Tracer: func(f string, args ...interface{}) { 97*46c4c49dSIbrahim Kanouche trace.WriteString(fmt.Sprintf(f, args...)) 98*46c4c49dSIbrahim Kanouche }, 99*46c4c49dSIbrahim Kanouche }) 100*46c4c49dSIbrahim Kanouche c.AddContent("", "text", "", []byte(tt.text)) 101*46c4c49dSIbrahim Kanouche if got := newSearchSet(c.getIndexedDocument("", "text", ""), tt.q); !reflect.DeepEqual(got, tt.want) { 102*46c4c49dSIbrahim Kanouche t.Errorf("New(%q) = %+v, want %+v", tt.description, spew.Sdump(got), spew.Sdump(tt.want)) 103*46c4c49dSIbrahim Kanouche t.Errorf("Trace:\n%s", trace.String()) 104*46c4c49dSIbrahim Kanouche } 105*46c4c49dSIbrahim Kanouche } 106*46c4c49dSIbrahim Kanouche} 107*46c4c49dSIbrahim Kanouchefunc TestFindPotentialMatches(t *testing.T) { 108*46c4c49dSIbrahim Kanouche tests := []struct { 109*46c4c49dSIbrahim Kanouche name string 110*46c4c49dSIbrahim Kanouche src string 111*46c4c49dSIbrahim Kanouche target string 112*46c4c49dSIbrahim Kanouche confidence float64 113*46c4c49dSIbrahim Kanouche expectedHits int 114*46c4c49dSIbrahim Kanouche }{ 115*46c4c49dSIbrahim Kanouche { 116*46c4c49dSIbrahim Kanouche name: "maximally fragmented", 117*46c4c49dSIbrahim Kanouche src: hundredLicenseText, 118*46c4c49dSIbrahim Kanouche target: fragmentedText, 119*46c4c49dSIbrahim Kanouche confidence: .8, 120*46c4c49dSIbrahim Kanouche expectedHits: 1, 121*46c4c49dSIbrahim Kanouche }, 122*46c4c49dSIbrahim Kanouche { 123*46c4c49dSIbrahim Kanouche name: "prefix missing", 124*46c4c49dSIbrahim Kanouche src: hundredLicenseText, 125*46c4c49dSIbrahim Kanouche target: prefixMissingText, 126*46c4c49dSIbrahim Kanouche confidence: .8, 127*46c4c49dSIbrahim Kanouche expectedHits: 1, 128*46c4c49dSIbrahim Kanouche }, 129*46c4c49dSIbrahim Kanouche { 130*46c4c49dSIbrahim Kanouche name: "suffix missing", 131*46c4c49dSIbrahim Kanouche src: hundredLicenseText, 132*46c4c49dSIbrahim Kanouche target: suffixMissingText, 133*46c4c49dSIbrahim Kanouche confidence: .8, 134*46c4c49dSIbrahim Kanouche expectedHits: 1, 135*46c4c49dSIbrahim Kanouche }, 136*46c4c49dSIbrahim Kanouche { 137*46c4c49dSIbrahim Kanouche name: "maximum-length error", 138*46c4c49dSIbrahim Kanouche src: hundredLicenseText, 139*46c4c49dSIbrahim Kanouche target: bigChunkText, 140*46c4c49dSIbrahim Kanouche confidence: .8, 141*46c4c49dSIbrahim Kanouche expectedHits: 1, 142*46c4c49dSIbrahim Kanouche }, 143*46c4c49dSIbrahim Kanouche } 144*46c4c49dSIbrahim Kanouche 145*46c4c49dSIbrahim Kanouche for _, test := range tests { 146*46c4c49dSIbrahim Kanouche t.Run(test.name, func(t *testing.T) { 147*46c4c49dSIbrahim Kanouche var trace strings.Builder 148*46c4c49dSIbrahim Kanouche c := NewClassifier(test.confidence) 149*46c4c49dSIbrahim Kanouche c.SetTraceConfiguration(&TraceConfiguration{ 150*46c4c49dSIbrahim Kanouche TraceLicenses: "*", 151*46c4c49dSIbrahim Kanouche TracePhases: "*", 152*46c4c49dSIbrahim Kanouche Tracer: func(f string, args ...interface{}) { 153*46c4c49dSIbrahim Kanouche trace.WriteString(fmt.Sprintf(f, args...)) 154*46c4c49dSIbrahim Kanouche }, 155*46c4c49dSIbrahim Kanouche }) 156*46c4c49dSIbrahim Kanouche c.AddContent("", "source", "", []byte(test.src)) 157*46c4c49dSIbrahim Kanouche 158*46c4c49dSIbrahim Kanouche doc := c.createTargetIndexedDocument([]byte(test.target)) 159*46c4c49dSIbrahim Kanouche doc.generateSearchSet(c.q) 160*46c4c49dSIbrahim Kanouche hits := c.findPotentialMatches(c.getIndexedDocument("", "source", "").s, doc.s, test.confidence) 161*46c4c49dSIbrahim Kanouche if actual := len(hits); actual != test.expectedHits { 162*46c4c49dSIbrahim Kanouche t.Errorf("got %d hits, wanted %d", actual, test.expectedHits) 163*46c4c49dSIbrahim Kanouche t.Errorf("Trace:\n%s", trace.String()) 164*46c4c49dSIbrahim Kanouche } 165*46c4c49dSIbrahim Kanouche }) 166*46c4c49dSIbrahim Kanouche } 167*46c4c49dSIbrahim Kanouche} 168*46c4c49dSIbrahim Kanouche 169*46c4c49dSIbrahim Kanouchefunc TestFuseRanges(t *testing.T) { 170*46c4c49dSIbrahim Kanouche // This test verifies that target ordering doesn't affect how ranges 171*46c4c49dSIbrahim Kanouche // get fused. The data that is input for fuseRanges is not always 172*46c4c49dSIbrahim Kanouche // ordered deterministically since it's the product of a map iteration. 173*46c4c49dSIbrahim Kanouche // This was a bug that occurred during development. 174*46c4c49dSIbrahim Kanouche tests := []struct { 175*46c4c49dSIbrahim Kanouche name string 176*46c4c49dSIbrahim Kanouche in matchRanges 177*46c4c49dSIbrahim Kanouche out matchRanges 178*46c4c49dSIbrahim Kanouche conf float64 179*46c4c49dSIbrahim Kanouche size int 180*46c4c49dSIbrahim Kanouche }{ 181*46c4c49dSIbrahim Kanouche { 182*46c4c49dSIbrahim Kanouche name: "in target order", 183*46c4c49dSIbrahim Kanouche conf: .8, 184*46c4c49dSIbrahim Kanouche size: 100, 185*46c4c49dSIbrahim Kanouche in: matchRanges{ 186*46c4c49dSIbrahim Kanouche &matchRange{ 187*46c4c49dSIbrahim Kanouche SrcStart: 50, 188*46c4c49dSIbrahim Kanouche SrcEnd: 93, 189*46c4c49dSIbrahim Kanouche TargetStart: 0, 190*46c4c49dSIbrahim Kanouche TargetEnd: 43, 191*46c4c49dSIbrahim Kanouche TokensClaimed: 43, 192*46c4c49dSIbrahim Kanouche }, 193*46c4c49dSIbrahim Kanouche &matchRange{ 194*46c4c49dSIbrahim Kanouche SrcStart: 0, 195*46c4c49dSIbrahim Kanouche SrcEnd: 43, 196*46c4c49dSIbrahim Kanouche TargetStart: 0, 197*46c4c49dSIbrahim Kanouche TargetEnd: 43, 198*46c4c49dSIbrahim Kanouche TokensClaimed: 43, 199*46c4c49dSIbrahim Kanouche }, 200*46c4c49dSIbrahim Kanouche &matchRange{ 201*46c4c49dSIbrahim Kanouche SrcStart: 10, 202*46c4c49dSIbrahim Kanouche SrcEnd: 47, 203*46c4c49dSIbrahim Kanouche TargetStart: 60, 204*46c4c49dSIbrahim Kanouche TargetEnd: 97, 205*46c4c49dSIbrahim Kanouche TokensClaimed: 37, 206*46c4c49dSIbrahim Kanouche }, 207*46c4c49dSIbrahim Kanouche &matchRange{ 208*46c4c49dSIbrahim Kanouche SrcStart: 60, 209*46c4c49dSIbrahim Kanouche SrcEnd: 97, 210*46c4c49dSIbrahim Kanouche TargetStart: 60, 211*46c4c49dSIbrahim Kanouche TargetEnd: 97, 212*46c4c49dSIbrahim Kanouche TokensClaimed: 37, 213*46c4c49dSIbrahim Kanouche }, 214*46c4c49dSIbrahim Kanouche }, 215*46c4c49dSIbrahim Kanouche out: matchRanges{ 216*46c4c49dSIbrahim Kanouche &matchRange{ 217*46c4c49dSIbrahim Kanouche SrcStart: 0, 218*46c4c49dSIbrahim Kanouche SrcEnd: 97, 219*46c4c49dSIbrahim Kanouche TargetStart: 0, 220*46c4c49dSIbrahim Kanouche TargetEnd: 97, 221*46c4c49dSIbrahim Kanouche TokensClaimed: 80, 222*46c4c49dSIbrahim Kanouche }, 223*46c4c49dSIbrahim Kanouche }, 224*46c4c49dSIbrahim Kanouche }, 225*46c4c49dSIbrahim Kanouche { 226*46c4c49dSIbrahim Kanouche name: "not in-target order", 227*46c4c49dSIbrahim Kanouche conf: .8, 228*46c4c49dSIbrahim Kanouche size: 100, 229*46c4c49dSIbrahim Kanouche in: matchRanges{ 230*46c4c49dSIbrahim Kanouche &matchRange{ 231*46c4c49dSIbrahim Kanouche SrcStart: 0, 232*46c4c49dSIbrahim Kanouche SrcEnd: 43, 233*46c4c49dSIbrahim Kanouche TargetStart: 0, 234*46c4c49dSIbrahim Kanouche TargetEnd: 43, 235*46c4c49dSIbrahim Kanouche TokensClaimed: 43, 236*46c4c49dSIbrahim Kanouche }, 237*46c4c49dSIbrahim Kanouche &matchRange{ 238*46c4c49dSIbrahim Kanouche SrcStart: 50, 239*46c4c49dSIbrahim Kanouche SrcEnd: 93, 240*46c4c49dSIbrahim Kanouche TargetStart: 0, 241*46c4c49dSIbrahim Kanouche TargetEnd: 43, 242*46c4c49dSIbrahim Kanouche TokensClaimed: 43, 243*46c4c49dSIbrahim Kanouche }, 244*46c4c49dSIbrahim Kanouche &matchRange{ 245*46c4c49dSIbrahim Kanouche SrcStart: 60, 246*46c4c49dSIbrahim Kanouche SrcEnd: 97, 247*46c4c49dSIbrahim Kanouche TargetStart: 60, 248*46c4c49dSIbrahim Kanouche TargetEnd: 97, 249*46c4c49dSIbrahim Kanouche TokensClaimed: 37, 250*46c4c49dSIbrahim Kanouche }, 251*46c4c49dSIbrahim Kanouche &matchRange{ 252*46c4c49dSIbrahim Kanouche SrcStart: 10, 253*46c4c49dSIbrahim Kanouche SrcEnd: 47, 254*46c4c49dSIbrahim Kanouche TargetStart: 60, 255*46c4c49dSIbrahim Kanouche TargetEnd: 97, 256*46c4c49dSIbrahim Kanouche TokensClaimed: 37, 257*46c4c49dSIbrahim Kanouche }, 258*46c4c49dSIbrahim Kanouche }, 259*46c4c49dSIbrahim Kanouche out: matchRanges{ 260*46c4c49dSIbrahim Kanouche &matchRange{ 261*46c4c49dSIbrahim Kanouche SrcStart: 0, 262*46c4c49dSIbrahim Kanouche SrcEnd: 97, 263*46c4c49dSIbrahim Kanouche TargetStart: 0, 264*46c4c49dSIbrahim Kanouche TargetEnd: 97, 265*46c4c49dSIbrahim Kanouche TokensClaimed: 80, 266*46c4c49dSIbrahim Kanouche }, 267*46c4c49dSIbrahim Kanouche }, 268*46c4c49dSIbrahim Kanouche }, 269*46c4c49dSIbrahim Kanouche } 270*46c4c49dSIbrahim Kanouche for _, test := range tests { 271*46c4c49dSIbrahim Kanouche t.Run(test.name, func(t *testing.T) { 272*46c4c49dSIbrahim Kanouche var trace strings.Builder 273*46c4c49dSIbrahim Kanouche c := NewClassifier(.8) 274*46c4c49dSIbrahim Kanouche c.SetTraceConfiguration(&TraceConfiguration{ 275*46c4c49dSIbrahim Kanouche TraceLicenses: "*", 276*46c4c49dSIbrahim Kanouche TracePhases: "*", 277*46c4c49dSIbrahim Kanouche Tracer: func(f string, args ...interface{}) { 278*46c4c49dSIbrahim Kanouche trace.WriteString(fmt.Sprintf(f, args...)) 279*46c4c49dSIbrahim Kanouche }, 280*46c4c49dSIbrahim Kanouche }) 281*46c4c49dSIbrahim Kanouche runs := c.detectRuns(test.name, test.in, 100, 100, test.conf, 4) 282*46c4c49dSIbrahim Kanouche actual := c.fuseRanges(test.name, test.in, test.conf, test.size, runs, 100) 283*46c4c49dSIbrahim Kanouche if !cmp.Equal(actual, test.out) { 284*46c4c49dSIbrahim Kanouche t.Errorf("%v: %v", test.name, cmp.Diff(actual, test.out)) 285*46c4c49dSIbrahim Kanouche t.Errorf("Trace:\n%s", trace.String()) 286*46c4c49dSIbrahim Kanouche } 287*46c4c49dSIbrahim Kanouche }) 288*46c4c49dSIbrahim Kanouche } 289*46c4c49dSIbrahim Kanouche} 290*46c4c49dSIbrahim Kanouche 291*46c4c49dSIbrahim Kanouchefunc TestDetectRuns(t *testing.T) { 292*46c4c49dSIbrahim Kanouche tests := []struct { 293*46c4c49dSIbrahim Kanouche name string 294*46c4c49dSIbrahim Kanouche matched matchRanges 295*46c4c49dSIbrahim Kanouche targetLength, subsetLength, q int 296*46c4c49dSIbrahim Kanouche threshold float64 297*46c4c49dSIbrahim Kanouche expected []matchRange 298*46c4c49dSIbrahim Kanouche }{ 299*46c4c49dSIbrahim Kanouche { 300*46c4c49dSIbrahim Kanouche // For an exact match on 100 accurate tokens, the first q-gram 301*46c4c49dSIbrahim Kanouche // is the only possible location hit we can return. 302*46c4c49dSIbrahim Kanouche name: "precise matching on perfect runs", 303*46c4c49dSIbrahim Kanouche threshold: 1.0, 304*46c4c49dSIbrahim Kanouche targetLength: 100, 305*46c4c49dSIbrahim Kanouche subsetLength: 100, 306*46c4c49dSIbrahim Kanouche q: 4, 307*46c4c49dSIbrahim Kanouche matched: matchRanges{ 308*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 0, TargetEnd: 100}, 309*46c4c49dSIbrahim Kanouche }, 310*46c4c49dSIbrahim Kanouche expected: []matchRange{ 311*46c4c49dSIbrahim Kanouche {SrcStart: 0, SrcEnd: 4}, 312*46c4c49dSIbrahim Kanouche }, 313*46c4c49dSIbrahim Kanouche }, 314*46c4c49dSIbrahim Kanouche { 315*46c4c49dSIbrahim Kanouche // For an 80% match on 100 accurate tokens, the first 20 token 316*46c4c49dSIbrahim Kanouche // positions represent possible matches. 317*46c4c49dSIbrahim Kanouche name: "approximate matching on perfect runs", 318*46c4c49dSIbrahim Kanouche threshold: 0.8, 319*46c4c49dSIbrahim Kanouche targetLength: 100, 320*46c4c49dSIbrahim Kanouche subsetLength: 100, 321*46c4c49dSIbrahim Kanouche q: 4, 322*46c4c49dSIbrahim Kanouche matched: matchRanges{ 323*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 0, TargetEnd: 100}, 324*46c4c49dSIbrahim Kanouche }, 325*46c4c49dSIbrahim Kanouche expected: []matchRange{ 326*46c4c49dSIbrahim Kanouche {SrcStart: 0, SrcEnd: 24}, 327*46c4c49dSIbrahim Kanouche }, 328*46c4c49dSIbrahim Kanouche }, 329*46c4c49dSIbrahim Kanouche { 330*46c4c49dSIbrahim Kanouche name: "multiple runs in a single target", 331*46c4c49dSIbrahim Kanouche threshold: 0.8, 332*46c4c49dSIbrahim Kanouche targetLength: 100, 333*46c4c49dSIbrahim Kanouche subsetLength: 10, 334*46c4c49dSIbrahim Kanouche q: 4, 335*46c4c49dSIbrahim Kanouche matched: matchRanges{ 336*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 0, TargetEnd: 10}, 337*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 20, TargetEnd: 25}, 338*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 50, TargetEnd: 60}, 339*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 70, TargetEnd: 77}, 340*46c4c49dSIbrahim Kanouche }, 341*46c4c49dSIbrahim Kanouche expected: []matchRange{ 342*46c4c49dSIbrahim Kanouche // Runs end on 4-gram boundaries 343*46c4c49dSIbrahim Kanouche {SrcStart: 0, SrcEnd: 6}, 344*46c4c49dSIbrahim Kanouche // The run starts early because of error tolerance 345*46c4c49dSIbrahim Kanouche {SrcStart: 48, SrcEnd: 56}, 346*46c4c49dSIbrahim Kanouche }, 347*46c4c49dSIbrahim Kanouche }, 348*46c4c49dSIbrahim Kanouche { 349*46c4c49dSIbrahim Kanouche name: "bridge broken runs in a single target", 350*46c4c49dSIbrahim Kanouche threshold: 0.8, 351*46c4c49dSIbrahim Kanouche targetLength: 100, 352*46c4c49dSIbrahim Kanouche subsetLength: 10, 353*46c4c49dSIbrahim Kanouche q: 4, 354*46c4c49dSIbrahim Kanouche matched: matchRanges{ 355*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 20, TargetEnd: 25}, 356*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 26, TargetEnd: 30}, 357*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 60, TargetEnd: 67}, 358*46c4c49dSIbrahim Kanouche &matchRange{TargetStart: 68, TargetEnd: 72}, 359*46c4c49dSIbrahim Kanouche }, 360*46c4c49dSIbrahim Kanouche expected: []matchRange{ 361*46c4c49dSIbrahim Kanouche // Runs end on 4-gram boundaries and start early because 362*46c4c49dSIbrahim Kanouche // of error tolerance. 363*46c4c49dSIbrahim Kanouche {SrcStart: 19, SrcEnd: 25}, 364*46c4c49dSIbrahim Kanouche {SrcStart: 59, SrcEnd: 67}, 365*46c4c49dSIbrahim Kanouche }, 366*46c4c49dSIbrahim Kanouche }, 367*46c4c49dSIbrahim Kanouche } 368*46c4c49dSIbrahim Kanouche 369*46c4c49dSIbrahim Kanouche for _, test := range tests { 370*46c4c49dSIbrahim Kanouche t.Run(test.name, func(t *testing.T) { 371*46c4c49dSIbrahim Kanouche var trace strings.Builder 372*46c4c49dSIbrahim Kanouche c := NewClassifier(.8) 373*46c4c49dSIbrahim Kanouche c.SetTraceConfiguration(&TraceConfiguration{ 374*46c4c49dSIbrahim Kanouche TraceLicenses: "*", 375*46c4c49dSIbrahim Kanouche TracePhases: "*", 376*46c4c49dSIbrahim Kanouche Tracer: func(f string, args ...interface{}) { 377*46c4c49dSIbrahim Kanouche trace.WriteString(fmt.Sprintf(f, args...)) 378*46c4c49dSIbrahim Kanouche }, 379*46c4c49dSIbrahim Kanouche }) 380*46c4c49dSIbrahim Kanouche if got := c.detectRuns(test.name, test.matched, test.targetLength, test.subsetLength, test.threshold, test.q); !cmp.Equal(got, test.expected) { 381*46c4c49dSIbrahim Kanouche t.Errorf(cmp.Diff(got, test.expected)) 382*46c4c49dSIbrahim Kanouche t.Errorf("Trace:\n%s", trace.String()) 383*46c4c49dSIbrahim Kanouche } 384*46c4c49dSIbrahim Kanouche 385*46c4c49dSIbrahim Kanouche }) 386*46c4c49dSIbrahim Kanouche } 387*46c4c49dSIbrahim Kanouche} 388