1// Copyright 2020 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package classifier 16 17import ( 18 "testing" 19 20 "github.com/google/go-cmp/cmp" 21 "github.com/sergi/go-diff/diffmatchpatch" 22) 23 24var ( 25 gettysburg = `Four score and seven years ago our fathers brought forth 26on this continent, a new nation, conceived in Liberty, and dedicated to the 27proposition that all men are created equal.` 28 modifiedGettysburg = `Four score and seven years ago our fathers brought forth 29on this continent, a nation that was new and improved, conceived in Liberty, and 30dedicated to the proposition that all men are created equal.` 31 extra = `In the current state of affairs` 32 33 declaration = `When in the Course of human events, it becomes necessary 34for one people to dissolve the political bands which have connected them with 35another, and to assume among the powers of the earth, the separate and equal 36station to which the Laws of Nature and of Nature's God entitle them, a decent 37respect to the opinions of mankind requires that they should declare the causes 38which impel them to the separation.` 39 40 loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla 41varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor 42feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 43vulputate, tempus leo commodo, accumsan nulla.` 44 lessModifiedLorem = `Lorem ipsum dolor sot amet, consectetur adipiscing elit. Nulla 45varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor 46feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim 47vulputate, tempus leo commodo, accumsan nulla.` 48) 49 50func TestTextLength(t *testing.T) { 51 tests := []struct { 52 name string 53 diffs []diffmatchpatch.Diff 54 expected int 55 }{ 56 { 57 name: "empty diff", 58 diffs: nil, 59 expected: 0, 60 }, 61 { 62 name: "deletion diff", 63 diffs: []diffmatchpatch.Diff{ 64 { 65 Type: diffmatchpatch.DiffDelete, 66 Text: "deleted text", 67 }, 68 }, 69 expected: 2, 70 }, 71 } 72 73 for _, test := range tests { 74 t.Run(test.name, func(t *testing.T) { 75 if got := textLength(test.diffs); got != test.expected { 76 t.Errorf("got %d, want %d", got, test.expected) 77 } 78 }) 79 } 80} 81 82func TestWordLen(t *testing.T) { 83 tests := []struct { 84 in string 85 expected int 86 }{ 87 { 88 in: "short string", 89 expected: 2, 90 }, 91 { 92 in: "", 93 expected: 0, 94 }, 95 { 96 in: "word", 97 expected: 1, 98 }, 99 } 100 101 for _, test := range tests { 102 t.Run(test.in, func(t *testing.T) { 103 if got := wordLen(test.in); got != test.expected { 104 t.Errorf("got %d, want %d", got, test.expected) 105 } 106 }) 107 } 108} 109 110func TestDiffing(t *testing.T) { 111 tests := []struct { 112 name string 113 unknown, known string 114 start, end int 115 diffs []diffmatchpatch.Diff 116 }{ 117 { 118 name: "identical", 119 unknown: declaration, 120 known: declaration, 121 start: 0, 122 end: 1, 123 diffs: []diffmatchpatch.Diff{ 124 { 125 Type: diffmatchpatch.DiffEqual, 126 Text: `when in the course of human events it becomes necessary for one people to dissolve the political bands which have connected them with another and to assume among the powers of the earth the separate and equal station to which the laws of nature and of natures god entitle them a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation`, 127 }, 128 }, 129 }, 130 { 131 name: "lorem", 132 unknown: lessModifiedLorem, 133 known: loremipsum, 134 start: 0, 135 end: 6, 136 diffs: []diffmatchpatch.Diff{ 137 { 138 Type: diffmatchpatch.DiffEqual, 139 Text: "lorem ipsum dolor", 140 }, 141 { 142 Type: diffmatchpatch.DiffDelete, 143 Text: "UNKNOWN", 144 }, 145 { 146 Type: diffmatchpatch.DiffInsert, 147 Text: "sit", 148 }, 149 { 150 Type: diffmatchpatch.DiffEqual, 151 Text: "amet consectetur adipiscing elit nulla varius enim mattis rhoncus lectus id aliquet", 152 }, 153 { 154 Type: diffmatchpatch.DiffInsert, 155 Text: "sem", 156 }, 157 { 158 Type: diffmatchpatch.DiffEqual, 159 Text: "phasellus eget ex in dolor feugiat ultricies etiam interdum sit amet nisl in placerat sed vitae enim vulputate tempus leo commodo accumsan nulla", 160 }, 161 }, 162 }, 163 { 164 name: "whole diff retained", 165 unknown: modifiedGettysburg, 166 known: gettysburg, 167 start: 0, 168 end: 6, 169 diffs: []diffmatchpatch.Diff{ 170 { 171 Type: diffmatchpatch.DiffEqual, 172 Text: "four score and seven years ago our fathers brought forth on this continent a", 173 }, 174 { 175 Type: diffmatchpatch.DiffDelete, 176 Text: "nation that UNKNOWN", 177 }, 178 { 179 Type: diffmatchpatch.DiffEqual, 180 Text: "new", 181 }, 182 { 183 Type: diffmatchpatch.DiffDelete, 184 Text: "and UNKNOWN", 185 }, 186 { 187 Type: diffmatchpatch.DiffInsert, 188 Text: "nation", 189 }, 190 { 191 Type: diffmatchpatch.DiffEqual, 192 Text: "conceived in liberty and dedicated to the proposition that all men are created equal", 193 }, 194 }, 195 }, 196 { 197 name: "extra at beginning", 198 unknown: extra + " " + gettysburg, 199 known: gettysburg, 200 start: 1, 201 end: 2, 202 diffs: []diffmatchpatch.Diff{ 203 { 204 Type: diffmatchpatch.DiffDelete, 205 Text: "in the UNKNOWN UNKNOWN UNKNOWN UNKNOWN", 206 }, 207 { 208 Type: diffmatchpatch.DiffEqual, 209 Text: "four score and seven years ago our fathers brought forth on this continent a new nation conceived in liberty and dedicated to the proposition that all men are created equal", 210 }, 211 }, 212 }, 213 { 214 name: "extra at end", 215 unknown: gettysburg + " " + extra, 216 known: gettysburg, 217 start: 0, 218 end: 1, 219 diffs: []diffmatchpatch.Diff{ 220 { 221 Type: diffmatchpatch.DiffEqual, 222 Text: "four score and seven years ago our fathers brought forth on this continent a new nation conceived in liberty and dedicated to the proposition that all men are created equal", 223 }, 224 { 225 Type: diffmatchpatch.DiffDelete, 226 Text: "in the UNKNOWN UNKNOWN UNKNOWN UNKNOWN", 227 }, 228 }, 229 }, 230 { 231 name: "extra at both ends", 232 unknown: extra + " " + gettysburg + " " + extra, 233 known: gettysburg, 234 start: 1, 235 end: 2, 236 diffs: []diffmatchpatch.Diff{ 237 { 238 Type: diffmatchpatch.DiffDelete, 239 Text: "in the UNKNOWN UNKNOWN UNKNOWN UNKNOWN", 240 }, 241 { 242 Type: diffmatchpatch.DiffEqual, 243 Text: "four score and seven years ago our fathers brought forth on this continent a new nation conceived in liberty and dedicated to the proposition that all men are created equal", 244 }, 245 { 246 Type: diffmatchpatch.DiffDelete, 247 Text: "in the UNKNOWN UNKNOWN UNKNOWN UNKNOWN", 248 }, 249 }, 250 }, 251 { 252 name: "completely different", 253 unknown: "this", 254 known: "that", 255 start: 1, 256 end: 2, 257 diffs: []diffmatchpatch.Diff{ 258 { 259 Type: diffmatchpatch.DiffDelete, 260 Text: "UNKNOWN", 261 }, 262 { 263 Type: diffmatchpatch.DiffInsert, 264 Text: "that", 265 }, 266 }, 267 }, 268 } 269 270 for _, test := range tests { 271 t.Run(test.name, func(t *testing.T) { 272 c := NewClassifier(.8) 273 c.AddContent("", "known", "", []byte(test.known)) 274 kd := c.getIndexedDocument("", "known", "") 275 ud := c.createTargetIndexedDocument([]byte(test.unknown)) 276 diffs := docDiff("known", ud, 0, ud.size(), kd, 0, kd.size()) 277 start, end := diffRange(kd.normalized(), diffs) 278 if start != test.start { 279 t.Errorf("start: got %d want %d", start, test.start) 280 } 281 if end != test.end { 282 t.Errorf("end: got %d want %d", end, test.end) 283 } 284 if !cmp.Equal(diffs, test.diffs) { 285 t.Errorf(cmp.Diff(diffs, test.diffs)) 286 } 287 }) 288 } 289} 290