1// Copyright 2020 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package classifier 16 17import ( 18 "strings" 19 20 "github.com/sergi/go-diff/diffmatchpatch" 21) 22 23// This file contains word-diffing routines that build on top of the go-diff package. 24// The algorithm implemented here is from the suggested word diffing technique in 25// https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs 26 27// diffRange returns the indices of the beginning and end locations of the diff 28// that reconstruct (as best possible) the source value. 29func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) { 30 var foundStart bool 31 var seen string 32 for end = 0; end < len(diffs); end++ { 33 if len(seen) > 1 && seen[:len(seen)-1] == known { 34 break 35 } 36 switch diffs[end].Type { 37 case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert: 38 if !foundStart { 39 start = end 40 foundStart = true 41 } 42 seen += diffs[end].Text + " " 43 } 44 } 45 return start, end 46} 47 48func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff { 49 chars1 := doc1.runes[doc1Start:doc1End] 50 chars2 := doc2.runes[doc2Start:doc2End] 51 52 dmp := diffmatchpatch.New() 53 diffs := dmp.DiffMainRunes(chars1, chars2, false) 54 55 // Recover the words from the previous rune encoding and return the textual diffs. 56 diffs = diffRunesToWords(diffs, doc1.dict) 57 return diffs 58} 59 60func diffWordsToRunes(doc *indexedDocument, start, end int) []rune { 61 // Creates a slice of runes using the indexed values as a basis for runes. 62 // The go-diff code basically does exactly this using ephemeral dictionaries 63 // for each input string. We leverage the fact we have a persistent dictionary 64 // to make this operation cheaper. 65 // TODO: perhaps we should cache these in the corpus? 66 runes := make([]rune, 0, end-start) 67 68 for _, t := range doc.Tokens[start:end] { 69 runes = append(runes, rune(t.ID)) 70 } 71 return runes 72} 73 74// diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text. 75func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff { 76 hydrated := make([]diffmatchpatch.Diff, 0, len(diffs)) 77 for _, aDiff := range diffs { 78 chars := []rune(aDiff.Text) 79 var sb strings.Builder 80 81 for i, r := range chars { 82 sb.WriteString(dict.getWord(tokenID(r))) 83 if (i + 1) < len(chars) { 84 sb.WriteByte(' ') 85 } 86 } 87 88 aDiff.Text = sb.String() 89 hydrated = append(hydrated, aDiff) 90 } 91 return hydrated 92} 93 94// Returns the number of words in the input string. Used by scoring and distance functions. 95// This function depends on the behavior of the tokenizer such that strings are separated 96// by exactly one space and don't start or end with whitespace. 97func wordLen(text string) int { 98 if text == "" { 99 return 0 100 } 101 return strings.Count(text, " ") + 1 102} 103 104// textLength returns the number of tokens in the diff. This value is used to 105// adjust the offset for detection, since this is the number of tokens 106// discarded while matching a diff. By virtue of how it's called, there won't 107// be "change" diffs (a paired insert/delete) so we can simplify the scan to 108// just count up everything. 109func textLength(diffs []diffmatchpatch.Diff) int { 110 l := 0 111 for _, d := range diffs { 112 l += wordLen(d.Text) 113 } 114 return l 115} 116