xref: /aosp_15_r20/external/licenseclassifier/v2/diff.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2020 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package classifier
16
17import (
18	"strings"
19
20	"github.com/sergi/go-diff/diffmatchpatch"
21)
22
23// This file contains word-diffing routines that build on top of the go-diff package.
24// The algorithm implemented here is from the suggested word diffing technique in
25// https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs
26
27// diffRange returns the indices of the beginning and end locations of the diff
28// that reconstruct (as best possible) the source value.
29func diffRange(known string, diffs []diffmatchpatch.Diff) (start, end int) {
30	var foundStart bool
31	var seen string
32	for end = 0; end < len(diffs); end++ {
33		if len(seen) > 1 && seen[:len(seen)-1] == known {
34			break
35		}
36		switch diffs[end].Type {
37		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
38			if !foundStart {
39				start = end
40				foundStart = true
41			}
42			seen += diffs[end].Text + " "
43		}
44	}
45	return start, end
46}
47
48func docDiff(id string, doc1 *indexedDocument, doc1Start, doc1End int, doc2 *indexedDocument, doc2Start, doc2End int) []diffmatchpatch.Diff {
49	chars1 := doc1.runes[doc1Start:doc1End]
50	chars2 := doc2.runes[doc2Start:doc2End]
51
52	dmp := diffmatchpatch.New()
53	diffs := dmp.DiffMainRunes(chars1, chars2, false)
54
55	// Recover the words from the previous rune encoding and return the textual diffs.
56	diffs = diffRunesToWords(diffs, doc1.dict)
57	return diffs
58}
59
60func diffWordsToRunes(doc *indexedDocument, start, end int) []rune {
61	// Creates a slice of runes using the indexed values as a basis for runes.
62	// The go-diff code basically does exactly this using ephemeral dictionaries
63	// for each input string. We leverage the fact we have a persistent dictionary
64	// to make this operation cheaper.
65	// TODO: perhaps we should cache these in the corpus?
66	runes := make([]rune, 0, end-start)
67
68	for _, t := range doc.Tokens[start:end] {
69		runes = append(runes, rune(t.ID))
70	}
71	return runes
72}
73
74// diffRunesToWords rehydrates the text in a diff from a string of word hashes to real words of text.
75func diffRunesToWords(diffs []diffmatchpatch.Diff, dict *dictionary) []diffmatchpatch.Diff {
76	hydrated := make([]diffmatchpatch.Diff, 0, len(diffs))
77	for _, aDiff := range diffs {
78		chars := []rune(aDiff.Text)
79		var sb strings.Builder
80
81		for i, r := range chars {
82			sb.WriteString(dict.getWord(tokenID(r)))
83			if (i + 1) < len(chars) {
84				sb.WriteByte(' ')
85			}
86		}
87
88		aDiff.Text = sb.String()
89		hydrated = append(hydrated, aDiff)
90	}
91	return hydrated
92}
93
// wordLen returns the number of words in text. It is used by scoring and
// distance functions.
//
// The count is the number of space characters plus one (zero for the empty
// string), so it relies on the tokenizer producing words separated by exactly
// one space, with no leading or trailing whitespace.
func wordLen(text string) int {
	var words int
	if text != "" {
		words = 1 + strings.Count(text, " ")
	}
	return words
}
103
104// textLength returns the number of tokens in the diff. This value is used to
105// adjust the offset for detection, since this is the number of tokens
106// discarded while matching a diff.  By virtue of how it's called, there won't
107// be "change" diffs (a paired insert/delete) so we can simplify the scan to
108// just count up everything.
109func textLength(diffs []diffmatchpatch.Diff) int {
110	l := 0
111	for _, d := range diffs {
112		l += wordLen(d.Text)
113	}
114	return l
115}
116