xref: /aosp_15_r20/external/spdx-tools/idsearcher/idsearcher.go (revision ba677afa8f67bb56cbc794f4d0e378e0da058e16)
1// Package idsearcher is used to search for short-form IDs in files
2// within a directory, and to build an SPDX Document containing those
3// license findings.
4// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5package idsearcher
6
7import (
8	"bufio"
9	"fmt"
10	"github.com/spdx/tools-golang/spdx/v2_3"
11	"os"
12	"path/filepath"
13	"regexp"
14	"sort"
15	"strings"
16
17	"github.com/spdx/tools-golang/builder"
18	"github.com/spdx/tools-golang/spdx/v2_1"
19	"github.com/spdx/tools-golang/spdx/v2_2"
20	"github.com/spdx/tools-golang/utils"
21)
22
23// ===== 2.1 Searcher functions =====
24
// Config2_1 bundles the settings idsearcher uses when producing a
// version 2.1 SPDX Document. One value can be shared across any number
// of calls to BuildIDsDocument2_1.
type Config2_1 struct {
	// NamespacePrefix is the URI prefix of the namespace the SPDX
	// Document will be associated with. It ends up in the
	// DocumentNamespace field of the CreationInfo section, followed by
	// the per-Document package name and a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored names paths that must be left out of the built
	// document altogether. Every entry is a path relative to the
	// package's dirRoot: either a specific file or, with a trailing
	// slash, all files in a directory. An entry starting with "**"
	// matches that file / directory wherever it appears in the tree.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored names paths that stay in the document but are
	// not searched for short-form IDs by idsearcher. The format is the
	// same as for BuilderPathsIgnored.
	SearcherPathsIgnored []string
}
47
48// BuildIDsDocument2_1 creates an SPDX Document (version 2.1) and searches for
49// short-form IDs in each file, filling in license fields as appropriate. It
50// returns that document or error if any is encountered. Arguments:
51//   - packageName: name of package / directory
52//   - dirRoot: path to directory to be analyzed
53//   - namespacePrefix: URI representing a prefix for the
54//     namespace with which the SPDX Document will be associated
55func BuildIDsDocument2_1(packageName string, dirRoot string, idconfig *Config2_1) (*v2_1.Document, error) {
56	// first, build the Document using builder
57	bconfig := &builder.Config2_1{
58		NamespacePrefix: idconfig.NamespacePrefix,
59		CreatorType:     "Tool",
60		Creator:         "github.com/spdx/tools-golang/idsearcher",
61		PathsIgnored:    idconfig.BuilderPathsIgnored,
62	}
63	doc, err := builder.Build2_1(packageName, dirRoot, bconfig)
64	if err != nil {
65		return nil, err
66	}
67	if doc == nil {
68		return nil, fmt.Errorf("builder returned nil Document")
69	}
70	if doc.Packages == nil {
71		return nil, fmt.Errorf("builder returned nil Packages map")
72	}
73	if len(doc.Packages) != 1 {
74		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
75	}
76
77	// now, walk through each file and find its licenses (if any)
78	pkg := doc.Packages[0]
79	if pkg == nil {
80		return nil, fmt.Errorf("builder returned nil Package")
81	}
82	if pkg.Files == nil {
83		return nil, fmt.Errorf("builder returned nil Files in Package")
84	}
85	licsForPackage := map[string]int{}
86	for _, f := range pkg.Files {
87		// start by initializing / clearing values
88		f.LicenseInfoInFiles = []string{"NOASSERTION"}
89		f.LicenseConcluded = "NOASSERTION"
90
91		// check whether the searcher should ignore this file
92		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
93			continue
94		}
95
96		fPath := filepath.Join(dirRoot, f.FileName)
97		// FIXME this is not preferable -- ignoring error
98		ids, _ := searchFileIDs(fPath)
99		// FIXME for now, proceed onwards with whatever IDs we obtained.
100		// FIXME instead of ignoring the error, should probably either log it,
101		// FIXME and/or enable the caller to configure what should happen.
102
103		// separate out for this file's licenses
104		licsForFile := map[string]int{}
105		licsParens := []string{}
106		for _, lid := range ids {
107			// get individual elements and add for file and package
108			licElements := getIndividualLicenses(lid)
109			for _, elt := range licElements {
110				licsForFile[elt] = 1
111				licsForPackage[elt] = 1
112			}
113			// parenthesize if needed and add to slice for joining
114			licsParens = append(licsParens, makeElement(lid))
115		}
116
117		// OK -- now we can fill in the file's details, or NOASSERTION if none
118		if len(licsForFile) > 0 {
119			f.LicenseInfoInFiles = []string{}
120			for lic := range licsForFile {
121				f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
122			}
123			sort.Strings(f.LicenseInfoInFiles)
124			// avoid adding parens and joining for single-ID items
125			if len(licsParens) == 1 {
126				f.LicenseConcluded = ids[0]
127			} else {
128				f.LicenseConcluded = strings.Join(licsParens, " AND ")
129			}
130		}
131	}
132
133	// and finally, we can fill in the package's details
134	if len(licsForPackage) == 0 {
135		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
136	} else {
137		pkg.PackageLicenseInfoFromFiles = []string{}
138		for lic := range licsForPackage {
139			pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
140		}
141		sort.Strings(pkg.PackageLicenseInfoFromFiles)
142	}
143
144	return doc, nil
145}
146
147// ===== 2.2 Searcher functions =====
148
// Config2_2 bundles the settings idsearcher uses when producing a
// version 2.2 SPDX Document. One value can be shared across any number
// of calls to BuildIDsDocument2_2.
type Config2_2 struct {
	// NamespacePrefix is the URI prefix of the namespace the SPDX
	// Document will be associated with. It ends up in the
	// DocumentNamespace field of the CreationInfo section, followed by
	// the per-Document package name and a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored names paths that must be left out of the built
	// document altogether. Every entry is a path relative to the
	// package's dirRoot: either a specific file or, with a trailing
	// slash, all files in a directory. An entry starting with "**"
	// matches that file / directory wherever it appears in the tree.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored names paths that stay in the document but are
	// not searched for short-form IDs by idsearcher. The format is the
	// same as for BuilderPathsIgnored.
	SearcherPathsIgnored []string
}
171
172// BuildIDsDocument2_2 creates an SPDX Document (version 2.2) and searches for
173// short-form IDs in each file, filling in license fields as appropriate. It
174// returns that document or error if any is encountered. Arguments:
175//   - packageName: name of package / directory
176//   - dirRoot: path to directory to be analyzed
177//   - namespacePrefix: URI representing a prefix for the
178//     namespace with which the SPDX Document will be associated
179func BuildIDsDocument2_2(packageName string, dirRoot string, idconfig *Config2_2) (*v2_2.Document, error) {
180	// first, build the Document using builder
181	bconfig := &builder.Config2_2{
182		NamespacePrefix: idconfig.NamespacePrefix,
183		CreatorType:     "Tool",
184		Creator:         "github.com/spdx/tools-golang/idsearcher",
185		PathsIgnored:    idconfig.BuilderPathsIgnored,
186	}
187	doc, err := builder.Build2_2(packageName, dirRoot, bconfig)
188	if err != nil {
189		return nil, err
190	}
191	if doc == nil {
192		return nil, fmt.Errorf("builder returned nil Document")
193	}
194	if doc.Packages == nil {
195		return nil, fmt.Errorf("builder returned nil Packages map")
196	}
197	if len(doc.Packages) != 1 {
198		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
199	}
200
201	// now, walk through each file and find its licenses (if any)
202	pkg := doc.Packages[0]
203	if pkg == nil {
204		return nil, fmt.Errorf("builder returned nil Package")
205	}
206	if pkg.Files == nil {
207		return nil, fmt.Errorf("builder returned nil Files in Package")
208	}
209	licsForPackage := map[string]int{}
210	for _, f := range pkg.Files {
211		// start by initializing / clearing values
212		f.LicenseInfoInFiles = []string{"NOASSERTION"}
213		f.LicenseConcluded = "NOASSERTION"
214
215		// check whether the searcher should ignore this file
216		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
217			continue
218		}
219
220		fPath := filepath.Join(dirRoot, f.FileName)
221		// FIXME this is not preferable -- ignoring error
222		ids, _ := searchFileIDs(fPath)
223		// FIXME for now, proceed onwards with whatever IDs we obtained.
224		// FIXME instead of ignoring the error, should probably either log it,
225		// FIXME and/or enable the caller to configure what should happen.
226
227		// separate out for this file's licenses
228		licsForFile := map[string]int{}
229		licsParens := []string{}
230		for _, lid := range ids {
231			// get individual elements and add for file and package
232			licElements := getIndividualLicenses(lid)
233			for _, elt := range licElements {
234				licsForFile[elt] = 1
235				licsForPackage[elt] = 1
236			}
237			// parenthesize if needed and add to slice for joining
238			licsParens = append(licsParens, makeElement(lid))
239		}
240
241		// OK -- now we can fill in the file's details, or NOASSERTION if none
242		if len(licsForFile) > 0 {
243			f.LicenseInfoInFiles = []string{}
244			for lic := range licsForFile {
245				f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
246			}
247			sort.Strings(f.LicenseInfoInFiles)
248			// avoid adding parens and joining for single-ID items
249			if len(licsParens) == 1 {
250				f.LicenseConcluded = ids[0]
251			} else {
252				f.LicenseConcluded = strings.Join(licsParens, " AND ")
253			}
254		}
255	}
256
257	// and finally, we can fill in the package's details
258	if len(licsForPackage) == 0 {
259		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
260	} else {
261		pkg.PackageLicenseInfoFromFiles = []string{}
262		for lic := range licsForPackage {
263			pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
264		}
265		sort.Strings(pkg.PackageLicenseInfoFromFiles)
266	}
267
268	return doc, nil
269}
270
271// ===== 2.3 Searcher functions =====
272
// Config2_3 bundles the settings idsearcher uses when producing a
// version 2.3 SPDX Document. One value can be shared across any number
// of calls to BuildIDsDocument2_3.
type Config2_3 struct {
	// NamespacePrefix is the URI prefix of the namespace the SPDX
	// Document will be associated with. It ends up in the
	// DocumentNamespace field of the CreationInfo section, followed by
	// the per-Document package name and a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored names paths that must be left out of the built
	// document altogether. Every entry is a path relative to the
	// package's dirRoot: either a specific file or, with a trailing
	// slash, all files in a directory. An entry starting with "**"
	// matches that file / directory wherever it appears in the tree.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored names paths that stay in the document but are
	// not searched for short-form IDs by idsearcher. The format is the
	// same as for BuilderPathsIgnored.
	SearcherPathsIgnored []string
}
295
296// BuildIDsDocument2_3 creates an SPDX Document (version 2.3) and searches for
297// short-form IDs in each file, filling in license fields as appropriate. It
298// returns that document or error if any is encountered. Arguments:
299//   - packageName: name of package / directory
300//   - dirRoot: path to directory to be analyzed
301//   - namespacePrefix: URI representing a prefix for the
302//     namespace with which the SPDX Document will be associated
303func BuildIDsDocument2_3(packageName string, dirRoot string, idconfig *Config2_3) (*v2_3.Document, error) {
304	// first, build the Document using builder
305	bconfig := &builder.Config2_3{
306		NamespacePrefix: idconfig.NamespacePrefix,
307		CreatorType:     "Tool",
308		Creator:         "github.com/spdx/tools-golang/idsearcher",
309		PathsIgnored:    idconfig.BuilderPathsIgnored,
310	}
311	doc, err := builder.Build2_3(packageName, dirRoot, bconfig)
312	if err != nil {
313		return nil, err
314	}
315	if doc == nil {
316		return nil, fmt.Errorf("builder returned nil Document")
317	}
318	if doc.Packages == nil {
319		return nil, fmt.Errorf("builder returned nil Packages map")
320	}
321	if len(doc.Packages) != 1 {
322		return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
323	}
324
325	// now, walk through each file and find its licenses (if any)
326	pkg := doc.Packages[0]
327	if pkg == nil {
328		return nil, fmt.Errorf("builder returned nil Package")
329	}
330	if pkg.Files == nil {
331		return nil, fmt.Errorf("builder returned nil Files in Package")
332	}
333	licsForPackage := map[string]int{}
334	for _, f := range pkg.Files {
335		// start by initializing / clearing values
336		f.LicenseInfoInFiles = []string{"NOASSERTION"}
337		f.LicenseConcluded = "NOASSERTION"
338
339		// check whether the searcher should ignore this file
340		if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
341			continue
342		}
343
344		fPath := filepath.Join(dirRoot, f.FileName)
345		// FIXME this is not preferable -- ignoring error
346		ids, _ := searchFileIDs(fPath)
347		// FIXME for now, proceed onwards with whatever IDs we obtained.
348		// FIXME instead of ignoring the error, should probably either log it,
349		// FIXME and/or enable the caller to configure what should happen.
350
351		// separate out for this file's licenses
352		licsForFile := map[string]int{}
353		licsParens := []string{}
354		for _, lid := range ids {
355			// get individual elements and add for file and package
356			licElements := getIndividualLicenses(lid)
357			for _, elt := range licElements {
358				licsForFile[elt] = 1
359				licsForPackage[elt] = 1
360			}
361			// parenthesize if needed and add to slice for joining
362			licsParens = append(licsParens, makeElement(lid))
363		}
364
365		// OK -- now we can fill in the file's details, or NOASSERTION if none
366		if len(licsForFile) > 0 {
367			f.LicenseInfoInFiles = []string{}
368			for lic := range licsForFile {
369				f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic)
370			}
371			sort.Strings(f.LicenseInfoInFiles)
372			// avoid adding parens and joining for single-ID items
373			if len(licsParens) == 1 {
374				f.LicenseConcluded = ids[0]
375			} else {
376				f.LicenseConcluded = strings.Join(licsParens, " AND ")
377			}
378		}
379	}
380
381	// and finally, we can fill in the package's details
382	if len(licsForPackage) == 0 {
383		pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
384	} else {
385		pkg.PackageLicenseInfoFromFiles = []string{}
386		for lic := range licsForPackage {
387			pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
388		}
389		sort.Strings(pkg.PackageLicenseInfoFromFiles)
390	}
391
392	return doc, nil
393}
394
395// ===== Utility functions (not version-specific) =====
396func searchFileIDs(filePath string) ([]string, error) {
397	idsMap := map[string]int{}
398	ids := []string{}
399
400	f, err := os.Open(filePath)
401	if err != nil {
402		return nil, err
403	}
404	defer f.Close()
405
406	scanner := bufio.NewScanner(f)
407
408	for scanner.Scan() {
409		if strings.Contains(scanner.Text(), "SPDX-License-Identifier:") {
410			strs := strings.SplitN(scanner.Text(), "SPDX-License-Identifier:", 2)
411
412			// if prefixed by more than n characters, it's probably not a
413			// short-form ID; it's probably code to detect short-form IDs.
414			// Like this function itself, for example  =)
415			prefix := stripTrash(strs[0])
416			if len(prefix) > 5 {
417				continue
418			}
419
420			// stop before trailing */ if it is present
421			lidToExtract := strs[1]
422			lidToExtract = strings.Split(lidToExtract, "*/")[0]
423			lid := strings.TrimSpace(lidToExtract)
424			lid = stripTrash(lid)
425			idsMap[lid] = 1
426		}
427	}
428
429	// FIXME for now, ignore scanner errors because we want to return whatever
430	// FIXME IDs were in fact found. should probably be changed to either
431	// FIXME log the error, and/or be configurable for what should happen.
432	// if err = scanner.Err(); err != nil {
433	// 	return nil, err
434	// }
435
436	// now, convert map to string
437	for lid := range idsMap {
438		ids = append(ids, lid)
439	}
440
441	// and sort it
442	sort.Strings(ids)
443
444	return ids, nil
445}
446
// nonIDChars matches every run of characters other than ASCII word
// characters, whitespace, digits, '.', '-', '+', '(' and ')'. It is compiled
// once at package scope rather than on every stripTrash call.
var nonIDChars = regexp.MustCompile(`[^\w\s\d.\-\+()]+`)

// stripTrash returns lid with every character removed that is not an ASCII
// word character, whitespace, a digit, '.', '-', '+', '(' or ')'. Whitespace
// is preserved, so the result may still need trimming.
func stripTrash(lid string) string {
	return nonIDChars.ReplaceAllString(lid, "")
}
451
// makeElement returns lic ready to be combined with other expressions:
// if lic contains an " AND " or " OR " operator (upper case, surrounded by
// single spaces) it is wrapped in parentheses, otherwise it is returned
// unchanged.
func makeElement(lic string) string {
	for _, op := range []string{" AND ", " OR "} {
		if strings.Contains(lic, op) {
			return fmt.Sprintf("(%s)", lic)
		}
	}
	return lic
}
459
// getIndividualLicenses breaks a license expression into the individual
// identifiers it mentions. Parentheses and '+' are treated as separators,
// the expression is split on any run of whitespace (spaces, tabs, ...), and
// the operators AND, OR and WITH are dropped regardless of case.
//
// The returned slice is sorted. It keeps duplicates if an identifier appears
// more than once, and it is empty but non-nil when nothing remains.
func getIndividualLicenses(lic string) []string {
	// replace parens and '+' with spaces so "(MIT)" and "GPL-2.0+" reduce
	// to the bare identifiers
	cleaned := strings.NewReplacer("(", " ", ")", " ", "+", " ").Replace(lic)

	// Fields splits on every kind of whitespace and never yields empty
	// elements, so tab-separated expressions are handled as well
	fields := strings.Fields(cleaned)
	lics := make([]string, 0, len(fields))
	for _, elt := range fields {
		// don't add case-insensitive operators
		if strings.EqualFold(elt, "AND") || strings.EqualFold(elt, "OR") ||
			strings.EqualFold(elt, "WITH") {
			continue
		}
		lics = append(lics, elt)
	}

	// sort before returning
	sort.Strings(lics)
	return lics
}
484