1// Package idsearcher is used to search for short-form IDs in files 2// within a directory, and to build an SPDX Document containing those 3// license findings. 4// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later 5package idsearcher 6 7import ( 8 "bufio" 9 "fmt" 10 "github.com/spdx/tools-golang/spdx/v2_3" 11 "os" 12 "path/filepath" 13 "regexp" 14 "sort" 15 "strings" 16 17 "github.com/spdx/tools-golang/builder" 18 "github.com/spdx/tools-golang/spdx/v2_1" 19 "github.com/spdx/tools-golang/spdx/v2_2" 20 "github.com/spdx/tools-golang/utils" 21) 22 23// ===== 2.1 Searcher functions ===== 24 25// Config2_1 is a collection of configuration settings for docbuilder 26// (for version 2.1 SPDX Documents). A few mandatory fields are set here 27// so that they can be repeatedly reused in multiple calls to Build2_1. 28type Config2_1 struct { 29 // NamespacePrefix should be a URI representing a prefix for the 30 // namespace with which the SPDX Document will be associated. 31 // It will be used in the DocumentNamespace field in the CreationInfo 32 // section, followed by the per-Document package name and a random UUID. 33 NamespacePrefix string 34 35 // BuilderPathsIgnored lists certain paths to be omitted from the built 36 // document. Each string should be a path, relative to the package's 37 // dirRoot, to a specific file or (for all files in a directory) ending 38 // in a slash. Prefix the string with "**" to omit all instances of that 39 // file / directory, regardless of where it is in the file tree. 40 BuilderPathsIgnored []string 41 42 // SearcherPathsIgnored lists certain paths that should not be searched 43 // by idsearcher, even if those paths have Files present. It uses the 44 // same format as BuilderPathsIgnored. 45 SearcherPathsIgnored []string 46} 47 48// BuildIDsDocument2_1 creates an SPDX Document (version 2.1) and searches for 49// short-form IDs in each file, filling in license fields as appropriate. It 50// returns that document or error if any is encountered. Arguments: 51// - packageName: name of package / directory 52// - dirRoot: path to directory to be analyzed 53// - namespacePrefix: URI representing a prefix for the 54// namespace with which the SPDX Document will be associated 55func BuildIDsDocument2_1(packageName string, dirRoot string, idconfig *Config2_1) (*v2_1.Document, error) { 56 // first, build the Document using builder 57 bconfig := &builder.Config2_1{ 58 NamespacePrefix: idconfig.NamespacePrefix, 59 CreatorType: "Tool", 60 Creator: "github.com/spdx/tools-golang/idsearcher", 61 PathsIgnored: idconfig.BuilderPathsIgnored, 62 } 63 doc, err := builder.Build2_1(packageName, dirRoot, bconfig) 64 if err != nil { 65 return nil, err 66 } 67 if doc == nil { 68 return nil, fmt.Errorf("builder returned nil Document") 69 } 70 if doc.Packages == nil { 71 return nil, fmt.Errorf("builder returned nil Packages map") 72 } 73 if len(doc.Packages) != 1 { 74 return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) 75 } 76 77 // now, walk through each file and find its licenses (if any) 78 pkg := doc.Packages[0] 79 if pkg == nil { 80 return nil, fmt.Errorf("builder returned nil Package") 81 } 82 if pkg.Files == nil { 83 return nil, fmt.Errorf("builder returned nil Files in Package") 84 } 85 licsForPackage := map[string]int{} 86 for _, f := range pkg.Files { 87 // start by initializing / clearing values 88 f.LicenseInfoInFiles = []string{"NOASSERTION"} 89 f.LicenseConcluded = "NOASSERTION" 90 91 // check whether the searcher should ignore this file 92 if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { 93 continue 94 } 95 96 fPath := filepath.Join(dirRoot, f.FileName) 97 // FIXME this is not preferable -- ignoring error 98 ids, _ := searchFileIDs(fPath) 99 // FIXME for now, proceed onwards with whatever IDs we obtained. 100 // FIXME instead of ignoring the error, should probably either log it, 101 // FIXME and/or enable the caller to configure what should happen. 102 103 // separate out for this file's licenses 104 licsForFile := map[string]int{} 105 licsParens := []string{} 106 for _, lid := range ids { 107 // get individual elements and add for file and package 108 licElements := getIndividualLicenses(lid) 109 for _, elt := range licElements { 110 licsForFile[elt] = 1 111 licsForPackage[elt] = 1 112 } 113 // parenthesize if needed and add to slice for joining 114 licsParens = append(licsParens, makeElement(lid)) 115 } 116 117 // OK -- now we can fill in the file's details, or NOASSERTION if none 118 if len(licsForFile) > 0 { 119 f.LicenseInfoInFiles = []string{} 120 for lic := range licsForFile { 121 f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) 122 } 123 sort.Strings(f.LicenseInfoInFiles) 124 // avoid adding parens and joining for single-ID items 125 if len(licsParens) == 1 { 126 f.LicenseConcluded = ids[0] 127 } else { 128 f.LicenseConcluded = strings.Join(licsParens, " AND ") 129 } 130 } 131 } 132 133 // and finally, we can fill in the package's details 134 if len(licsForPackage) == 0 { 135 pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} 136 } else { 137 pkg.PackageLicenseInfoFromFiles = []string{} 138 for lic := range licsForPackage { 139 pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) 140 } 141 sort.Strings(pkg.PackageLicenseInfoFromFiles) 142 } 143 144 return doc, nil 145} 146 147// ===== 2.2 Searcher functions ===== 148 149// Config2_2 is a collection of configuration settings for docbuilder 150// (for version 2.2 SPDX Documents). A few mandatory fields are set here 151// so that they can be repeatedly reused in multiple calls to Build2_2. 152type Config2_2 struct { 153 // NamespacePrefix should be a URI representing a prefix for the 154 // namespace with which the SPDX Document will be associated. 155 // It will be used in the DocumentNamespace field in the CreationInfo 156 // section, followed by the per-Document package name and a random UUID. 157 NamespacePrefix string 158 159 // BuilderPathsIgnored lists certain paths to be omitted from the built 160 // document. Each string should be a path, relative to the package's 161 // dirRoot, to a specific file or (for all files in a directory) ending 162 // in a slash. Prefix the string with "**" to omit all instances of that 163 // file / directory, regardless of where it is in the file tree. 164 BuilderPathsIgnored []string 165 166 // SearcherPathsIgnored lists certain paths that should not be searched 167 // by idsearcher, even if those paths have Files present. It uses the 168 // same format as BuilderPathsIgnored. 169 SearcherPathsIgnored []string 170} 171 172// BuildIDsDocument2_2 creates an SPDX Document (version 2.2) and searches for 173// short-form IDs in each file, filling in license fields as appropriate. It 174// returns that document or error if any is encountered. Arguments: 175// - packageName: name of package / directory 176// - dirRoot: path to directory to be analyzed 177// - namespacePrefix: URI representing a prefix for the 178// namespace with which the SPDX Document will be associated 179func BuildIDsDocument2_2(packageName string, dirRoot string, idconfig *Config2_2) (*v2_2.Document, error) { 180 // first, build the Document using builder 181 bconfig := &builder.Config2_2{ 182 NamespacePrefix: idconfig.NamespacePrefix, 183 CreatorType: "Tool", 184 Creator: "github.com/spdx/tools-golang/idsearcher", 185 PathsIgnored: idconfig.BuilderPathsIgnored, 186 } 187 doc, err := builder.Build2_2(packageName, dirRoot, bconfig) 188 if err != nil { 189 return nil, err 190 } 191 if doc == nil { 192 return nil, fmt.Errorf("builder returned nil Document") 193 } 194 if doc.Packages == nil { 195 return nil, fmt.Errorf("builder returned nil Packages map") 196 } 197 if len(doc.Packages) != 1 { 198 return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) 199 } 200 201 // now, walk through each file and find its licenses (if any) 202 pkg := doc.Packages[0] 203 if pkg == nil { 204 return nil, fmt.Errorf("builder returned nil Package") 205 } 206 if pkg.Files == nil { 207 return nil, fmt.Errorf("builder returned nil Files in Package") 208 } 209 licsForPackage := map[string]int{} 210 for _, f := range pkg.Files { 211 // start by initializing / clearing values 212 f.LicenseInfoInFiles = []string{"NOASSERTION"} 213 f.LicenseConcluded = "NOASSERTION" 214 215 // check whether the searcher should ignore this file 216 if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { 217 continue 218 } 219 220 fPath := filepath.Join(dirRoot, f.FileName) 221 // FIXME this is not preferable -- ignoring error 222 ids, _ := searchFileIDs(fPath) 223 // FIXME for now, proceed onwards with whatever IDs we obtained. 224 // FIXME instead of ignoring the error, should probably either log it, 225 // FIXME and/or enable the caller to configure what should happen. 226 227 // separate out for this file's licenses 228 licsForFile := map[string]int{} 229 licsParens := []string{} 230 for _, lid := range ids { 231 // get individual elements and add for file and package 232 licElements := getIndividualLicenses(lid) 233 for _, elt := range licElements { 234 licsForFile[elt] = 1 235 licsForPackage[elt] = 1 236 } 237 // parenthesize if needed and add to slice for joining 238 licsParens = append(licsParens, makeElement(lid)) 239 } 240 241 // OK -- now we can fill in the file's details, or NOASSERTION if none 242 if len(licsForFile) > 0 { 243 f.LicenseInfoInFiles = []string{} 244 for lic := range licsForFile { 245 f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) 246 } 247 sort.Strings(f.LicenseInfoInFiles) 248 // avoid adding parens and joining for single-ID items 249 if len(licsParens) == 1 { 250 f.LicenseConcluded = ids[0] 251 } else { 252 f.LicenseConcluded = strings.Join(licsParens, " AND ") 253 } 254 } 255 } 256 257 // and finally, we can fill in the package's details 258 if len(licsForPackage) == 0 { 259 pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} 260 } else { 261 pkg.PackageLicenseInfoFromFiles = []string{} 262 for lic := range licsForPackage { 263 pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) 264 } 265 sort.Strings(pkg.PackageLicenseInfoFromFiles) 266 } 267 268 return doc, nil 269} 270 271// ===== 2.3 Searcher functions ===== 272 273// Config2_3 is a collection of configuration settings for docbuilder 274// (for version 2.3 SPDX Documents). A few mandatory fields are set here 275// so that they can be repeatedly reused in multiple calls to Build2_3. 276type Config2_3 struct { 277 // NamespacePrefix should be a URI representing a prefix for the 278 // namespace with which the SPDX Document will be associated. 279 // It will be used in the DocumentNamespace field in the CreationInfo 280 // section, followed by the per-Document package name and a random UUID. 281 NamespacePrefix string 282 283 // BuilderPathsIgnored lists certain paths to be omitted from the built 284 // document. Each string should be a path, relative to the package's 285 // dirRoot, to a specific file or (for all files in a directory) ending 286 // in a slash. Prefix the string with "**" to omit all instances of that 287 // file / directory, regardless of where it is in the file tree. 288 BuilderPathsIgnored []string 289 290 // SearcherPathsIgnored lists certain paths that should not be searched 291 // by idsearcher, even if those paths have Files present. It uses the 292 // same format as BuilderPathsIgnored. 293 SearcherPathsIgnored []string 294} 295 296// BuildIDsDocument2_3 creates an SPDX Document (version 2.3) and searches for 297// short-form IDs in each file, filling in license fields as appropriate. It 298// returns that document or error if any is encountered. Arguments: 299// - packageName: name of package / directory 300// - dirRoot: path to directory to be analyzed 301// - namespacePrefix: URI representing a prefix for the 302// namespace with which the SPDX Document will be associated 303func BuildIDsDocument2_3(packageName string, dirRoot string, idconfig *Config2_3) (*v2_3.Document, error) { 304 // first, build the Document using builder 305 bconfig := &builder.Config2_3{ 306 NamespacePrefix: idconfig.NamespacePrefix, 307 CreatorType: "Tool", 308 Creator: "github.com/spdx/tools-golang/idsearcher", 309 PathsIgnored: idconfig.BuilderPathsIgnored, 310 } 311 doc, err := builder.Build2_3(packageName, dirRoot, bconfig) 312 if err != nil { 313 return nil, err 314 } 315 if doc == nil { 316 return nil, fmt.Errorf("builder returned nil Document") 317 } 318 if doc.Packages == nil { 319 return nil, fmt.Errorf("builder returned nil Packages map") 320 } 321 if len(doc.Packages) != 1 { 322 return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) 323 } 324 325 // now, walk through each file and find its licenses (if any) 326 pkg := doc.Packages[0] 327 if pkg == nil { 328 return nil, fmt.Errorf("builder returned nil Package") 329 } 330 if pkg.Files == nil { 331 return nil, fmt.Errorf("builder returned nil Files in Package") 332 } 333 licsForPackage := map[string]int{} 334 for _, f := range pkg.Files { 335 // start by initializing / clearing values 336 f.LicenseInfoInFiles = []string{"NOASSERTION"} 337 f.LicenseConcluded = "NOASSERTION" 338 339 // check whether the searcher should ignore this file 340 if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { 341 continue 342 } 343 344 fPath := filepath.Join(dirRoot, f.FileName) 345 // FIXME this is not preferable -- ignoring error 346 ids, _ := searchFileIDs(fPath) 347 // FIXME for now, proceed onwards with whatever IDs we obtained. 348 // FIXME instead of ignoring the error, should probably either log it, 349 // FIXME and/or enable the caller to configure what should happen. 350 351 // separate out for this file's licenses 352 licsForFile := map[string]int{} 353 licsParens := []string{} 354 for _, lid := range ids { 355 // get individual elements and add for file and package 356 licElements := getIndividualLicenses(lid) 357 for _, elt := range licElements { 358 licsForFile[elt] = 1 359 licsForPackage[elt] = 1 360 } 361 // parenthesize if needed and add to slice for joining 362 licsParens = append(licsParens, makeElement(lid)) 363 } 364 365 // OK -- now we can fill in the file's details, or NOASSERTION if none 366 if len(licsForFile) > 0 { 367 f.LicenseInfoInFiles = []string{} 368 for lic := range licsForFile { 369 f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) 370 } 371 sort.Strings(f.LicenseInfoInFiles) 372 // avoid adding parens and joining for single-ID items 373 if len(licsParens) == 1 { 374 f.LicenseConcluded = ids[0] 375 } else { 376 f.LicenseConcluded = strings.Join(licsParens, " AND ") 377 } 378 } 379 } 380 381 // and finally, we can fill in the package's details 382 if len(licsForPackage) == 0 { 383 pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} 384 } else { 385 pkg.PackageLicenseInfoFromFiles = []string{} 386 for lic := range licsForPackage { 387 pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) 388 } 389 sort.Strings(pkg.PackageLicenseInfoFromFiles) 390 } 391 392 return doc, nil 393} 394 395// ===== Utility functions (not version-specific) ===== 396func searchFileIDs(filePath string) ([]string, error) { 397 idsMap := map[string]int{} 398 ids := []string{} 399 400 f, err := os.Open(filePath) 401 if err != nil { 402 return nil, err 403 } 404 defer f.Close() 405 406 scanner := bufio.NewScanner(f) 407 408 for scanner.Scan() { 409 if strings.Contains(scanner.Text(), "SPDX-License-Identifier:") { 410 strs := strings.SplitN(scanner.Text(), "SPDX-License-Identifier:", 2) 411 412 // if prefixed by more than n characters, it's probably not a 413 // short-form ID; it's probably code to detect short-form IDs. 414 // Like this function itself, for example =) 415 prefix := stripTrash(strs[0]) 416 if len(prefix) > 5 { 417 continue 418 } 419 420 // stop before trailing */ if it is present 421 lidToExtract := strs[1] 422 lidToExtract = strings.Split(lidToExtract, "*/")[0] 423 lid := strings.TrimSpace(lidToExtract) 424 lid = stripTrash(lid) 425 idsMap[lid] = 1 426 } 427 } 428 429 // FIXME for now, ignore scanner errors because we want to return whatever 430 // FIXME IDs were in fact found. should probably be changed to either 431 // FIXME log the error, and/or be configurable for what should happen. 432 // if err = scanner.Err(); err != nil { 433 // return nil, err 434 // } 435 436 // now, convert map to string 437 for lid := range idsMap { 438 ids = append(ids, lid) 439 } 440 441 // and sort it 442 sort.Strings(ids) 443 444 return ids, nil 445} 446 447func stripTrash(lid string) string { 448 re := regexp.MustCompile(`[^\w\s\d.\-\+()]+`) 449 return re.ReplaceAllString(lid, "") 450} 451 452func makeElement(lic string) string { 453 if strings.Contains(lic, " AND ") || strings.Contains(lic, " OR ") { 454 return fmt.Sprintf("(%s)", lic) 455 } 456 457 return lic 458} 459 460func getIndividualLicenses(lic string) []string { 461 // replace parens and '+' with spaces 462 lic = strings.Replace(lic, "(", " ", -1) 463 lic = strings.Replace(lic, ")", " ", -1) 464 lic = strings.Replace(lic, "+", " ", -1) 465 466 // now, split by spaces, trim, and add to slice 467 licElements := strings.Split(lic, " ") 468 lics := []string{} 469 for _, elt := range licElements { 470 elt := strings.TrimSpace(elt) 471 // don't add if empty or if case-insensitive operator 472 if elt == "" || strings.EqualFold(elt, "AND") || 473 strings.EqualFold(elt, "OR") || strings.EqualFold(elt, "WITH") { 474 continue 475 } 476 477 lics = append(lics, elt) 478 } 479 480 // sort before returning 481 sort.Strings(lics) 482 return lics 483} 484