1 // Copyright (C) 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 use gestalt_ratio::gestalt_ratio;
16 use itertools::Itertools;
17 use spdx::{LicenseReq, Licensee};
18 use std::sync::LazyLock;
19
/// Normalizes license text so that two copies can be compared.
///
/// The text is lowercased, every run of characters that are neither alphanumeric nor `.` is
/// collapsed into a single space, and leading/trailing separators are dropped.
fn strip_punctuation(text: &str) -> String {
    text.to_lowercase()
        // Anything other than an alphanumeric character or a period separates words.
        .split(|c: char| !(c.is_alphanumeric() || c == '.'))
        // Adjacent separators produce empty pieces; dropping them collapses each run.
        .filter(|word| !word.is_empty())
        .collect::<Vec<_>>()
        .join(" ")
}
32
33 // TODO: It's possible for some license files to contain multiple licenses concatenated together
classify_license_file_contents(contents: &str) -> Option<LicenseReq>34 pub(crate) fn classify_license_file_contents(contents: &str) -> Option<LicenseReq> {
35 let contents = strip_punctuation(contents);
36
37 // Exact match
38 for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter() {
39 if contents.contains(required_text) {
40 return Some(req.clone());
41 }
42 }
43
44 // Fuzzy match. This is expensive, so start with licenses that are closest in length to the file.
45 for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter().sorted_by(|a, b| {
46 let mut ra = a.1.len() as f32 / contents.len() as f32;
47 let mut rb = b.1.len() as f32 / contents.len() as f32;
48 if ra > 1.0 {
49 ra = 1.0 / ra;
50 }
51 if rb > 1.0 {
52 rb = 1.0 / rb;
53 }
54 rb.partial_cmp(&ra).unwrap()
55 }) {
56 let similarity = gestalt_ratio(&contents, required_text);
57 if similarity > 0.95 {
58 return Some(req.clone());
59 }
60 }
61
62 None
63 }
64
65 static LICENSE_CONTENT_CLASSIFICATION: LazyLock<Vec<(LicenseReq, String)>> = LazyLock::new(|| {
66 vec![
67 ("MIT", include_str!("licenses/MIT.txt")),
68 ("Apache-2.0", include_str!("licenses/Apache-2.0.txt")),
69 ("ISC", include_str!("licenses/ISC.txt")),
70 ("MPL-2.0", include_str!("licenses/MPL-2.0.txt")),
71 ("BSD-2-Clause", include_str!("licenses/BSD-2-Clause.txt")),
72 ("BSD-3-Clause", include_str!("licenses/BSD-3-Clause.txt")),
73 ("Unlicense", include_str!("licenses/Unlicense.txt")),
74 ("Zlib", include_str!("licenses/Zlib.txt")),
75 ]
76 .into_iter()
77 .map(|(req, tokens)| {
78 let tokens = strip_punctuation(tokens);
79 assert!(!tokens.is_empty());
80 (Licensee::parse(req).unwrap().into_req(), tokens)
81 })
82 .collect()
83 });
84
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises each normalization rule of `strip_punctuation` on a minimal input.
    #[test]
    fn test_strip_punctuation() {
        // (input, expected output, rule being exercised)
        let cases = [
            ("FOO BAR", "foo bar", "Converted to lowercase"),
            ("foo, bar", "foo bar", "Punctuation removed"),
            ("foo. bar", "foo. bar", "Periods preserved"),
            (" foo bar ", "foo bar", "Leading and trailing whitespace stripped"),
            (" foo\n\n\n\nbar ", "foo bar", "Multiple whitespace replaced with single space"),
        ];
        for (input, expected, rule) in cases {
            assert_eq!(strip_punctuation(input), expected, "{rule}");
        }
    }

    /// Unrelated text is rejected, and the license files under `testdata/` are classified
    /// as their expected SPDX identifiers.
    #[test]
    fn test_classify() {
        assert!(classify_license_file_contents("foo").is_none());

        // (license file contents, expected SPDX identifier)
        let fixtures = [
            (include_str!("testdata/BSD-3-Clause-bindgen.txt"), "BSD-3-Clause"),
            (include_str!("testdata/LICENSE-MIT-aarch64-paging.txt"), "MIT"),
        ];
        for (contents, spdx_id) in fixtures {
            assert_eq!(
                classify_license_file_contents(contents),
                Some(Licensee::parse(spdx_id).unwrap().into_req())
            );
        }
    }
}
119