xref: /aosp_15_r20/development/tools/external_crates/license_checker/src/content_checker.rs (revision 90c8c64db3049935a07c6143d7fd006e26f8ecca)
1 // Copyright (C) 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 use gestalt_ratio::gestalt_ratio;
16 use itertools::Itertools;
17 use spdx::{LicenseReq, Licensee};
18 use std::sync::LazyLock;
19 
/// Normalizes `text` so that license texts can be compared loosely.
///
/// The input is lowercased; alphanumeric characters and `.` are kept as-is, and every run
/// of any other characters (punctuation, whitespace, newlines) collapses into a single
/// space. Leading and trailing spaces are removed from the result.
fn strip_punctuation(text: &str) -> String {
    let collapsed = text.to_lowercase().chars().fold(
        String::with_capacity(text.len()),
        |mut acc, ch| {
            match ch {
                '.' => acc.push('.'),
                _ if ch.is_alphanumeric() => acc.push(ch),
                // Already inside a run of separators: emit nothing more.
                _ if acc.ends_with(' ') => {}
                // First separator after kept text (or at the very start).
                _ => acc.push(' '),
            }
            acc
        },
    );
    collapsed.trim().to_string()
}
32 
33 // TODO: It's possible for some license files to contain multiple licenses concatenated together
classify_license_file_contents(contents: &str) -> Option<LicenseReq>34 pub(crate) fn classify_license_file_contents(contents: &str) -> Option<LicenseReq> {
35     let contents = strip_punctuation(contents);
36 
37     // Exact match
38     for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter() {
39         if contents.contains(required_text) {
40             return Some(req.clone());
41         }
42     }
43 
44     // Fuzzy match. This is expensive, so start with licenses that are closest in length to the file.
45     for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter().sorted_by(|a, b| {
46         let mut ra = a.1.len() as f32 / contents.len() as f32;
47         let mut rb = b.1.len() as f32 / contents.len() as f32;
48         if ra > 1.0 {
49             ra = 1.0 / ra;
50         }
51         if rb > 1.0 {
52             rb = 1.0 / rb;
53         }
54         rb.partial_cmp(&ra).unwrap()
55     }) {
56         let similarity = gestalt_ratio(&contents, required_text);
57         if similarity > 0.95 {
58             return Some(req.clone());
59         }
60     }
61 
62     None
63 }
64 
65 static LICENSE_CONTENT_CLASSIFICATION: LazyLock<Vec<(LicenseReq, String)>> = LazyLock::new(|| {
66     vec![
67         ("MIT", include_str!("licenses/MIT.txt")),
68         ("Apache-2.0", include_str!("licenses/Apache-2.0.txt")),
69         ("ISC", include_str!("licenses/ISC.txt")),
70         ("MPL-2.0", include_str!("licenses/MPL-2.0.txt")),
71         ("BSD-2-Clause", include_str!("licenses/BSD-2-Clause.txt")),
72         ("BSD-3-Clause", include_str!("licenses/BSD-3-Clause.txt")),
73         ("Unlicense", include_str!("licenses/Unlicense.txt")),
74         ("Zlib", include_str!("licenses/Zlib.txt")),
75     ]
76     .into_iter()
77     .map(|(req, tokens)| {
78         let tokens = strip_punctuation(tokens);
79         assert!(!tokens.is_empty());
80         (Licensee::parse(req).unwrap().into_req(), tokens)
81     })
82     .collect()
83 });
84 
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises each normalization rule of `strip_punctuation` on a small literal input.
    #[test]
    fn test_strip_punctuation() {
        // (input, expected output, rule being checked)
        let cases = [
            ("FOO BAR", "foo bar", "Converted to lowercase"),
            ("foo, bar", "foo bar", "Punctuation removed"),
            ("foo. bar", "foo. bar", "Periods preserved"),
            (" foo bar ", "foo bar", "Leading and trailing whitespace stripped"),
            (" foo\n\n\n\nbar ", "foo bar", "Multiple whitespace replaced with single space"),
        ];
        for (input, expected, rule) in cases {
            assert_eq!(strip_punctuation(input), expected, "{rule}");
        }
    }

    /// Checks that unrelated text is rejected and that two sample license files are
    /// classified as the expected licenses.
    #[test]
    fn test_classify() {
        let expected_req = |id: &str| Some(Licensee::parse(id).unwrap().into_req());

        assert!(classify_license_file_contents("foo").is_none());

        let bindgen_license = include_str!("testdata/BSD-3-Clause-bindgen.txt");
        assert_eq!(classify_license_file_contents(bindgen_license), expected_req("BSD-3-Clause"));

        let aarch64_paging_license = include_str!("testdata/LICENSE-MIT-aarch64-paging.txt");
        assert_eq!(classify_license_file_contents(aarch64_paging_license), expected_req("MIT"));
    }
}
119