xref: /aosp_15_r20/external/licenseclassifier/serializer/serializer.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package serializer normalizes the license text and calculates the hash
16// values for all substrings in the license. It then outputs the normalized
17// text and hashes to disk in a compressed archive.
18package serializer
19
20import (
21	"archive/tar"
22	"bytes"
23	"compress/gzip"
24	"io"
25	"log"
26	"path/filepath"
27	"strings"
28
29	"github.com/google/licenseclassifier"
30	"github.com/google/licenseclassifier/stringclassifier/searchset"
31)
32
33// ArchiveLicenses takes all of the known license texts, normalizes them, then
34// calculates the hash values of all substrings. The resulting normalized text
35// and hashed substring values are then serialized into an archive file.
36func ArchiveLicenses(licenses []string, w io.Writer) error {
37	gw := gzip.NewWriter(w)
38	defer gw.Close()
39
40	tw := tar.NewWriter(gw)
41	for _, license := range licenses {
42		// All license files have a ".txt" extension.
43		ext := filepath.Ext(license)
44		if ext != ".txt" {
45			continue
46		}
47
48		contents, err := licenseclassifier.ReadLicenseFile(license)
49		if err != nil {
50			return err
51		}
52
53		str := licenseclassifier.TrimExtraneousTrailingText(string(contents))
54		for _, n := range licenseclassifier.Normalizers {
55			str = n(str)
56		}
57
58		baseName := strings.TrimSuffix(filepath.Base(license), ext)
59
60		// Serialize the normalized license text.
61		log.Printf("Serializing %q", baseName)
62		hdr := &tar.Header{
63			Name: filepath.Base(license),
64			Mode: 0644,
65			Size: int64(len(str)),
66		}
67
68		if err := tw.WriteHeader(hdr); err != nil {
69			return err
70		}
71		if _, err := tw.Write([]byte(str)); err != nil {
72			return err
73		}
74
75		// Calculate the substrings' checksums
76		set := searchset.New(str, searchset.DefaultGranularity)
77
78		var s bytes.Buffer
79		if err := set.Serialize(&s); err != nil {
80			return err
81		}
82
83		// Serialize the checksums.
84		hdr = &tar.Header{
85			Name: baseName + ".hash",
86			Mode: 0644,
87			Size: int64(s.Len()),
88		}
89
90		if err := tw.WriteHeader(hdr); err != nil {
91			return err
92		}
93		if _, err := tw.Write(s.Bytes()); err != nil {
94			return err
95		}
96	}
97
98	return tw.Close()
99}
100