xref: /aosp_15_r20/external/perfetto/src/trace_processor/util/zip_reader.h (revision 6dbdd20afdafa5e3ca9b8809fa73465d530080dc)
1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
18 #define SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
19 
20 #include <cstddef>
21 #include <cstdint>
22 #include <functional>
23 #include <optional>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "perfetto/base/status.h"
29 #include "perfetto/ext/base/status_or.h"
30 #include "perfetto/ext/base/string_view.h"
31 #include "perfetto/trace_processor/trace_blob_view.h"
32 #include "src/trace_processor/util/gzip_utils.h"
33 #include "src/trace_processor/util/trace_blob_view_reader.h"
34 
35 // ZipReader allows reading Zip files in a streaming fashion.
36 // Key features:
37 // - Read-only access, there is no ZipWriter.
38 // - Files can be processed as they are seen in the zip archive, without needing
39 //   to see the whole .zip file first.
40 // - It does not read the final zip central directory. Only the metadata in the
41 //   inline file headers is exposed.
42 // - Only the compressed payload is kept around in memory.
43 // - Supports line-based streaming for compressed text files (e.g. logs). This
44 //   enables line-based processing of compressed logs without having to
45 //   fully decompress each individual text file in memory.
46 // - Does NOT support zip64, encryption and other advanced zip file features.
47 // - It is not suitable for security-sensitive contexts. E.g. it doesn't deal
48 //   with zip path traversal attacks, or with the same file showing up twice
49 //   with two different payloads.
50 //
51 // Possible future features:
52 // - The user could set up a filter (a glob, or a callback) to select the
53 //   interesting files (e.g. *.txt) and skip appending the other entries.
54 //   This would completely avoid the cost of keeping in memory the compressed
55 //   payload of unwanted files (e.g. dumpstate.bin in bugreports).
56 
57 namespace perfetto::trace_processor::util {
58 
// Forward declaration: ZipFile names ZipReader as a friend before ZipReader is
// defined further down in this header.
59 class ZipReader;
60 
// Size in bytes of the fixed-width part of a zip file header. It equals the
// combined width of the integer fields of ZipFile::Header
// (4+2+2+2+4+2+2+4+4+2+2 = 30); the variable-length file name is not counted.
// NOTE(review): presumably the number of bytes the parser needs before it can
// decode a header -- confirm against the .cc file.
// NOTE(review): as a namespace-scope constant in a header, `inline constexpr`
// would give all translation units a single shared definition.
61 constexpr size_t kZipFileHdrSize = 30;
62 
63 // Holds the metadata and compressed payload of a zip file and allows
64 // decompression. The lifecycle of a ZipFile is completely independent of the
65 // ZipReader that created it. ZipFile(s) can be std::move(d) around and even
66 // outlive the ZipReader.
// The type is move-only: copy construction and copy assignment are deleted.
67 class ZipFile {
68  public:
69   // Note: the lifetime of the lines passed in the vector argument is valid only
70   // for the duration of the callback. Don't retain the StringView(s) passed.
71   using LinesCallback =
72       std::function<void(const std::vector<base::StringView>&)>;
73 
  // Special members. All of them are only declared here and defined out of
  // line. The move operations are noexcept, which lets containers such as
  // std::vector<ZipFile> move (rather than copy) elements when they grow.
74   ZipFile();
75   ~ZipFile();
76   ZipFile(ZipFile&&) noexcept;
77   ZipFile& operator=(ZipFile&&) noexcept;
78   ZipFile(const ZipFile&) = delete;
79   ZipFile& operator=(const ZipFile&) = delete;
80 
81   // Bulk decompression. It keeps around the compressed data internally, so
82   // this can be called several times.
  // The result is written through the vector pointer argument and the outcome
  // is reported via the returned base::Status.
  // NOTE(review): whether the vector is overwritten or appended to is not
  // visible from this header -- confirm against the .cc file.
83   base::Status Decompress(std::vector<uint8_t>*) const;
84 
85   // Streaming line-based decompression for text files.
86   // It decompresses the file in chunks and passes batches of lines to the
87   // caller, without decompressing the whole file into memory.
88   // The typical use case is processing large log files from a bugreport.
89   // Like the above, this is idempotent and keeps around the compressed data.
  // See LinesCallback for the lifetime of the lines handed to the callback.
90   base::Status DecompressLines(LinesCallback) const;
91 
92   // File name, including the relative path (e.g., "FS/data/misc/foobar")
  // Returned by reference to the string stored in the header, so the reference
  // is only usable while this ZipFile is alive and not moved from.
name()93   const std::string& name() const { return hdr_.fname; }
94 
95   // Seconds since the Epoch. This is effectively time_t on 64 bit platforms.
  // NOTE(review): presumably decoded from the header's mtime/mdate fields --
  // confirm against the .cc file.
96   int64_t GetDatetime() const;
97 
98   // Returns the modified time in the format %Y-%m-%d %H:%M:%S.
  // NOTE(review): the time zone used for formatting is not visible here.
99   std::string GetDatetimeStr() const;
100 
  // Sizes as recorded in the header, widened from uint32_t to size_t.
uncompressed_size()101   size_t uncompressed_size() const { return hdr_.uncompressed_size; }
compressed_size()102   size_t compressed_size() const { return hdr_.compressed_size; }
103 
104  private:
  // ZipReader needs access to the private Header type and to the data members
  // in order to build ZipFile instances.
105   friend class ZipReader;
106 
  // Validation step that yields a base::Status.
  // NOTE(review): presumably run by Decompress() and DecompressLines() before
  // they touch the payload -- confirm against the .cc file.
107   base::Status DoDecompressionChecks() const;
108 
109   // Rationale for having this as a nested sub-struct:
110   // 1. Makes the move operator easier to maintain.
111   // 2. Allows the ZipReader to handle a copy of this struct for the file
112   //    being parsed. ZipReader will move the hdr into a full ZipFile once it
113   //    has established the file is complete and valid.
  //
  // The integer fields add up to kZipFileHdrSize (30) bytes.
  // NOTE(review): the field names look like those of the zip local file
  // header; the declaration order here is not necessarily the on-disk order,
  // so do not treat this struct as a wire-format overlay -- confirm against the
  // parser.
114   struct Header {
115     uint32_t signature = 0;
116     uint16_t version = 0;
117     uint16_t flags = 0;
118     uint16_t compression = 0;
119     uint32_t checksum = 0;
120     uint16_t mtime = 0;
121     uint16_t mdate = 0;
122     uint32_t compressed_size = 0;    // Exposed through compressed_size().
123     uint32_t uncompressed_size = 0;  // Exposed through uncompressed_size().
124     uint16_t fname_len = 0;
125     uint16_t extra_field_len = 0;
126     std::string fname;  // Exposed through name().
127   };
128 
  // Metadata of this file, value-initialized (all integers zero, empty name).
129   Header hdr_{};
  // The compressed payload that the decompression methods read from.
130   TraceBlobView compressed_data_;
131   // If adding new fields here, remember to update the move operators.
132 };
133 
// Incremental parser that turns a stream of zip bytes into ZipFile objects.
// Instances can be neither copied nor moved (all four operations are deleted).
134 class ZipReader {
135  public:
136   ZipReader();
137   ~ZipReader();
138 
139   ZipReader(const ZipReader&) = delete;
140   ZipReader& operator=(const ZipReader&) = delete;
141   ZipReader(ZipReader&&) = delete;
142   ZipReader& operator=(ZipReader&&) = delete;
143 
144   // Parses data incrementally from a zip file in chunks. The chunks can be
145   // arbitrarily cut. You can pass the whole file in one go, byte by byte or
146   // anything in between.
147   // files() is updated incrementally as soon as a new whole compressed file
148   // has been processed. You don't need to get to the end of the zip file to
149   // see all files. The final "central directory" at the end of the file is
150   // actually ignored.
  // The outcome of each call is reported via the returned base::Status.
151   base::Status Parse(TraceBlobView);
152 
153   // Returns a list of all the files discovered so far.
  // This is a reference to the internal vector, not a copy.
files()154   const std::vector<ZipFile>& files() const { return files_; }
155 
156   // Moves ownership of the ZipFiles to the caller. The caller can use this
157   // to reduce the memory working set and retain only the files they care about.
  // The internal vector is left in a moved-from state afterwards, so the
  // returned files are no longer reachable through files() or Find().
TakeFiles()158   std::vector<ZipFile> TakeFiles() { return std::move(files_); }
159 
160   // Find a file by its path inside the zip archive.
  // NOTE(review): presumably returns nullptr when nothing matches, and the
  // pointer presumably refers to an element of the internal vector (so later
  // Parse()/TakeFiles() calls could invalidate it) -- confirm against the .cc
  // file.
161   ZipFile* Find(const std::string& path);
162 
163  private:
164   // Keeps track of the incremental parsing state of the current zip stream.
165   // When a compressed file is completely parsed, a ZipFile instance is
166   // constructed and appended to `files_`.
167   struct FileParseState {
    // Parsing stage of the current file; starts at kHeader.
    // NOTE(review): the stages presumably map onto the TryParseHeader(),
    // TryParseFilename(), TrySkipBytes() and TryParseCompressedData() methods
    // below -- confirm against the .cc file.
168     enum {
169       kHeader,
170       kFilename,
171       kSkipBytes,
172       kCompressedData,
173     } parse_state = kHeader;
    // NOTE(review): presumably the number of bytes to discard in the
    // kSkipBytes stage (e.g. the header's extra field) -- confirm.
174     size_t ignore_bytes_after_fname = 0;
175     // Used to track the number of bytes fed into the decompressor when we don't
176     // know the compressed size upfront.
177     size_t decompressor_bytes_fed = 0;
    // Decompressor configured for raw deflate input; see the comment on
    // decompressor_bytes_fed for when it is used.
178     GzipDecompressor decompressor{GzipDecompressor::InputMode::kRawDeflate};
    // NOTE(review): presumably the compressed payload of the current file once
    // it is available, empty until then -- confirm.
179     std::optional<TraceBlobView> compressed;
    // Header of the file currently being parsed (see the rationale comment on
    // ZipFile::Header).
180     ZipFile::Header hdr{};
181   };
182 
  // Per-stage parsing steps, each reporting its outcome as a status.
  // NOTE(review): presumably these return OK without advancing when not
  // enough input has been buffered yet -- confirm against the .cc file.
183   base::Status TryParseHeader();
184   base::Status TryParseFilename();
185   base::Status TrySkipBytes();
186   base::Status TryParseCompressedData();
  // Variant for files whose compressed size is not known upfront (see
  // FileParseState::decompressor_bytes_fed). Yields either an error, an empty
  // optional, or a TraceBlobView.
  // NOTE(review): the empty optional presumably means "need more data".
187   base::StatusOr<std::optional<TraceBlobView>> TryParseUnsizedCompressedData();
188 
  // State of the file currently being parsed.
189   FileParseState cur_;
  // Files that have been completely parsed so far.
190   std::vector<ZipFile> files_;
  // NOTE(review): presumably buffers the chunks handed to Parse() so that
  // headers and payloads can span chunk boundaries -- confirm.
191   util::TraceBlobViewReader reader_;
192 };
193 
194 }  // namespace perfetto::trace_processor::util
195 
196 #endif  // SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
197