1 /* 2 * Copyright (C) 2022 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_ 18 #define SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_ 19 20 #include <cstddef> 21 #include <cstdint> 22 #include <functional> 23 #include <optional> 24 #include <string> 25 #include <utility> 26 #include <vector> 27 28 #include "perfetto/base/status.h" 29 #include "perfetto/ext/base/status_or.h" 30 #include "perfetto/ext/base/string_view.h" 31 #include "perfetto/trace_processor/trace_blob_view.h" 32 #include "src/trace_processor/util/gzip_utils.h" 33 #include "src/trace_processor/util/trace_blob_view_reader.h" 34 35 // ZipReader allows to read Zip files in a streaming fashion. 36 // Key features: 37 // - Read-only access, there is no ZipWriter. 38 // - Files can be processed as they are seen in the zip archive, without needing 39 // to see the whole .zip file first. 40 // - It does not read the final zip central directory. Only the metadata in the 41 // inline file headers is exposed. 42 // - Only the compressed payload is kept around in memory. 43 // - Supports line-based streaming for compressed text files (e.g. logs). This 44 // enables line-based processing of compressed logs without having to 45 // decompress fully the individual text file in memory. 46 // - Does NOT support zip64, encryption and other advanced zip file features. 47 // - It is not suitable for security-sensitive contexts. E.g. it doesn't deal 48 // with zip path traversal attacks (the same file showing up twice with two 49 // different payloads). 50 // 51 // Possible future features: 52 // - The user could setup a filter (a glob, or a callback) to select the 53 // interesting files (e.g. *.txt) and skip the appending of the other entries. 54 // This would avoid completely the cost of keeping in memory the compressed 55 // payload of unwanted files (e.g. dumpstate.bin in BRs). 56 57 namespace perfetto::trace_processor::util { 58 59 class ZipReader; 60 61 constexpr size_t kZipFileHdrSize = 30; 62 63 // Holds the metadata and compressed payload of a zip file and allows 64 // decompression. The lifecycle of a ZipFile is completely independent of the 65 // ZipReader that created it. ZipFile(s) can be std::move(d) around and even 66 // outlive the ZipReader. 67 class ZipFile { 68 public: 69 // Note: the lifetime of the lines passed in the vector argument is valid only 70 // for the duration of the callback. Don't retain the StringView(s) passed. 71 using LinesCallback = 72 std::function<void(const std::vector<base::StringView>&)>; 73 74 ZipFile(); 75 ~ZipFile(); 76 ZipFile(ZipFile&&) noexcept; 77 ZipFile& operator=(ZipFile&&) noexcept; 78 ZipFile(const ZipFile&) = delete; 79 ZipFile& operator=(const ZipFile&) = delete; 80 81 // Bulk decompression. It keeps around the compressed data internally, so 82 // this can be called several times. 83 base::Status Decompress(std::vector<uint8_t>*) const; 84 85 // Streaming line-based decompression for text files. 86 // It decompresses the file in chunks and passes batches of lines to the 87 // caller, without decompressing the whole file into memory. 88 // The typical use case is processing large log files from a bugreport. 89 // Like the above, this is idempotent and keeps around the compressed data. 90 base::Status DecompressLines(LinesCallback) const; 91 92 // File name, including the relative path (e.g., "FS/data/misc/foobar") name()93 const std::string& name() const { return hdr_.fname; } 94 95 // Seconds since the Epoch. This is effectively time_t on 64 bit platforms. 96 int64_t GetDatetime() const; 97 98 // Returns the modified time in the format %Y-%m-%d %H:%M:%S. 99 std::string GetDatetimeStr() const; 100 uncompressed_size()101 size_t uncompressed_size() const { return hdr_.uncompressed_size; } compressed_size()102 size_t compressed_size() const { return hdr_.compressed_size; } 103 104 private: 105 friend class ZipReader; 106 107 base::Status DoDecompressionChecks() const; 108 109 // Rationale for having this as a nested sub-struct: 110 // 1. Makes the move operator easier to maintain. 111 // 2. Allows the ZipReader to handle a copy of this struct for the file 112 // being parsed. ZipReade will move the hdr into a full ZipFile once it 113 // has established the file is complete and valid. 114 struct Header { 115 uint32_t signature = 0; 116 uint16_t version = 0; 117 uint16_t flags = 0; 118 uint16_t compression = 0; 119 uint32_t checksum = 0; 120 uint16_t mtime = 0; 121 uint16_t mdate = 0; 122 uint32_t compressed_size = 0; 123 uint32_t uncompressed_size = 0; 124 uint16_t fname_len = 0; 125 uint16_t extra_field_len = 0; 126 std::string fname; 127 }; 128 129 Header hdr_{}; 130 TraceBlobView compressed_data_; 131 // If adding new fields here, remember to update the move operators. 132 }; 133 134 class ZipReader { 135 public: 136 ZipReader(); 137 ~ZipReader(); 138 139 ZipReader(const ZipReader&) = delete; 140 ZipReader& operator=(const ZipReader&) = delete; 141 ZipReader(ZipReader&&) = delete; 142 ZipReader& operator=(ZipReader&&) = delete; 143 144 // Parses data incrementally from a zip file in chunks. The chunks can be 145 // arbitrarily cut. You can pass the whole file in one go, byte by byte or 146 // anything in between. 147 // files() is updated incrementally as soon as a new whole compressed file 148 // has been processed. You don't need to get to the end of the zip file to 149 // see all files. The final "central directory" at the end of the file is 150 // actually ignored. 151 base::Status Parse(TraceBlobView); 152 153 // Returns a list of all the files discovered so far. files()154 const std::vector<ZipFile>& files() const { return files_; } 155 156 // Moves ownership of the ZipFiles to the caller. The caller can use this 157 // to reduce the memory working set and retain only the files they care about. TakeFiles()158 std::vector<ZipFile> TakeFiles() { return std::move(files_); } 159 160 // Find a file by its path inside the zip archive. 161 ZipFile* Find(const std::string& path); 162 163 private: 164 // Keeps track of the incremental parsing state of the current zip stream. 165 // When a compressed file is completely parsed, a ZipFile instance is 166 // constructed and appended to `files_`. 167 struct FileParseState { 168 enum { 169 kHeader, 170 kFilename, 171 kSkipBytes, 172 kCompressedData, 173 } parse_state = kHeader; 174 size_t ignore_bytes_after_fname = 0; 175 // Used to track the number of bytes fed into the decompressor when we don't 176 // know the compressed size upfront. 177 size_t decompressor_bytes_fed = 0; 178 GzipDecompressor decompressor{GzipDecompressor::InputMode::kRawDeflate}; 179 std::optional<TraceBlobView> compressed; 180 ZipFile::Header hdr{}; 181 }; 182 183 base::Status TryParseHeader(); 184 base::Status TryParseFilename(); 185 base::Status TrySkipBytes(); 186 base::Status TryParseCompressedData(); 187 base::StatusOr<std::optional<TraceBlobView>> TryParseUnsizedCompressedData(); 188 189 FileParseState cur_; 190 std::vector<ZipFile> files_; 191 util::TraceBlobViewReader reader_; 192 }; 193 194 } // namespace perfetto::trace_processor::util 195 196 #endif // SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_ 197