1*07fb1d06SElliott Hughes // Copyright 2017 The ChromiumOS Authors
2*07fb1d06SElliott Hughes // Use of this source code is governed by a BSD-style license that can be
3*07fb1d06SElliott Hughes // found in the LICENSE file.
4*07fb1d06SElliott Hughes
5*07fb1d06SElliott Hughes #include "puffin/src/include/puffin/utils.h"
6*07fb1d06SElliott Hughes
7*07fb1d06SElliott Hughes #include <inttypes.h>
8*07fb1d06SElliott Hughes
9*07fb1d06SElliott Hughes #include <algorithm>
10*07fb1d06SElliott Hughes #include <iterator>
11*07fb1d06SElliott Hughes #include <set>
12*07fb1d06SElliott Hughes #include <string>
13*07fb1d06SElliott Hughes #include <vector>
14*07fb1d06SElliott Hughes
15*07fb1d06SElliott Hughes #include "puffin/file_stream.h"
16*07fb1d06SElliott Hughes #include "puffin/memory_stream.h"
17*07fb1d06SElliott Hughes #include "puffin/src/bit_reader.h"
18*07fb1d06SElliott Hughes #include "puffin/src/include/puffin/common.h"
19*07fb1d06SElliott Hughes #include "puffin/src/include/puffin/puffer.h"
20*07fb1d06SElliott Hughes #include "puffin/src/logging.h"
21*07fb1d06SElliott Hughes #include "puffin/src/puff_writer.h"
22*07fb1d06SElliott Hughes
23*07fb1d06SElliott Hughes using std::set;
24*07fb1d06SElliott Hughes using std::string;
25*07fb1d06SElliott Hughes using std::vector;
26*07fb1d06SElliott Hughes
27*07fb1d06SElliott Hughes namespace {
28*07fb1d06SElliott Hughes // Use memcpy to access the unaligned data of type |T|.
29*07fb1d06SElliott Hughes template <typename T>
get_unaligned(const void * address)30*07fb1d06SElliott Hughes inline T get_unaligned(const void* address) {
31*07fb1d06SElliott Hughes T result;
32*07fb1d06SElliott Hughes memcpy(&result, address, sizeof(T));
33*07fb1d06SElliott Hughes return result;
34*07fb1d06SElliott Hughes }
35*07fb1d06SElliott Hughes
36*07fb1d06SElliott Hughes struct ExtentData {
37*07fb1d06SElliott Hughes puffin::BitExtent extent;
38*07fb1d06SElliott Hughes uint64_t byte_offset;
39*07fb1d06SElliott Hughes uint64_t byte_length;
40*07fb1d06SElliott Hughes const puffin::Buffer& data;
41*07fb1d06SElliott Hughes
ExtentData__anonefa9e89a0111::ExtentData42*07fb1d06SElliott Hughes ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data)
43*07fb1d06SElliott Hughes : extent(in_extent), data(in_data) {
44*07fb1d06SElliott Hughes // Round start offset up and end offset down to exclude bits not in this
45*07fb1d06SElliott Hughes // extent. We simply ignore the bits at start and end that's not on byte
46*07fb1d06SElliott Hughes // boundary because as long as the majority of the bytes are the same,
47*07fb1d06SElliott Hughes // bsdiff will be able to reference it.
48*07fb1d06SElliott Hughes byte_offset = (extent.offset + 7) / 8;
49*07fb1d06SElliott Hughes uint64_t byte_end_offset = (extent.offset + extent.length) / 8;
50*07fb1d06SElliott Hughes CHECK(byte_end_offset <= data.size());
51*07fb1d06SElliott Hughes if (byte_end_offset > byte_offset) {
52*07fb1d06SElliott Hughes byte_length = byte_end_offset - byte_offset;
53*07fb1d06SElliott Hughes } else {
54*07fb1d06SElliott Hughes byte_length = 0;
55*07fb1d06SElliott Hughes }
56*07fb1d06SElliott Hughes }
57*07fb1d06SElliott Hughes
Compare__anonefa9e89a0111::ExtentData58*07fb1d06SElliott Hughes int Compare(const ExtentData& other) const {
59*07fb1d06SElliott Hughes if (extent.length != other.extent.length) {
60*07fb1d06SElliott Hughes return extent.length < other.extent.length ? -1 : 1;
61*07fb1d06SElliott Hughes }
62*07fb1d06SElliott Hughes return memcmp(data.data() + byte_offset,
63*07fb1d06SElliott Hughes other.data.data() + other.byte_offset,
64*07fb1d06SElliott Hughes std::min(byte_length, other.byte_length));
65*07fb1d06SElliott Hughes }
operator <__anonefa9e89a0111::ExtentData66*07fb1d06SElliott Hughes bool operator<(const ExtentData& other) const { return Compare(other) < 0; }
operator ==__anonefa9e89a0111::ExtentData67*07fb1d06SElliott Hughes bool operator==(const ExtentData& other) const { return Compare(other) == 0; }
68*07fb1d06SElliott Hughes };
69*07fb1d06SElliott Hughes
70*07fb1d06SElliott Hughes } // namespace
71*07fb1d06SElliott Hughes
72*07fb1d06SElliott Hughes namespace puffin {
73*07fb1d06SElliott Hughes
LocateDeflatesInDeflateStream(const uint8_t * data,uint64_t size,uint64_t virtual_offset,vector<BitExtent> * deflates,uint64_t * compressed_size)74*07fb1d06SElliott Hughes bool LocateDeflatesInDeflateStream(const uint8_t* data,
75*07fb1d06SElliott Hughes uint64_t size,
76*07fb1d06SElliott Hughes uint64_t virtual_offset,
77*07fb1d06SElliott Hughes vector<BitExtent>* deflates,
78*07fb1d06SElliott Hughes uint64_t* compressed_size) {
79*07fb1d06SElliott Hughes Puffer puffer;
80*07fb1d06SElliott Hughes BufferBitReader bit_reader(data, size);
81*07fb1d06SElliott Hughes BufferPuffWriter puff_writer(nullptr, 0);
82*07fb1d06SElliott Hughes vector<BitExtent> sub_deflates;
83*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(
84*07fb1d06SElliott Hughes puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates));
85*07fb1d06SElliott Hughes for (const auto& deflate : sub_deflates) {
86*07fb1d06SElliott Hughes deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length);
87*07fb1d06SElliott Hughes }
88*07fb1d06SElliott Hughes if (compressed_size) {
89*07fb1d06SElliott Hughes *compressed_size = bit_reader.Offset();
90*07fb1d06SElliott Hughes }
91*07fb1d06SElliott Hughes return true;
92*07fb1d06SElliott Hughes }
93*07fb1d06SElliott Hughes
94*07fb1d06SElliott Hughes // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
95*07fb1d06SElliott Hughes // definition of a zlib stream. For finding the deflate blocks, we relying on
96*07fb1d06SElliott Hughes // the proper size of the zlib stream in |data|. Basically the size of the zlib
97*07fb1d06SElliott Hughes // stream should be known before hand. Otherwise we need to parse the stream and
98*07fb1d06SElliott Hughes // find the location of compressed blocks using CalculateSizeOfDeflateBlock().
LocateDeflatesInZlib(const Buffer & data,vector<BitExtent> * deflates)99*07fb1d06SElliott Hughes bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) {
100*07fb1d06SElliott Hughes // A zlib stream has the following format:
101*07fb1d06SElliott Hughes // 0 1 compression method and flag
102*07fb1d06SElliott Hughes // 1 1 flag
103*07fb1d06SElliott Hughes // 2 4 preset dictionary (optional)
104*07fb1d06SElliott Hughes // 2 or 6 n compressed data
105*07fb1d06SElliott Hughes // n+(2 or 6) 4 Adler-32 checksum
106*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer
107*07fb1d06SElliott Hughes uint16_t cmf = data[0];
108*07fb1d06SElliott Hughes auto compression_method = cmf & 0x0F;
109*07fb1d06SElliott Hughes // For deflate compression_method should be 8.
110*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(compression_method == 8);
111*07fb1d06SElliott Hughes
112*07fb1d06SElliott Hughes auto cinfo = (cmf & 0xF0) >> 4;
113*07fb1d06SElliott Hughes // Value greater than 7 is not allowed in deflate.
114*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(cinfo <= 7);
115*07fb1d06SElliott Hughes
116*07fb1d06SElliott Hughes auto flag = data[1];
117*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
118*07fb1d06SElliott Hughes
119*07fb1d06SElliott Hughes uint64_t header_len = 2;
120*07fb1d06SElliott Hughes if (flag & 0x20) {
121*07fb1d06SElliott Hughes header_len += 4; // 4 bytes for the preset dictionary.
122*07fb1d06SElliott Hughes }
123*07fb1d06SElliott Hughes
124*07fb1d06SElliott Hughes // 4 is for ADLER32.
125*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
126*07fb1d06SElliott Hughes data.data() + header_len, data.size() - header_len - 4, header_len,
127*07fb1d06SElliott Hughes deflates, nullptr));
128*07fb1d06SElliott Hughes return true;
129*07fb1d06SElliott Hughes }
130*07fb1d06SElliott Hughes
FindDeflateSubBlocks(const UniqueStreamPtr & src,const vector<ByteExtent> & deflates,vector<BitExtent> * subblock_deflates)131*07fb1d06SElliott Hughes bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
132*07fb1d06SElliott Hughes const vector<ByteExtent>& deflates,
133*07fb1d06SElliott Hughes vector<BitExtent>* subblock_deflates) {
134*07fb1d06SElliott Hughes Puffer puffer;
135*07fb1d06SElliott Hughes Buffer deflate_buffer;
136*07fb1d06SElliott Hughes for (const auto& deflate : deflates) {
137*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
138*07fb1d06SElliott Hughes // Read from src into deflate_buffer.
139*07fb1d06SElliott Hughes deflate_buffer.resize(deflate.length);
140*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
141*07fb1d06SElliott Hughes
142*07fb1d06SElliott Hughes // Find all the subblocks.
143*07fb1d06SElliott Hughes BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
144*07fb1d06SElliott Hughes // The uncompressed blocks will be ignored since we are passing a null
145*07fb1d06SElliott Hughes // buffered puff writer and a valid deflate locations output array. This
146*07fb1d06SElliott Hughes // should not happen in the puffdiff or anywhere else by default.
147*07fb1d06SElliott Hughes BufferPuffWriter puff_writer(nullptr, 0);
148*07fb1d06SElliott Hughes vector<BitExtent> subblocks;
149*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(
150*07fb1d06SElliott Hughes puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
151*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
152*07fb1d06SElliott Hughes for (const auto& subblock : subblocks) {
153*07fb1d06SElliott Hughes subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
154*07fb1d06SElliott Hughes subblock.length);
155*07fb1d06SElliott Hughes }
156*07fb1d06SElliott Hughes }
157*07fb1d06SElliott Hughes return true;
158*07fb1d06SElliott Hughes }
159*07fb1d06SElliott Hughes
LocateDeflatesInZlibBlocks(const string & file_path,const vector<ByteExtent> & zlibs,vector<BitExtent> * deflates)160*07fb1d06SElliott Hughes bool LocateDeflatesInZlibBlocks(const string& file_path,
161*07fb1d06SElliott Hughes const vector<ByteExtent>& zlibs,
162*07fb1d06SElliott Hughes vector<BitExtent>* deflates) {
163*07fb1d06SElliott Hughes auto src = FileStream::Open(file_path, true, false);
164*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src);
165*07fb1d06SElliott Hughes
166*07fb1d06SElliott Hughes Buffer buffer;
167*07fb1d06SElliott Hughes for (const auto& zlib : zlibs) {
168*07fb1d06SElliott Hughes buffer.resize(zlib.length);
169*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
170*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
171*07fb1d06SElliott Hughes vector<BitExtent> tmp_deflates;
172*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates));
173*07fb1d06SElliott Hughes for (const auto& deflate : tmp_deflates) {
174*07fb1d06SElliott Hughes deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length);
175*07fb1d06SElliott Hughes }
176*07fb1d06SElliott Hughes }
177*07fb1d06SElliott Hughes return true;
178*07fb1d06SElliott Hughes }
179*07fb1d06SElliott Hughes
180*07fb1d06SElliott Hughes namespace {
181*07fb1d06SElliott Hughes // For more information about gzip format, refer to RFC 1952 located at:
182*07fb1d06SElliott Hughes // https://www.ietf.org/rfc/rfc1952.txt
IsValidGzipHeader(const uint8_t * header,size_t size)183*07fb1d06SElliott Hughes bool IsValidGzipHeader(const uint8_t* header, size_t size) {
184*07fb1d06SElliott Hughes // Each gzip entry has the following format magic header:
185*07fb1d06SElliott Hughes // 0 1 0x1F
186*07fb1d06SElliott Hughes // 1 1 0x8B
187*07fb1d06SElliott Hughes // 2 1 compression method (8 denotes deflate)
188*07fb1d06SElliott Hughes static constexpr uint8_t magic[] = {0x1F, 0x8B, 8};
189*07fb1d06SElliott Hughes return size >= 10 && std::equal(std::begin(magic), std::end(magic), header);
190*07fb1d06SElliott Hughes }
191*07fb1d06SElliott Hughes } // namespace
192*07fb1d06SElliott Hughes
LocateDeflatesInGzip(const Buffer & data,vector<BitExtent> * deflates)193*07fb1d06SElliott Hughes bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) {
194*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(IsValidGzipHeader(data.data(), data.size()));
195*07fb1d06SElliott Hughes uint64_t member_start = 0;
196*07fb1d06SElliott Hughes do {
197*07fb1d06SElliott Hughes // After the magic header, the gzip contains:
198*07fb1d06SElliott Hughes // 3 1 set of flags
199*07fb1d06SElliott Hughes // 4 4 modification time
200*07fb1d06SElliott Hughes // 8 1 extra flags
201*07fb1d06SElliott Hughes // 9 1 operating system
202*07fb1d06SElliott Hughes
203*07fb1d06SElliott Hughes uint64_t offset = member_start + 10;
204*07fb1d06SElliott Hughes int flag = data[member_start + 3];
205*07fb1d06SElliott Hughes // Extra field
206*07fb1d06SElliott Hughes if (flag & 4) {
207*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
208*07fb1d06SElliott Hughes uint16_t extra_length = data[offset++];
209*07fb1d06SElliott Hughes extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
210*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
211*07fb1d06SElliott Hughes offset += extra_length;
212*07fb1d06SElliott Hughes }
213*07fb1d06SElliott Hughes // File name field
214*07fb1d06SElliott Hughes if (flag & 8) {
215*07fb1d06SElliott Hughes while (true) {
216*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
217*07fb1d06SElliott Hughes if (data[offset++] == 0) {
218*07fb1d06SElliott Hughes break;
219*07fb1d06SElliott Hughes }
220*07fb1d06SElliott Hughes }
221*07fb1d06SElliott Hughes }
222*07fb1d06SElliott Hughes // File comment field
223*07fb1d06SElliott Hughes if (flag & 16) {
224*07fb1d06SElliott Hughes while (true) {
225*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
226*07fb1d06SElliott Hughes if (data[offset++] == 0) {
227*07fb1d06SElliott Hughes break;
228*07fb1d06SElliott Hughes }
229*07fb1d06SElliott Hughes }
230*07fb1d06SElliott Hughes }
231*07fb1d06SElliott Hughes // CRC16 field
232*07fb1d06SElliott Hughes if (flag & 2) {
233*07fb1d06SElliott Hughes offset += 2;
234*07fb1d06SElliott Hughes }
235*07fb1d06SElliott Hughes
236*07fb1d06SElliott Hughes uint64_t compressed_size = 0;
237*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
238*07fb1d06SElliott Hughes data.data() + offset, data.size() - offset, offset, deflates,
239*07fb1d06SElliott Hughes &compressed_size));
240*07fb1d06SElliott Hughes offset += compressed_size;
241*07fb1d06SElliott Hughes
242*07fb1d06SElliott Hughes // Ignore CRC32 and uncompressed size.
243*07fb1d06SElliott Hughes offset += 8;
244*07fb1d06SElliott Hughes member_start = offset;
245*07fb1d06SElliott Hughes } while (member_start < data.size() &&
246*07fb1d06SElliott Hughes IsValidGzipHeader(&data[member_start], data.size() - member_start));
247*07fb1d06SElliott Hughes return true;
248*07fb1d06SElliott Hughes }
249*07fb1d06SElliott Hughes
250*07fb1d06SElliott Hughes // For more information about the zip format, refer to
251*07fb1d06SElliott Hughes // https://support.pkware.com/display/PKZIP/APPNOTE
LocateDeflatesInZipArchive(const Buffer & data,vector<BitExtent> * deflates)252*07fb1d06SElliott Hughes bool LocateDeflatesInZipArchive(const Buffer& data,
253*07fb1d06SElliott Hughes vector<BitExtent>* deflates) {
254*07fb1d06SElliott Hughes uint64_t pos = 0;
255*07fb1d06SElliott Hughes while (pos + 30 <= data.size()) {
256*07fb1d06SElliott Hughes // TODO(xunchang) add support for big endian system when searching for
257*07fb1d06SElliott Hughes // magic numbers.
258*07fb1d06SElliott Hughes if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
259*07fb1d06SElliott Hughes pos++;
260*07fb1d06SElliott Hughes continue;
261*07fb1d06SElliott Hughes }
262*07fb1d06SElliott Hughes
263*07fb1d06SElliott Hughes // local file header format
264*07fb1d06SElliott Hughes // 0 4 0x04034b50
265*07fb1d06SElliott Hughes // 4 2 minimum version needed to extract
266*07fb1d06SElliott Hughes // 6 2 general purpose bit flag
267*07fb1d06SElliott Hughes // 8 2 compression method
268*07fb1d06SElliott Hughes // 10 4 file last modification date & time
269*07fb1d06SElliott Hughes // 14 4 CRC-32
270*07fb1d06SElliott Hughes // 18 4 compressed size
271*07fb1d06SElliott Hughes // 22 4 uncompressed size
272*07fb1d06SElliott Hughes // 26 2 file name length
273*07fb1d06SElliott Hughes // 28 2 extra field length
274*07fb1d06SElliott Hughes // 30 n file name
275*07fb1d06SElliott Hughes // 30+n m extra field
276*07fb1d06SElliott Hughes auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
277*07fb1d06SElliott Hughes if (compression_method != 8) { // non-deflate type
278*07fb1d06SElliott Hughes pos += 4;
279*07fb1d06SElliott Hughes continue;
280*07fb1d06SElliott Hughes }
281*07fb1d06SElliott Hughes
282*07fb1d06SElliott Hughes auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
283*07fb1d06SElliott Hughes auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
284*07fb1d06SElliott Hughes auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
285*07fb1d06SElliott Hughes uint64_t header_size = 30 + file_name_length + extra_field_length;
286*07fb1d06SElliott Hughes
287*07fb1d06SElliott Hughes // sanity check
288*07fb1d06SElliott Hughes if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
289*07fb1d06SElliott Hughes pos > data.size() - header_size - compressed_size) {
290*07fb1d06SElliott Hughes pos += 4;
291*07fb1d06SElliott Hughes continue;
292*07fb1d06SElliott Hughes }
293*07fb1d06SElliott Hughes
294*07fb1d06SElliott Hughes vector<BitExtent> tmp_deflates;
295*07fb1d06SElliott Hughes uint64_t offset = pos + header_size;
296*07fb1d06SElliott Hughes uint64_t calculated_compressed_size = 0;
297*07fb1d06SElliott Hughes if (!LocateDeflatesInDeflateStream(
298*07fb1d06SElliott Hughes data.data() + offset, data.size() - offset, offset, &tmp_deflates,
299*07fb1d06SElliott Hughes &calculated_compressed_size)) {
300*07fb1d06SElliott Hughes LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
301*07fb1d06SElliott Hughes << ", skip adding deflates for this entry.";
302*07fb1d06SElliott Hughes pos += 4;
303*07fb1d06SElliott Hughes continue;
304*07fb1d06SElliott Hughes }
305*07fb1d06SElliott Hughes
306*07fb1d06SElliott Hughes // Double check the compressed size if it is available in the file header.
307*07fb1d06SElliott Hughes if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
308*07fb1d06SElliott Hughes LOG(WARNING) << "Compressed size in the file header: " << compressed_size
309*07fb1d06SElliott Hughes << " doesn't equal the real size: "
310*07fb1d06SElliott Hughes << calculated_compressed_size;
311*07fb1d06SElliott Hughes }
312*07fb1d06SElliott Hughes
313*07fb1d06SElliott Hughes deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end());
314*07fb1d06SElliott Hughes pos += header_size + calculated_compressed_size;
315*07fb1d06SElliott Hughes }
316*07fb1d06SElliott Hughes
317*07fb1d06SElliott Hughes return true;
318*07fb1d06SElliott Hughes }
319*07fb1d06SElliott Hughes
FindPuffLocations(const UniqueStreamPtr & src,const vector<BitExtent> & deflates,vector<ByteExtent> * puffs,uint64_t * out_puff_size)320*07fb1d06SElliott Hughes bool FindPuffLocations(const UniqueStreamPtr& src,
321*07fb1d06SElliott Hughes const vector<BitExtent>& deflates,
322*07fb1d06SElliott Hughes vector<ByteExtent>* puffs,
323*07fb1d06SElliott Hughes uint64_t* out_puff_size) {
324*07fb1d06SElliott Hughes Puffer puffer;
325*07fb1d06SElliott Hughes Buffer deflate_buffer;
326*07fb1d06SElliott Hughes
327*07fb1d06SElliott Hughes // Here accumulate the size difference between each corresponding deflate and
328*07fb1d06SElliott Hughes // puff. At the end we add this cummulative size difference to the size of the
329*07fb1d06SElliott Hughes // deflate stream to get the size of the puff stream. We use signed size
330*07fb1d06SElliott Hughes // because puff size could be smaller than deflate size.
331*07fb1d06SElliott Hughes int64_t total_size_difference = 0;
332*07fb1d06SElliott Hughes for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
333*07fb1d06SElliott Hughes // Read from src into deflate_buffer.
334*07fb1d06SElliott Hughes auto start_byte = deflate->offset / 8;
335*07fb1d06SElliott Hughes auto end_byte = (deflate->offset + deflate->length + 7) / 8;
336*07fb1d06SElliott Hughes deflate_buffer.resize(end_byte - start_byte);
337*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->Seek(start_byte));
338*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(
339*07fb1d06SElliott Hughes src->Read(deflate_buffer.data(), deflate_buffer.size()));
340*07fb1d06SElliott Hughes // Find the size of the puff.
341*07fb1d06SElliott Hughes BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
342*07fb1d06SElliott Hughes uint64_t bits_to_skip = deflate->offset % 8;
343*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
344*07fb1d06SElliott Hughes bit_reader.DropBits(bits_to_skip);
345*07fb1d06SElliott Hughes
346*07fb1d06SElliott Hughes BufferPuffWriter puff_writer(nullptr, 0);
347*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(
348*07fb1d06SElliott Hughes puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
349*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
350*07fb1d06SElliott Hughes
351*07fb1d06SElliott Hughes // 1 if a deflate ends at the same byte that the next deflate starts and
352*07fb1d06SElliott Hughes // there is a few bits gap between them. In practice this may never happen,
353*07fb1d06SElliott Hughes // but it is a good idea to support it anyways. If there is a gap, the value
354*07fb1d06SElliott Hughes // of the gap will be saved as an integer byte to the puff stream. The parts
355*07fb1d06SElliott Hughes // of the byte that belogs to the deflates are shifted out.
356*07fb1d06SElliott Hughes int gap = 0;
357*07fb1d06SElliott Hughes if (deflate != deflates.begin()) {
358*07fb1d06SElliott Hughes auto prev_deflate = std::prev(deflate);
359*07fb1d06SElliott Hughes if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
360*07fb1d06SElliott Hughes // If deflates are on byte boundary the gap will not be counted later,
361*07fb1d06SElliott Hughes // so we won't worry about it.
362*07fb1d06SElliott Hughes && (deflate->offset % 8 != 0)) {
363*07fb1d06SElliott Hughes gap = 1;
364*07fb1d06SElliott Hughes }
365*07fb1d06SElliott Hughes }
366*07fb1d06SElliott Hughes
367*07fb1d06SElliott Hughes start_byte = ((deflate->offset + 7) / 8);
368*07fb1d06SElliott Hughes end_byte = (deflate->offset + deflate->length) / 8;
369*07fb1d06SElliott Hughes int64_t deflate_length_in_bytes = end_byte - start_byte;
370*07fb1d06SElliott Hughes
371*07fb1d06SElliott Hughes // If there was no gap bits between the current and previous deflates, there
372*07fb1d06SElliott Hughes // will be no extra gap byte, so the offset will be shifted one byte back.
373*07fb1d06SElliott Hughes auto puff_offset = start_byte - gap + total_size_difference;
374*07fb1d06SElliott Hughes auto puff_size = puff_writer.Size();
375*07fb1d06SElliott Hughes // Add the location into puff.
376*07fb1d06SElliott Hughes puffs->emplace_back(puff_offset, puff_size);
377*07fb1d06SElliott Hughes total_size_difference +=
378*07fb1d06SElliott Hughes static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
379*07fb1d06SElliott Hughes }
380*07fb1d06SElliott Hughes
381*07fb1d06SElliott Hughes uint64_t src_size;
382*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
383*07fb1d06SElliott Hughes auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
384*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(final_size >= 0);
385*07fb1d06SElliott Hughes *out_puff_size = final_size;
386*07fb1d06SElliott Hughes return true;
387*07fb1d06SElliott Hughes }
388*07fb1d06SElliott Hughes
RemoveEqualBitExtents(const Buffer & data1,const Buffer & data2,vector<BitExtent> * extents1,vector<BitExtent> * extents2)389*07fb1d06SElliott Hughes void RemoveEqualBitExtents(const Buffer& data1,
390*07fb1d06SElliott Hughes const Buffer& data2,
391*07fb1d06SElliott Hughes vector<BitExtent>* extents1,
392*07fb1d06SElliott Hughes vector<BitExtent>* extents2) {
393*07fb1d06SElliott Hughes set<ExtentData> extent1_set, equal_extents;
394*07fb1d06SElliott Hughes for (const BitExtent& ext : *extents1) {
395*07fb1d06SElliott Hughes extent1_set.emplace(ext, data1);
396*07fb1d06SElliott Hughes }
397*07fb1d06SElliott Hughes
398*07fb1d06SElliott Hughes auto new_extents2_end = extents2->begin();
399*07fb1d06SElliott Hughes for (const BitExtent& ext : *extents2) {
400*07fb1d06SElliott Hughes ExtentData extent_data(ext, data2);
401*07fb1d06SElliott Hughes if (extent1_set.find(extent_data) != extent1_set.end()) {
402*07fb1d06SElliott Hughes equal_extents.insert(extent_data);
403*07fb1d06SElliott Hughes } else {
404*07fb1d06SElliott Hughes *new_extents2_end++ = ext;
405*07fb1d06SElliott Hughes }
406*07fb1d06SElliott Hughes }
407*07fb1d06SElliott Hughes extents2->erase(new_extents2_end, extents2->end());
408*07fb1d06SElliott Hughes extents1->erase(
409*07fb1d06SElliott Hughes std::remove_if(extents1->begin(), extents1->end(),
410*07fb1d06SElliott Hughes [&equal_extents, &data1](const BitExtent& ext) {
411*07fb1d06SElliott Hughes return equal_extents.find(ExtentData(ext, data1)) !=
412*07fb1d06SElliott Hughes equal_extents.end();
413*07fb1d06SElliott Hughes }),
414*07fb1d06SElliott Hughes extents1->end());
415*07fb1d06SElliott Hughes }
416*07fb1d06SElliott Hughes
RemoveDeflatesWithBadDistanceCaches(const Buffer & data,vector<BitExtent> * deflates)417*07fb1d06SElliott Hughes bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data,
418*07fb1d06SElliott Hughes vector<BitExtent>* deflates) {
419*07fb1d06SElliott Hughes Puffer puffer(true /* exclude_bad_distance_caches */);
420*07fb1d06SElliott Hughes for (auto def = deflates->begin(); def != deflates->end();) {
421*07fb1d06SElliott Hughes uint64_t offset = def->offset / 8;
422*07fb1d06SElliott Hughes uint64_t length = (def->offset + def->length + 7) / 8 - offset;
423*07fb1d06SElliott Hughes BufferBitReader br(&data[offset], length);
424*07fb1d06SElliott Hughes BufferPuffWriter pw(nullptr, 0);
425*07fb1d06SElliott Hughes
426*07fb1d06SElliott Hughes // Drop the first few bits in the buffer so we start exactly where the
427*07fb1d06SElliott Hughes // deflate starts.
428*07fb1d06SElliott Hughes uint64_t bits_to_drop = def->offset % 8;
429*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop));
430*07fb1d06SElliott Hughes br.DropBits(bits_to_drop);
431*07fb1d06SElliott Hughes
432*07fb1d06SElliott Hughes vector<BitExtent> defs_out;
433*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out));
434*07fb1d06SElliott Hughes
435*07fb1d06SElliott Hughes TEST_AND_RETURN_FALSE(defs_out.size() <= 1);
436*07fb1d06SElliott Hughes if (defs_out.size() == 0) {
437*07fb1d06SElliott Hughes // This is a deflate we were looking for, remove it.
438*07fb1d06SElliott Hughes def = deflates->erase(def);
439*07fb1d06SElliott Hughes } else {
440*07fb1d06SElliott Hughes ++def;
441*07fb1d06SElliott Hughes }
442*07fb1d06SElliott Hughes }
443*07fb1d06SElliott Hughes return true;
444*07fb1d06SElliott Hughes }
445*07fb1d06SElliott Hughes
446*07fb1d06SElliott Hughes } // namespace puffin
447