xref: /aosp_15_r20/external/zstd/contrib/match_finders/zstd_edist.h (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1*01826a49SYabin Cui /*
2*01826a49SYabin Cui  * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui  * All rights reserved.
4*01826a49SYabin Cui  *
5*01826a49SYabin Cui  * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui  * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui  * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui  */
10*01826a49SYabin Cui 
11*01826a49SYabin Cui /* This match finder leverages techniques used in file comparison algorithms
12*01826a49SYabin Cui  * to find matches between a dictionary and a source file.
13*01826a49SYabin Cui  *
14*01826a49SYabin Cui  * The original motivation for studying this approach was to try and optimize
15*01826a49SYabin Cui  * Zstandard for the use case of patching: the most common scenario being
16*01826a49SYabin Cui  * updating an existing software package with the next version. When patching,
17*01826a49SYabin Cui  * the difference between the old version of the package and the new version
18*01826a49SYabin Cui  * is generally tiny (most of the new file will be identical to
19*01826a49SYabin Cui  * the old one). In more technical terms, the edit distance (the minimal number
20*01826a49SYabin Cui  * of changes required to take one sequence of bytes to another) between the
21*01826a49SYabin Cui  * files would be small relative to the size of the file.
22*01826a49SYabin Cui  *
23*01826a49SYabin Cui  * Various 'diffing' algorithms utilize this notion of edit distance and
24*01826a49SYabin Cui  * the corresponding concept of a minimal edit script between two
25*01826a49SYabin Cui  * sequences to identify the regions within two files where they differ.
26*01826a49SYabin Cui  * The core algorithm used in this match finder is described in:
27*01826a49SYabin Cui  *
28*01826a49SYabin Cui  * "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
29*01826a49SYabin Cui  *    Algorithmica Vol. 1, 1986, pp. 251-266,
30*01826a49SYabin Cui  *    <https://doi.org/10.1007/BF01840446>.
31*01826a49SYabin Cui  *
32*01826a49SYabin Cui  * Additional algorithmic heuristics for speed improvement have also been included.
33*01826a49SYabin Cui  * These we inspired from implementations of various regular and binary diffing
34*01826a49SYabin Cui  * algorithms such as GNU diff, bsdiff, and Xdelta.
35*01826a49SYabin Cui  *
36*01826a49SYabin Cui  * Note: after some experimentation, this approach proved to not provide enough
37*01826a49SYabin Cui  * utility to justify the additional CPU used in finding matches. The one area
38*01826a49SYabin Cui  * where this approach consistently outperforms Zstandard even on level 19 is
39*01826a49SYabin Cui  * when compressing small files (<10 KB) using an equally small dictionary that
40*01826a49SYabin Cui  * is very similar to the source file. For the use case that this was intended,
41*01826a49SYabin Cui  * (large similar files) this approach by itself took 5-10X longer than zstd-19 and
42*01826a49SYabin Cui  * generally resulted in 2-3X larger files. The core advantage that zstd-19 has
43*01826a49SYabin Cui  * over this approach for match finding is the overlapping matches. This approach
44*01826a49SYabin Cui  * cannot find any.
45*01826a49SYabin Cui  *
46*01826a49SYabin Cui  * I'm leaving this in the contrib section in case this ever becomes interesting
47*01826a49SYabin Cui  * to explore again.
48*01826a49SYabin Cui  * */
49*01826a49SYabin Cui 
50*01826a49SYabin Cui #ifndef ZSTD_EDIST_H
51*01826a49SYabin Cui #define ZSTD_EDIST_H
52*01826a49SYabin Cui 
53*01826a49SYabin Cui /*-*************************************
54*01826a49SYabin Cui *  Dependencies
55*01826a49SYabin Cui ***************************************/
56*01826a49SYabin Cui 
57*01826a49SYabin Cui #include <stddef.h>
58*01826a49SYabin Cui #include "zstd_internal.h" /* ZSTD_Sequence */
59*01826a49SYabin Cui 
60*01826a49SYabin Cui /*! ZSTD_eDist_genSequences() :
61*01826a49SYabin Cui  * Will populate the provided ZSTD_Sequence buffer with sequences
62*01826a49SYabin Cui  * based on the optimal or near-optimal (depending on 'useHeuristics')
63*01826a49SYabin Cui  * edit script between 'dict' and 'src.'
64*01826a49SYabin Cui  * @return : the number of sequences found */
65*01826a49SYabin Cui size_t ZSTD_eDist_genSequences(ZSTD_Sequence* sequences,
66*01826a49SYabin Cui                         const void* dict, size_t dictSize,
67*01826a49SYabin Cui                         const void* src, size_t srcSize,
68*01826a49SYabin Cui                         int useHeuristics);
69*01826a49SYabin Cui 
70*01826a49SYabin Cui #endif
71