xref: /aosp_15_r20/external/zucchini/heuristic_ensemble_matcher.cc (revision a03ca8b91e029cd15055c20c78c2e087c84792e4)
1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved.
2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be
3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file.
4*a03ca8b9SKrzysztof Kosiński 
5*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/heuristic_ensemble_matcher.h"
6*a03ca8b9SKrzysztof Kosiński 
7*a03ca8b9SKrzysztof Kosiński #include <algorithm>
8*a03ca8b9SKrzysztof Kosiński #include <memory>
9*a03ca8b9SKrzysztof Kosiński #include <string>
10*a03ca8b9SKrzysztof Kosiński #include <utility>
11*a03ca8b9SKrzysztof Kosiński #include <vector>
12*a03ca8b9SKrzysztof Kosiński 
13*a03ca8b9SKrzysztof Kosiński #include "base/bind.h"
14*a03ca8b9SKrzysztof Kosiński #include "base/logging.h"
15*a03ca8b9SKrzysztof Kosiński #include "base/numerics/safe_conversions.h"
16*a03ca8b9SKrzysztof Kosiński #include "base/strings/stringprintf.h"
17*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/binary_data_histogram.h"
18*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/element_detection.h"
19*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/image_utils.h"
20*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/io_utils.h"
21*a03ca8b9SKrzysztof Kosiński 
22*a03ca8b9SKrzysztof Kosiński namespace zucchini {
23*a03ca8b9SKrzysztof Kosiński 
24*a03ca8b9SKrzysztof Kosiński namespace {
25*a03ca8b9SKrzysztof Kosiński 
26*a03ca8b9SKrzysztof Kosiński /******** Helper Functions ********/
27*a03ca8b9SKrzysztof Kosiński 
28*a03ca8b9SKrzysztof Kosiński // Uses |detector| to find embedded executables inside |image|, and returns the
29*a03ca8b9SKrzysztof Kosiński // result on success, or std::nullopt on failure,  which occurs if too many (>
30*a03ca8b9SKrzysztof Kosiński // |kElementLimit|) elements are found.
FindEmbeddedElements(ConstBufferView image,const std::string & name,ElementDetector && detector)31*a03ca8b9SKrzysztof Kosiński std::optional<std::vector<Element>> FindEmbeddedElements(
32*a03ca8b9SKrzysztof Kosiński     ConstBufferView image,
33*a03ca8b9SKrzysztof Kosiński     const std::string& name,
34*a03ca8b9SKrzysztof Kosiński     ElementDetector&& detector) {
35*a03ca8b9SKrzysztof Kosiński   // Maximum number of Elements in a file. This is enforced because our matching
36*a03ca8b9SKrzysztof Kosiński   // algorithm is O(n^2), which suffices for regular archive files that should
37*a03ca8b9SKrzysztof Kosiński   // have up to 10's of executable files. An archive containing 100's of
38*a03ca8b9SKrzysztof Kosiński   // executables is likely pathological, and is rejected to prevent exploits.
39*a03ca8b9SKrzysztof Kosiński   static constexpr size_t kElementLimit = 256;
40*a03ca8b9SKrzysztof Kosiński   std::vector<Element> elements;
41*a03ca8b9SKrzysztof Kosiński   ElementFinder element_finder(image, std::move(detector));
42*a03ca8b9SKrzysztof Kosiński   for (auto element = element_finder.GetNext();
43*a03ca8b9SKrzysztof Kosiński        element.has_value() && elements.size() <= kElementLimit;
44*a03ca8b9SKrzysztof Kosiński        element = element_finder.GetNext()) {
45*a03ca8b9SKrzysztof Kosiński     elements.push_back(*element);
46*a03ca8b9SKrzysztof Kosiński   }
47*a03ca8b9SKrzysztof Kosiński   if (elements.size() >= kElementLimit) {
48*a03ca8b9SKrzysztof Kosiński     LOG(WARNING) << name << ": Found too many elements.";
49*a03ca8b9SKrzysztof Kosiński     return std::nullopt;
50*a03ca8b9SKrzysztof Kosiński   }
51*a03ca8b9SKrzysztof Kosiński   LOG(INFO) << name << ": Found " << elements.size() << " elements.";
52*a03ca8b9SKrzysztof Kosiński   return elements;
53*a03ca8b9SKrzysztof Kosiński }
54*a03ca8b9SKrzysztof Kosiński 
55*a03ca8b9SKrzysztof Kosiński // Determines whether a proposed comparison between Elements should be rejected
56*a03ca8b9SKrzysztof Kosiński // early, to decrease the likelihood of creating false-positive matches, which
57*a03ca8b9SKrzysztof Kosiński // may be costly for patching. Our heuristic simply prohibits big difference in
58*a03ca8b9SKrzysztof Kosiński // size (relative and absolute) between matched elements.
UnsafeDifference(const Element & old_element,const Element & new_element)59*a03ca8b9SKrzysztof Kosiński bool UnsafeDifference(const Element& old_element, const Element& new_element) {
60*a03ca8b9SKrzysztof Kosiński   static constexpr double kMaxBloat = 2.0;
61*a03ca8b9SKrzysztof Kosiński   static constexpr size_t kMinWorrysomeDifference = 2 << 20;  // 2MB
62*a03ca8b9SKrzysztof Kosiński   size_t lo_size = std::min(old_element.size, new_element.size);
63*a03ca8b9SKrzysztof Kosiński   size_t hi_size = std::max(old_element.size, new_element.size);
64*a03ca8b9SKrzysztof Kosiński   if (hi_size - lo_size < kMinWorrysomeDifference)
65*a03ca8b9SKrzysztof Kosiński     return false;
66*a03ca8b9SKrzysztof Kosiński   if (hi_size < lo_size * kMaxBloat)
67*a03ca8b9SKrzysztof Kosiński     return false;
68*a03ca8b9SKrzysztof Kosiński   return true;
69*a03ca8b9SKrzysztof Kosiński }
70*a03ca8b9SKrzysztof Kosiński 
operator <<(std::ostream & stream,const Element & elt)71*a03ca8b9SKrzysztof Kosiński std::ostream& operator<<(std::ostream& stream, const Element& elt) {
72*a03ca8b9SKrzysztof Kosiński   stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", "
73*a03ca8b9SKrzysztof Kosiński          << AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size)
74*a03ca8b9SKrzysztof Kosiński          << ")";
75*a03ca8b9SKrzysztof Kosiński   return stream;
76*a03ca8b9SKrzysztof Kosiński }
77*a03ca8b9SKrzysztof Kosiński 
78*a03ca8b9SKrzysztof Kosiński /******** MatchingInfoOut ********/
79*a03ca8b9SKrzysztof Kosiński 
80*a03ca8b9SKrzysztof Kosiński // A class to output detailed information during ensemble matching. Extracting
81*a03ca8b9SKrzysztof Kosiński // the functionality to a separate class decouples formatting and printing logic
82*a03ca8b9SKrzysztof Kosiński // from matching logic. The base class consists of stubs.
83*a03ca8b9SKrzysztof Kosiński class MatchingInfoOut {
84*a03ca8b9SKrzysztof Kosiński  protected:
85*a03ca8b9SKrzysztof Kosiński   MatchingInfoOut() = default;
86*a03ca8b9SKrzysztof Kosiński   MatchingInfoOut(const MatchingInfoOut&) = delete;
87*a03ca8b9SKrzysztof Kosiński   const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete;
88*a03ca8b9SKrzysztof Kosiński 
89*a03ca8b9SKrzysztof Kosiński  public:
90*a03ca8b9SKrzysztof Kosiński   virtual ~MatchingInfoOut() = default;
InitSizes(size_t old_size,size_t new_size)91*a03ca8b9SKrzysztof Kosiński   virtual void InitSizes(size_t old_size, size_t new_size) {}
DeclareTypeMismatch(int iold,int inew)92*a03ca8b9SKrzysztof Kosiński   virtual void DeclareTypeMismatch(int iold, int inew) {}
DeclareUnsafeDistance(int iold,int inew)93*a03ca8b9SKrzysztof Kosiński   virtual void DeclareUnsafeDistance(int iold, int inew) {}
DeclareCandidate(int iold,int inew)94*a03ca8b9SKrzysztof Kosiński   virtual void DeclareCandidate(int iold, int inew) {}
DeclareMatch(int iold,int inew,double dist,bool is_identical)95*a03ca8b9SKrzysztof Kosiński   virtual void DeclareMatch(int iold,
96*a03ca8b9SKrzysztof Kosiński                             int inew,
97*a03ca8b9SKrzysztof Kosiński                             double dist,
98*a03ca8b9SKrzysztof Kosiński                             bool is_identical) {}
DeclareOutlier(int iold,int inew)99*a03ca8b9SKrzysztof Kosiński   virtual void DeclareOutlier(int iold, int inew) {}
100*a03ca8b9SKrzysztof Kosiński 
OutputCompare(const Element & old_element,const Element & new_element,double dist)101*a03ca8b9SKrzysztof Kosiński   virtual void OutputCompare(const Element& old_element,
102*a03ca8b9SKrzysztof Kosiński                              const Element& new_element,
103*a03ca8b9SKrzysztof Kosiński                              double dist) {}
104*a03ca8b9SKrzysztof Kosiński 
OutputMatch(const Element & best_old_element,const Element & new_element,bool is_identical,double best_dist)105*a03ca8b9SKrzysztof Kosiński   virtual void OutputMatch(const Element& best_old_element,
106*a03ca8b9SKrzysztof Kosiński                            const Element& new_element,
107*a03ca8b9SKrzysztof Kosiński                            bool is_identical,
108*a03ca8b9SKrzysztof Kosiński                            double best_dist) {}
109*a03ca8b9SKrzysztof Kosiński 
OutputScores(const std::string & stats)110*a03ca8b9SKrzysztof Kosiński   virtual void OutputScores(const std::string& stats) {}
111*a03ca8b9SKrzysztof Kosiński 
OutputTextGrid()112*a03ca8b9SKrzysztof Kosiński   virtual void OutputTextGrid() {}
113*a03ca8b9SKrzysztof Kosiński };
114*a03ca8b9SKrzysztof Kosiński 
115*a03ca8b9SKrzysztof Kosiński /******** MatchingInfoTerse ********/
116*a03ca8b9SKrzysztof Kosiński 
117*a03ca8b9SKrzysztof Kosiński // A terse MatchingInfoOut that prints only basic information, using LOG().
118*a03ca8b9SKrzysztof Kosiński class MatchingInfoOutTerse : public MatchingInfoOut {
119*a03ca8b9SKrzysztof Kosiński  public:
120*a03ca8b9SKrzysztof Kosiński   MatchingInfoOutTerse() = default;
121*a03ca8b9SKrzysztof Kosiński   MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete;
122*a03ca8b9SKrzysztof Kosiński   const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete;
123*a03ca8b9SKrzysztof Kosiński   ~MatchingInfoOutTerse() override = default;
124*a03ca8b9SKrzysztof Kosiński 
OutputScores(const std::string & stats)125*a03ca8b9SKrzysztof Kosiński   void OutputScores(const std::string& stats) override {
126*a03ca8b9SKrzysztof Kosiński     LOG(INFO) << "Best dists: " << stats;
127*a03ca8b9SKrzysztof Kosiński   }
128*a03ca8b9SKrzysztof Kosiński };
129*a03ca8b9SKrzysztof Kosiński 
130*a03ca8b9SKrzysztof Kosiński /******** MatchingInfoOutVerbose ********/
131*a03ca8b9SKrzysztof Kosiński 
132*a03ca8b9SKrzysztof Kosiński // A verbose MatchingInfoOut that prints detailed information using |out_|,
133*a03ca8b9SKrzysztof Kosiński // including comparison pairs, scores, and a text grid representation of
134*a03ca8b9SKrzysztof Kosiński // pairwise matching results.
135*a03ca8b9SKrzysztof Kosiński class MatchingInfoOutVerbose : public MatchingInfoOut {
136*a03ca8b9SKrzysztof Kosiński  public:
MatchingInfoOutVerbose(std::ostream & out)137*a03ca8b9SKrzysztof Kosiński   explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {}
138*a03ca8b9SKrzysztof Kosiński   MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete;
139*a03ca8b9SKrzysztof Kosiński   const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) =
140*a03ca8b9SKrzysztof Kosiński       delete;
141*a03ca8b9SKrzysztof Kosiński   ~MatchingInfoOutVerbose() override = default;
142*a03ca8b9SKrzysztof Kosiński 
143*a03ca8b9SKrzysztof Kosiński   // Outputs sizes and initializes |text_grid_|.
InitSizes(size_t old_size,size_t new_size)144*a03ca8b9SKrzysztof Kosiński   void InitSizes(size_t old_size, size_t new_size) override {
145*a03ca8b9SKrzysztof Kosiński     out_ << "Comparing old (" << old_size << " elements) and new (" << new_size
146*a03ca8b9SKrzysztof Kosiński          << " elements)" << std::endl;
147*a03ca8b9SKrzysztof Kosiński     text_grid_.assign(new_size, std::string(old_size, '-'));
148*a03ca8b9SKrzysztof Kosiński     best_dist_.assign(new_size, -1.0);
149*a03ca8b9SKrzysztof Kosiński   }
150*a03ca8b9SKrzysztof Kosiński 
151*a03ca8b9SKrzysztof Kosiński   // Functions to update match status in text grid representation.
152*a03ca8b9SKrzysztof Kosiński 
DeclareTypeMismatch(int iold,int inew)153*a03ca8b9SKrzysztof Kosiński   void DeclareTypeMismatch(int iold, int inew) override {
154*a03ca8b9SKrzysztof Kosiński     text_grid_[inew][iold] = 'T';
155*a03ca8b9SKrzysztof Kosiński   }
DeclareUnsafeDistance(int iold,int inew)156*a03ca8b9SKrzysztof Kosiński   void DeclareUnsafeDistance(int iold, int inew) override {
157*a03ca8b9SKrzysztof Kosiński     text_grid_[inew][iold] = 'U';
158*a03ca8b9SKrzysztof Kosiński   }
DeclareCandidate(int iold,int inew)159*a03ca8b9SKrzysztof Kosiński   void DeclareCandidate(int iold, int inew) override {
160*a03ca8b9SKrzysztof Kosiński     text_grid_[inew][iold] = 'C';  // Provisional.
161*a03ca8b9SKrzysztof Kosiński   }
DeclareMatch(int iold,int inew,double dist,bool is_identical)162*a03ca8b9SKrzysztof Kosiński   void DeclareMatch(int iold,
163*a03ca8b9SKrzysztof Kosiński                     int inew,
164*a03ca8b9SKrzysztof Kosiński                     double dist,
165*a03ca8b9SKrzysztof Kosiński                     bool is_identical) override {
166*a03ca8b9SKrzysztof Kosiński     text_grid_[inew][iold] = is_identical ? 'I' : 'M';
167*a03ca8b9SKrzysztof Kosiński     best_dist_[inew] = dist;
168*a03ca8b9SKrzysztof Kosiński   }
DeclareOutlier(int iold,int inew)169*a03ca8b9SKrzysztof Kosiński   void DeclareOutlier(int iold, int inew) override {
170*a03ca8b9SKrzysztof Kosiński     text_grid_[inew][iold] = 'O';
171*a03ca8b9SKrzysztof Kosiński   }
172*a03ca8b9SKrzysztof Kosiński 
173*a03ca8b9SKrzysztof Kosiński   // Functions to print detailed information.
174*a03ca8b9SKrzysztof Kosiński 
OutputCompare(const Element & old_element,const Element & new_element,double dist)175*a03ca8b9SKrzysztof Kosiński   void OutputCompare(const Element& old_element,
176*a03ca8b9SKrzysztof Kosiński                      const Element& new_element,
177*a03ca8b9SKrzysztof Kosiński                      double dist) override {
178*a03ca8b9SKrzysztof Kosiński     out_ << "Compare old" << old_element << " to new" << new_element << " --> "
179*a03ca8b9SKrzysztof Kosiński          << base::StringPrintf("%.5f", dist) << std::endl;
180*a03ca8b9SKrzysztof Kosiński   }
181*a03ca8b9SKrzysztof Kosiński 
OutputMatch(const Element & best_old_element,const Element & new_element,bool is_identical,double best_dist)182*a03ca8b9SKrzysztof Kosiński   void OutputMatch(const Element& best_old_element,
183*a03ca8b9SKrzysztof Kosiński                    const Element& new_element,
184*a03ca8b9SKrzysztof Kosiński                    bool is_identical,
185*a03ca8b9SKrzysztof Kosiński                    double best_dist) override {
186*a03ca8b9SKrzysztof Kosiński     if (is_identical) {
187*a03ca8b9SKrzysztof Kosiński       out_ << "Skipped old" << best_old_element << " - identical to new"
188*a03ca8b9SKrzysztof Kosiński            << new_element;
189*a03ca8b9SKrzysztof Kosiński     } else {
190*a03ca8b9SKrzysztof Kosiński       out_ << "Matched old" << best_old_element << " to new" << new_element
191*a03ca8b9SKrzysztof Kosiński            << " --> " << base::StringPrintf("%.5f", best_dist);
192*a03ca8b9SKrzysztof Kosiński     }
193*a03ca8b9SKrzysztof Kosiński     out_ << std::endl;
194*a03ca8b9SKrzysztof Kosiński   }
195*a03ca8b9SKrzysztof Kosiński 
OutputScores(const std::string & stats)196*a03ca8b9SKrzysztof Kosiński   void OutputScores(const std::string& stats) override {
197*a03ca8b9SKrzysztof Kosiński     out_ << "Best dists: " << stats << std::endl;
198*a03ca8b9SKrzysztof Kosiński   }
199*a03ca8b9SKrzysztof Kosiński 
OutputTextGrid()200*a03ca8b9SKrzysztof Kosiński   void OutputTextGrid() override {
201*a03ca8b9SKrzysztof Kosiński     int new_size = static_cast<int>(text_grid_.size());
202*a03ca8b9SKrzysztof Kosiński     for (int inew = 0; inew < new_size; ++inew) {
203*a03ca8b9SKrzysztof Kosiński       const std::string& line = text_grid_[inew];
204*a03ca8b9SKrzysztof Kosiński       out_ << "  ";
205*a03ca8b9SKrzysztof Kosiński       for (char ch : line) {
206*a03ca8b9SKrzysztof Kosiński         char prefix = (ch == 'I' || ch == 'M') ? '(' : ' ';
207*a03ca8b9SKrzysztof Kosiński         char suffix = (ch == 'I' || ch == 'M') ? ')' : ' ';
208*a03ca8b9SKrzysztof Kosiński         out_ << prefix << ch << suffix;
209*a03ca8b9SKrzysztof Kosiński       }
210*a03ca8b9SKrzysztof Kosiński       if (best_dist_[inew] >= 0)
211*a03ca8b9SKrzysztof Kosiński         out_ << "   " << base::StringPrintf("%.5f", best_dist_[inew]);
212*a03ca8b9SKrzysztof Kosiński       out_ << std::endl;
213*a03ca8b9SKrzysztof Kosiński     }
214*a03ca8b9SKrzysztof Kosiński     if (!text_grid_.empty()) {
215*a03ca8b9SKrzysztof Kosiński       out_ << "  Legend: I = identical, M = matched, T = type mismatch, "
216*a03ca8b9SKrzysztof Kosiński               "U = unsafe distance, C = candidate, O = outlier, - = skipped."
217*a03ca8b9SKrzysztof Kosiński            << std::endl;
218*a03ca8b9SKrzysztof Kosiński     }
219*a03ca8b9SKrzysztof Kosiński   }
220*a03ca8b9SKrzysztof Kosiński 
221*a03ca8b9SKrzysztof Kosiński  private:
222*a03ca8b9SKrzysztof Kosiński   std::ostream& out_;
223*a03ca8b9SKrzysztof Kosiński 
224*a03ca8b9SKrzysztof Kosiński   // Text grid representation of matches. Rows correspond to "old" and columns
225*a03ca8b9SKrzysztof Kosiński   // correspond to "new".
226*a03ca8b9SKrzysztof Kosiński   std::vector<std::string> text_grid_;
227*a03ca8b9SKrzysztof Kosiński 
228*a03ca8b9SKrzysztof Kosiński   // For each "new" element, distance of best match. -1 denotes no match.
229*a03ca8b9SKrzysztof Kosiński   std::vector<double> best_dist_;
230*a03ca8b9SKrzysztof Kosiński };
231*a03ca8b9SKrzysztof Kosiński 
232*a03ca8b9SKrzysztof Kosiński }  // namespace
233*a03ca8b9SKrzysztof Kosiński 
234*a03ca8b9SKrzysztof Kosiński /******** HeuristicEnsembleMatcher ********/
235*a03ca8b9SKrzysztof Kosiński 
HeuristicEnsembleMatcher(std::ostream * out)236*a03ca8b9SKrzysztof Kosiński HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out)
237*a03ca8b9SKrzysztof Kosiński     : out_(out) {}
238*a03ca8b9SKrzysztof Kosiński 
239*a03ca8b9SKrzysztof Kosiński HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
240*a03ca8b9SKrzysztof Kosiński 
RunMatch(ConstBufferView old_image,ConstBufferView new_image)241*a03ca8b9SKrzysztof Kosiński bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image,
242*a03ca8b9SKrzysztof Kosiński                                         ConstBufferView new_image) {
243*a03ca8b9SKrzysztof Kosiński   DCHECK(matches_.empty());
244*a03ca8b9SKrzysztof Kosiński   LOG(INFO) << "Start matching.";
245*a03ca8b9SKrzysztof Kosiński 
246*a03ca8b9SKrzysztof Kosiński   // Find all elements in "old" and "new".
247*a03ca8b9SKrzysztof Kosiński   std::optional<std::vector<Element>> old_elements =
248*a03ca8b9SKrzysztof Kosiński       FindEmbeddedElements(old_image, "Old file",
249*a03ca8b9SKrzysztof Kosiński                            base::BindRepeating(DetectElementFromDisassembler));
250*a03ca8b9SKrzysztof Kosiński   if (!old_elements.has_value())
251*a03ca8b9SKrzysztof Kosiński     return false;
252*a03ca8b9SKrzysztof Kosiński   std::optional<std::vector<Element>> new_elements =
253*a03ca8b9SKrzysztof Kosiński       FindEmbeddedElements(new_image, "New file",
254*a03ca8b9SKrzysztof Kosiński                            base::BindRepeating(DetectElementFromDisassembler));
255*a03ca8b9SKrzysztof Kosiński   if (!new_elements.has_value())
256*a03ca8b9SKrzysztof Kosiński     return false;
257*a03ca8b9SKrzysztof Kosiński 
258*a03ca8b9SKrzysztof Kosiński   std::unique_ptr<MatchingInfoOut> info_out;
259*a03ca8b9SKrzysztof Kosiński   if (out_)
260*a03ca8b9SKrzysztof Kosiński     info_out = std::make_unique<MatchingInfoOutVerbose>(*out_);
261*a03ca8b9SKrzysztof Kosiński   else
262*a03ca8b9SKrzysztof Kosiński     info_out = std::make_unique<MatchingInfoOutTerse>();
263*a03ca8b9SKrzysztof Kosiński 
264*a03ca8b9SKrzysztof Kosiński   const int num_new_elements = base::checked_cast<int>(new_elements->size());
265*a03ca8b9SKrzysztof Kosiński   const int num_old_elements = base::checked_cast<int>(old_elements->size());
266*a03ca8b9SKrzysztof Kosiński   info_out->InitSizes(num_old_elements, num_new_elements);
267*a03ca8b9SKrzysztof Kosiński 
268*a03ca8b9SKrzysztof Kosiński   // For each "new" element, match it with the "old" element that's nearest to
269*a03ca8b9SKrzysztof Kosiński   // it, with distance determined by BinaryDataHistogram. The resulting
270*a03ca8b9SKrzysztof Kosiński   // "old"-"new" pairs are stored into |results|. Possibilities:
271*a03ca8b9SKrzysztof Kosiński   // - Type mismatch: No match.
272*a03ca8b9SKrzysztof Kosiński   // - UnsafeDifference() heuristics fail: No match.
273*a03ca8b9SKrzysztof Kosiński   // - Identical match: Skip "new" since this is a trivial case.
274*a03ca8b9SKrzysztof Kosiński   // - Non-identical match: Match "new" with "old" with min distance.
275*a03ca8b9SKrzysztof Kosiński   // - No match: Skip "new".
276*a03ca8b9SKrzysztof Kosiński   struct Results {
277*a03ca8b9SKrzysztof Kosiński     int iold;
278*a03ca8b9SKrzysztof Kosiński     int inew;
279*a03ca8b9SKrzysztof Kosiński     double dist;
280*a03ca8b9SKrzysztof Kosiński   };
281*a03ca8b9SKrzysztof Kosiński   std::vector<Results> results;
282*a03ca8b9SKrzysztof Kosiński 
283*a03ca8b9SKrzysztof Kosiński   // Precompute histograms for "old" since they get reused.
284*a03ca8b9SKrzysztof Kosiński   std::vector<BinaryDataHistogram> old_his(num_old_elements);
285*a03ca8b9SKrzysztof Kosiński   for (int iold = 0; iold < num_old_elements; ++iold) {
286*a03ca8b9SKrzysztof Kosiński     ConstBufferView sub_image(old_image[(*old_elements)[iold]]);
287*a03ca8b9SKrzysztof Kosiński     old_his[iold].Compute(sub_image);
288*a03ca8b9SKrzysztof Kosiński     // ProgramDetector should have imposed minimal size limit to |sub_image|.
289*a03ca8b9SKrzysztof Kosiński     // Therefore resulting histogram are expected to be valid.
290*a03ca8b9SKrzysztof Kosiński     CHECK(old_his[iold].IsValid());
291*a03ca8b9SKrzysztof Kosiński   }
292*a03ca8b9SKrzysztof Kosiński 
293*a03ca8b9SKrzysztof Kosiński   const int kUninitIold = num_old_elements;
294*a03ca8b9SKrzysztof Kosiński   for (int inew = 0; inew < num_new_elements; ++inew) {
295*a03ca8b9SKrzysztof Kosiński     const Element& cur_new_element = (*new_elements)[inew];
296*a03ca8b9SKrzysztof Kosiński     ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]);
297*a03ca8b9SKrzysztof Kosiński     BinaryDataHistogram new_his;
298*a03ca8b9SKrzysztof Kosiński     new_his.Compute(cur_new_sub_image);
299*a03ca8b9SKrzysztof Kosiński     CHECK(new_his.IsValid());
300*a03ca8b9SKrzysztof Kosiński 
301*a03ca8b9SKrzysztof Kosiński     double best_dist = HUGE_VAL;
302*a03ca8b9SKrzysztof Kosiński     int best_iold = kUninitIold;
303*a03ca8b9SKrzysztof Kosiński     bool is_identical = false;
304*a03ca8b9SKrzysztof Kosiński 
305*a03ca8b9SKrzysztof Kosiński     for (int iold = 0; iold < num_old_elements; ++iold) {
306*a03ca8b9SKrzysztof Kosiński       const Element& cur_old_element = (*old_elements)[iold];
307*a03ca8b9SKrzysztof Kosiński       if (cur_old_element.exe_type != cur_new_element.exe_type) {
308*a03ca8b9SKrzysztof Kosiński         info_out->DeclareTypeMismatch(iold, inew);
309*a03ca8b9SKrzysztof Kosiński         continue;
310*a03ca8b9SKrzysztof Kosiński       }
311*a03ca8b9SKrzysztof Kosiński       if (UnsafeDifference(cur_old_element, cur_new_element)) {
312*a03ca8b9SKrzysztof Kosiński         info_out->DeclareUnsafeDistance(iold, inew);
313*a03ca8b9SKrzysztof Kosiński         continue;
314*a03ca8b9SKrzysztof Kosiński       }
315*a03ca8b9SKrzysztof Kosiński       double dist = old_his[iold].Distance(new_his);
316*a03ca8b9SKrzysztof Kosiński       info_out->DeclareCandidate(iold, inew);
317*a03ca8b9SKrzysztof Kosiński       info_out->OutputCompare(cur_old_element, cur_new_element, dist);
318*a03ca8b9SKrzysztof Kosiński       if (best_dist > dist) {  // Tie resolution: First-one, first-serve.
319*a03ca8b9SKrzysztof Kosiński         best_iold = iold;
320*a03ca8b9SKrzysztof Kosiński         best_dist = dist;
321*a03ca8b9SKrzysztof Kosiński         if (best_dist == 0) {
322*a03ca8b9SKrzysztof Kosiński           ConstBufferView sub_image(old_image[cur_old_element.region()]);
323*a03ca8b9SKrzysztof Kosiński           if (sub_image.equals(cur_new_sub_image)) {
324*a03ca8b9SKrzysztof Kosiński             is_identical = true;
325*a03ca8b9SKrzysztof Kosiński             break;
326*a03ca8b9SKrzysztof Kosiński           }
327*a03ca8b9SKrzysztof Kosiński         }
328*a03ca8b9SKrzysztof Kosiński       }
329*a03ca8b9SKrzysztof Kosiński     }
330*a03ca8b9SKrzysztof Kosiński 
331*a03ca8b9SKrzysztof Kosiński     if (best_iold != kUninitIold) {
332*a03ca8b9SKrzysztof Kosiński       const Element& best_old_element = (*old_elements)[best_iold];
333*a03ca8b9SKrzysztof Kosiński       info_out->DeclareMatch(best_iold, inew, best_dist, is_identical);
334*a03ca8b9SKrzysztof Kosiński       if (is_identical)  // Skip "new" if identical match is found.
335*a03ca8b9SKrzysztof Kosiński         ++num_identical_;
336*a03ca8b9SKrzysztof Kosiński       else
337*a03ca8b9SKrzysztof Kosiński         results.push_back({best_iold, inew, best_dist});
338*a03ca8b9SKrzysztof Kosiński       info_out->OutputMatch(best_old_element, cur_new_element, is_identical,
339*a03ca8b9SKrzysztof Kosiński                             best_dist);
340*a03ca8b9SKrzysztof Kosiński     }
341*a03ca8b9SKrzysztof Kosiński   }
342*a03ca8b9SKrzysztof Kosiński 
343*a03ca8b9SKrzysztof Kosiński   // Populate |matches_| from |result|. To reduce that chance of false-positive
344*a03ca8b9SKrzysztof Kosiński   // matches, statistics on dists are computed. If a match's |dist| is an
345*a03ca8b9SKrzysztof Kosiński   // outlier then it is rejected.
346*a03ca8b9SKrzysztof Kosiński   if (results.size() > 0) {
347*a03ca8b9SKrzysztof Kosiński     OutlierDetector detector;
348*a03ca8b9SKrzysztof Kosiński     for (const auto& result : results) {
349*a03ca8b9SKrzysztof Kosiński       if (result.dist > 0)
350*a03ca8b9SKrzysztof Kosiński         detector.Add(result.dist);
351*a03ca8b9SKrzysztof Kosiński     }
352*a03ca8b9SKrzysztof Kosiński     detector.Prepare();
353*a03ca8b9SKrzysztof Kosiński     info_out->OutputScores(detector.RenderStats());
354*a03ca8b9SKrzysztof Kosiński     for (const Results& result : results) {
355*a03ca8b9SKrzysztof Kosiński       if (detector.DecideOutlier(result.dist) > 0) {
356*a03ca8b9SKrzysztof Kosiński         info_out->DeclareOutlier(result.iold, result.inew);
357*a03ca8b9SKrzysztof Kosiński       } else {
358*a03ca8b9SKrzysztof Kosiński         matches_.push_back(
359*a03ca8b9SKrzysztof Kosiński             {(*old_elements)[result.iold], (*new_elements)[result.inew]});
360*a03ca8b9SKrzysztof Kosiński       }
361*a03ca8b9SKrzysztof Kosiński     }
362*a03ca8b9SKrzysztof Kosiński     info_out->OutputTextGrid();
363*a03ca8b9SKrzysztof Kosiński   }
364*a03ca8b9SKrzysztof Kosiński 
365*a03ca8b9SKrzysztof Kosiński   Trim();
366*a03ca8b9SKrzysztof Kosiński   return true;
367*a03ca8b9SKrzysztof Kosiński }
368*a03ca8b9SKrzysztof Kosiński 
369*a03ca8b9SKrzysztof Kosiński }  // namespace zucchini
370