1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
6 #define COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
7 
8 #include <stddef.h>
9 #include <stdint.h>
10 
11 #include <memory>
12 #include <string>
13 
14 #include "components/zucchini/buffer_view.h"
15 
16 namespace zucchini {
17 
18 // A class to detect outliers in a list of doubles using Chauvenet's criterion:
19 // Compute mean and standard deviation of observations, then determine whether
20 // a query value lies beyond a fixed number of standard deviations (sigmas) from
21 // the mean. The purpose of this test is to reduce the chance of false-positive
22 // ensemble matches.
23 class OutlierDetector {
24  public:
25   OutlierDetector();
26   OutlierDetector(const OutlierDetector&) = delete;
27   const OutlierDetector& operator=(const OutlierDetector&) = delete;
28   ~OutlierDetector();
29 
30   // Incorporates |sample| into mean and standard deviation.
31   void Add(double sample);
32 
33   // Prepares basic statistics for DecideOutlier() calls. Should be called after
34   // all samples have been added.
35   void Prepare();
36 
37   // Renders current statistics as strings for logging.
38   std::string RenderStats();
39 
40   // Heuristically decides whether |sample| is an outlier. Returns 1 if |sample|
41   // is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low".
42   // Must be called after Prepare().
43   int DecideOutlier(double sample);
44 
45  private:
46   size_t n_ = 0;
47   double sum_ = 0;
48   double sum_of_squares_ = 0;
49   double mean_ = 0;
50   double standard_deviation_ = 0;
51 };
52 
53 // A class to compute similarity score between binary data. The heuristic here
54 // preprocesses input data to a size-65536 histogram, counting the frequency of
55 // consecutive 2-byte sequences. Therefore data with lengths < 2 are considered
56 // invalid -- but this is okay for Zucchini's use case.
57 class BinaryDataHistogram {
58  public:
59   BinaryDataHistogram();
60   BinaryDataHistogram(const BinaryDataHistogram&) = delete;
61   const BinaryDataHistogram& operator=(const BinaryDataHistogram&) = delete;
62   ~BinaryDataHistogram();
63 
64   // Attempts to compute the histogram, returns true iff successful.
65   bool Compute(ConstBufferView region);
66 
IsValid()67   bool IsValid() const { return static_cast<bool>(histogram_); }
68 
69   // Returns distance to another histogram (heuristics). If two binaries are
70   // identical then their histogram distance is 0. However, the converse is not
71   // true in general. For example, "aba" and "bab" are different, but their
72   // histogram distance is 0 (both histograms are {"ab": 1, "ba": 1}).
73   double Distance(const BinaryDataHistogram& other) const;
74 
75  private:
76   enum { kNumBins = 1 << (sizeof(uint16_t) * 8) };
77   static_assert(kNumBins == 65536, "Incorrect constant computation.");
78 
79   // Size, in bytes, of the data over which the histogram was computed.
80   size_t size_ = 0;
81 
82   // 2^16 buckets holding counts of all 2-byte sequences in the data. The counts
83   // are stored as signed values to simplify computing the distance between two
84   // histograms.
85   std::unique_ptr<int32_t[]> histogram_;
86 };
87 
88 }  // namespace zucchini
89 
90 #endif  // COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
91