1 // Copyright 2017 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 6 #define COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 7 8 #include <stddef.h> 9 #include <stdint.h> 10 11 #include <memory> 12 #include <string> 13 14 #include "components/zucchini/buffer_view.h" 15 16 namespace zucchini { 17 18 // A class to detect outliers in a list of doubles using Chauvenet's criterion: 19 // Compute mean and standard deviation of observations, then determine whether 20 // a query value lies beyond a fixed number of standard deviations (sigmas) from 21 // the mean. The purpose of this test is to reduce the chance of false-positive 22 // ensemble matches. 23 class OutlierDetector { 24 public: 25 OutlierDetector(); 26 OutlierDetector(const OutlierDetector&) = delete; 27 const OutlierDetector& operator=(const OutlierDetector&) = delete; 28 ~OutlierDetector(); 29 30 // Incorporates |sample| into mean and standard deviation. 31 void Add(double sample); 32 33 // Prepares basic statistics for DecideOutlier() calls. Should be called after 34 // all samples have been added. 35 void Prepare(); 36 37 // Renders current statistics as strings for logging. 38 std::string RenderStats(); 39 40 // Heuristically decides whether |sample| is an outlier. Returns 1 if |sample| 41 // is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low". 42 // Must be called after Prepare(). 43 int DecideOutlier(double sample); 44 45 private: 46 size_t n_ = 0; 47 double sum_ = 0; 48 double sum_of_squares_ = 0; 49 double mean_ = 0; 50 double standard_deviation_ = 0; 51 }; 52 53 // A class to compute similarity score between binary data. The heuristic here 54 // preprocesses input data to a size-65536 histogram, counting the frequency of 55 // consecutive 2-byte sequences. Therefore data with lengths < 2 are considered 56 // invalid -- but this is okay for Zucchini's use case. 57 class BinaryDataHistogram { 58 public: 59 BinaryDataHistogram(); 60 BinaryDataHistogram(const BinaryDataHistogram&) = delete; 61 const BinaryDataHistogram& operator=(const BinaryDataHistogram&) = delete; 62 ~BinaryDataHistogram(); 63 64 // Attempts to compute the histogram, returns true iff successful. 65 bool Compute(ConstBufferView region); 66 IsValid()67 bool IsValid() const { return static_cast<bool>(histogram_); } 68 69 // Returns distance to another histogram (heuristics). If two binaries are 70 // identical then their histogram distance is 0. However, the converse is not 71 // true in general. For example, "aba" and "bab" are different, but their 72 // histogram distance is 0 (both histograms are {"ab": 1, "ba": 1}). 73 double Distance(const BinaryDataHistogram& other) const; 74 75 private: 76 enum { kNumBins = 1 << (sizeof(uint16_t) * 8) }; 77 static_assert(kNumBins == 65536, "Incorrect constant computation."); 78 79 // Size, in bytes, of the data over which the histogram was computed. 80 size_t size_ = 0; 81 82 // 2^16 buckets holding counts of all 2-byte sequences in the data. The counts 83 // are stored as signed values to simplify computing the distance between two 84 // histograms. 85 std::unique_ptr<int32_t[]> histogram_; 86 }; 87 88 } // namespace zucchini 89 90 #endif // COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 91