xref: /aosp_15_r20/external/zucchini/binary_data_histogram.cc (revision a03ca8b91e029cd15055c20c78c2e087c84792e4)
1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved.
2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be
3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file.
4*a03ca8b9SKrzysztof Kosiński 
5*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/binary_data_histogram.h"
6*a03ca8b9SKrzysztof Kosiński 
7*a03ca8b9SKrzysztof Kosiński #include <algorithm>
8*a03ca8b9SKrzysztof Kosiński #include <cmath>
9*a03ca8b9SKrzysztof Kosiński #include <limits>
10*a03ca8b9SKrzysztof Kosiński 
11*a03ca8b9SKrzysztof Kosiński #include "base/check_op.h"
12*a03ca8b9SKrzysztof Kosiński #include "base/format_macros.h"
13*a03ca8b9SKrzysztof Kosiński #include "base/strings/stringprintf.h"
14*a03ca8b9SKrzysztof Kosiński 
15*a03ca8b9SKrzysztof Kosiński namespace zucchini {
16*a03ca8b9SKrzysztof Kosiński 
17*a03ca8b9SKrzysztof Kosiński /******** OutlierDetector ********/
18*a03ca8b9SKrzysztof Kosiński 
19*a03ca8b9SKrzysztof Kosiński OutlierDetector::OutlierDetector() = default;
20*a03ca8b9SKrzysztof Kosiński 
21*a03ca8b9SKrzysztof Kosiński OutlierDetector::~OutlierDetector() = default;
22*a03ca8b9SKrzysztof Kosiński 
23*a03ca8b9SKrzysztof Kosiński // For BinaryDataHistogram, |sample| is typically in interval [0, 1].
Add(double sample)24*a03ca8b9SKrzysztof Kosiński void OutlierDetector::Add(double sample) {
25*a03ca8b9SKrzysztof Kosiński   ++n_;
26*a03ca8b9SKrzysztof Kosiński   sum_ += sample;
27*a03ca8b9SKrzysztof Kosiński   sum_of_squares_ += sample * sample;
28*a03ca8b9SKrzysztof Kosiński }
29*a03ca8b9SKrzysztof Kosiński 
Prepare()30*a03ca8b9SKrzysztof Kosiński void OutlierDetector::Prepare() {
31*a03ca8b9SKrzysztof Kosiński   if (n_ > 0) {
32*a03ca8b9SKrzysztof Kosiński     mean_ = sum_ / n_;
33*a03ca8b9SKrzysztof Kosiński     standard_deviation_ = ::sqrt((sum_of_squares_ - sum_ * mean_) /
34*a03ca8b9SKrzysztof Kosiński                                  std::max(static_cast<size_t>(1), n_ - 1));
35*a03ca8b9SKrzysztof Kosiński   }
36*a03ca8b9SKrzysztof Kosiński }
37*a03ca8b9SKrzysztof Kosiński 
RenderStats()38*a03ca8b9SKrzysztof Kosiński std::string OutlierDetector::RenderStats() {
39*a03ca8b9SKrzysztof Kosiński   return base::StringPrintf("Mean = %.5f, StdDev = %.5f over %" PRIuS
40*a03ca8b9SKrzysztof Kosiński                             " samples",
41*a03ca8b9SKrzysztof Kosiński                             mean_, standard_deviation_, n_);
42*a03ca8b9SKrzysztof Kosiński }
43*a03ca8b9SKrzysztof Kosiński 
44*a03ca8b9SKrzysztof Kosiński // Constants are chosen for BinaryDataHistogram, where |sample| is typically in
45*a03ca8b9SKrzysztof Kosiński // [0, 1].
DecideOutlier(double sample)46*a03ca8b9SKrzysztof Kosiński int OutlierDetector::DecideOutlier(double sample) {
47*a03ca8b9SKrzysztof Kosiński   // Lower bound to avoid divide-by-zero and penalizing tight clusters.
48*a03ca8b9SKrzysztof Kosiński   constexpr double kMinTolerance = 0.1;
49*a03ca8b9SKrzysztof Kosiński   // Number of standard deviations away from mean for value to become outlier.
50*a03ca8b9SKrzysztof Kosiński   constexpr double kSigmaBound = 1.9;
51*a03ca8b9SKrzysztof Kosiński   if (n_ <= 1)
52*a03ca8b9SKrzysztof Kosiński     return 0;
53*a03ca8b9SKrzysztof Kosiński   double tolerance = std::max(kMinTolerance, standard_deviation_);
54*a03ca8b9SKrzysztof Kosiński   double num_sigma = (sample - mean_) / tolerance;
55*a03ca8b9SKrzysztof Kosiński   return num_sigma > kSigmaBound ? 1 : num_sigma < -kSigmaBound ? -1 : 0;
56*a03ca8b9SKrzysztof Kosiński }
57*a03ca8b9SKrzysztof Kosiński 
58*a03ca8b9SKrzysztof Kosiński /******** BinaryDataHistogram ********/
59*a03ca8b9SKrzysztof Kosiński 
60*a03ca8b9SKrzysztof Kosiński BinaryDataHistogram::BinaryDataHistogram() = default;
61*a03ca8b9SKrzysztof Kosiński 
62*a03ca8b9SKrzysztof Kosiński BinaryDataHistogram::~BinaryDataHistogram() = default;
63*a03ca8b9SKrzysztof Kosiński 
Compute(ConstBufferView region)64*a03ca8b9SKrzysztof Kosiński bool BinaryDataHistogram::Compute(ConstBufferView region) {
65*a03ca8b9SKrzysztof Kosiński   DCHECK(!histogram_);
66*a03ca8b9SKrzysztof Kosiński   // Binary data with size < 2 are invalid.
67*a03ca8b9SKrzysztof Kosiński   if (region.size() < sizeof(uint16_t))
68*a03ca8b9SKrzysztof Kosiński     return false;
69*a03ca8b9SKrzysztof Kosiński   DCHECK_LE(region.size(),
70*a03ca8b9SKrzysztof Kosiński             static_cast<size_t>(std::numeric_limits<int32_t>::max()));
71*a03ca8b9SKrzysztof Kosiński 
72*a03ca8b9SKrzysztof Kosiński   histogram_ = std::make_unique<int32_t[]>(kNumBins);
73*a03ca8b9SKrzysztof Kosiński   size_ = region.size();
74*a03ca8b9SKrzysztof Kosiński   // Number of 2-byte intervals fully contained in |region|.
75*a03ca8b9SKrzysztof Kosiński   size_t bound = size_ - sizeof(uint16_t) + 1;
76*a03ca8b9SKrzysztof Kosiński   for (size_t i = 0; i < bound; ++i)
77*a03ca8b9SKrzysztof Kosiński     ++histogram_[region.read<uint16_t>(i)];
78*a03ca8b9SKrzysztof Kosiński   return true;
79*a03ca8b9SKrzysztof Kosiński }
80*a03ca8b9SKrzysztof Kosiński 
Distance(const BinaryDataHistogram & other) const81*a03ca8b9SKrzysztof Kosiński double BinaryDataHistogram::Distance(const BinaryDataHistogram& other) const {
82*a03ca8b9SKrzysztof Kosiński   DCHECK(IsValid() && other.IsValid());
83*a03ca8b9SKrzysztof Kosiński   // Compute Manhattan (L1) distance between respective histograms.
84*a03ca8b9SKrzysztof Kosiński   double total_diff = 0;
85*a03ca8b9SKrzysztof Kosiński   for (int i = 0; i < kNumBins; ++i)
86*a03ca8b9SKrzysztof Kosiński     total_diff += std::abs(histogram_[i] - other.histogram_[i]);
87*a03ca8b9SKrzysztof Kosiński   // Normalize by total size, so result lies in [0, 1].
88*a03ca8b9SKrzysztof Kosiński   return total_diff / (size_ + other.size_);
89*a03ca8b9SKrzysztof Kosiński }
90*a03ca8b9SKrzysztof Kosiński 
91*a03ca8b9SKrzysztof Kosiński }  // namespace zucchini
92