xref: /aosp_15_r20/external/XNNPACK/bench/qu8-requantization.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 
21 #include <xnnpack/aligned-allocator.h>
22 #include <xnnpack/common.h>
23 #include <xnnpack/requantization-stubs.h>
24 
25 
26 class Requantization : public benchmark::Fixture {
27  public:
Requantization()28   inline Requantization()
29   {
30     cpuinfo_initialize();
31     const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
32     const size_t l1d_reserve = 1024;
33     n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(uint8_t));
34     n_ = n_ / 16 * 16;
35   }
36 
SetUp(benchmark::State & state)37   virtual void SetUp(benchmark::State& state) override
38   {
39     std::random_device random_device;
40     auto rng = std::mt19937(random_device());
41     auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
42 
43     input_.resize(n());
44     std::generate(input_.begin(), input_.end(), std::ref(i32rng));
45     output_.resize(n());
46     std::fill(output_.begin(), output_.end(), 0xA5);
47 
48     const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49     if (cpu_frequency != 0) {
50       state.counters["cpufreq"] = cpu_frequency;
51     }
52   }
53 
TearDown(benchmark::State & state)54   virtual void TearDown(benchmark::State& state) override
55   {
56     state.SetItemsProcessed(uint64_t(state.iterations()) * n());
57     state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(uint8_t)));
58     input_.clear();
59     output_.clear();
60   }
61 
input() const62   inline const int32_t* input() const
63   {
64     return input_.data();
65   }
66 
output()67   inline uint8_t* output()
68   {
69     return output_.data();
70   }
71 
n() const72   inline size_t n() const
73   {
74     return n_;
75   }
76 
77  protected:
78   std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
79   std::vector<uint8_t> output_;
80   size_t n_;
81 };
82 
83 
84 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Calls xnn_qu8_requantize_fp32__neon once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__neon(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
91 
// Calls xnn_qu8_requantize_gemmlowp__neon once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__neon(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
98 
// Calls xnn_qu8_requantize_rndna__neon once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__neon(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
105 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
106 
107 
108 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Calls xnn_qu8_requantize_fp32__sse2 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__sse2(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
115 
// Calls xnn_qu8_requantize_gemmlowp__sse2 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__sse2(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
122 
// Calls xnn_qu8_requantize_gemmlowp__ssse3 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__ssse3(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
129 
// Calls xnn_qu8_requantize_gemmlowp__sse4 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__sse4(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
136 
// Calls xnn_qu8_requantize_rndna__sse2 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__sse2(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
143 
// Calls xnn_qu8_requantize_rndna__ssse3 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__ssse3(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
150 
// Calls xnn_qu8_requantize_rndna__sse4 once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__sse4(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
157 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
158 
159 
160 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Calls xnn_qu8_requantize_fp32__wasmsimd once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__wasmsimd(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
167 
// Calls xnn_qu8_requantize_gemmlowp__wasmsimd once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__wasmsimd(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
174 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
175 
176 
// Calls xnn_qu8_requantize_fp32__scalar_lrintf once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__scalar_lrintf(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
183 
// Calls xnn_qu8_requantize_fp32__scalar_fmagic once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_fp32__scalar_fmagic(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
190 
// Calls xnn_qu8_requantize_gemmlowp__scalar once per timed iteration on the
// fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_gemmlowp__scalar(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
197 
// Calls xnn_qu8_requantize_rndna__scalar_signed64 once per timed iteration on
// the fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_signed64(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
204 
// Calls xnn_qu8_requantize_rndna__scalar_unsigned32 once per timed iteration on
// the fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_unsigned32(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
211 
// Calls xnn_qu8_requantize_rndna__scalar_unsigned64 once per timed iteration on
// the fixture's n() inputs, writing into the fixture's output buffer.
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  // Fixed kernel arguments used for every iteration.
  constexpr float scale = 0x1.0p-12f;
  constexpr uint8_t zero_point = 128;
  constexpr uint8_t qmin = 1;
  constexpr uint8_t qmax = 254;

  for (auto _ : state) {
    xnn_qu8_requantize_rndna__scalar_unsigned64(n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
218 
219 
// Expands BENCHMARK_MAIN() only when XNNPACK_BENCHMARK_NO_MAIN is not defined.
// NOTE(review): presumably the macro lets a build that supplies its own entry
// point suppress this one - confirm against the build files.
220 #ifndef XNNPACK_BENCHMARK_NO_MAIN
221 BENCHMARK_MAIN();
222 #endif
223