xref: /aosp_15_r20/external/XNNPACK/bench/qs8-requantization.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20 
21 #include <xnnpack/aligned-allocator.h>
22 #include <xnnpack/common.h>
23 #include <xnnpack/requantization-stubs.h>
24 
25 
26 class Requantization : public benchmark::Fixture {
27  public:
Requantization()28   inline Requantization()
29   {
30     cpuinfo_initialize();
31     const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
32     const size_t l1d_reserve = 1024;
33     n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
34     n_ = n_ / 16 * 16;
35   }
36 
SetUp(benchmark::State & state)37   virtual void SetUp(benchmark::State& state) override
38   {
39     std::random_device random_device;
40     auto rng = std::mt19937(random_device());
41     auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
42 
43     input_.resize(n());
44     std::generate(input_.begin(), input_.end(), std::ref(i32rng));
45     output_.resize(n());
46     std::fill(output_.begin(), output_.end(), 0xA5);
47 
48     const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49     if (cpu_frequency != 0) {
50       state.counters["cpufreq"] = cpu_frequency;
51     }
52   }
TearDown(benchmark::State & state)53   virtual void TearDown(benchmark::State& state) override
54   {
55     state.SetItemsProcessed(uint64_t(state.iterations()) * n());
56     state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
57     input_.clear();
58     output_.clear();
59   }
60 
input() const61   inline const int32_t* input() const
62   {
63     return input_.data();
64   }
65 
output()66   inline int8_t* output()
67   {
68     return output_.data();
69   }
70 
n() const71   inline size_t n() const
72   {
73     return n_;
74   }
75 
76  protected:
77   std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
78   std::vector<int8_t> output_;
79   size_t n_;
80 };
81 
82 
83 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Benchmarks xnn_qs8_requantize_fp32__neon on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report NEON
// support, rather than executing NEON instructions on an incapable CPU.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
90 
// Benchmarks xnn_qs8_requantize_gemmlowp__neon on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report NEON
// support, rather than executing NEON instructions on an incapable CPU.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
97 
// Benchmarks xnn_qs8_requantize_rndna__neon on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report NEON
// support, rather than executing NEON instructions on an incapable CPU.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__neon(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
104 
// Benchmarks xnn_qs8_requantize_rndnu__neon_mull on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report NEON
// support, rather than executing NEON instructions on an incapable CPU.
BENCHMARK_F(Requantization, rndnu__neon_mull)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_mull(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
111 
// Benchmarks xnn_qs8_requantize_rndnu__neon_qdmulh on the fixture's buffers
// with scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report NEON
// support, rather than executing NEON instructions on an incapable CPU.
BENCHMARK_F(Requantization, rndnu__neon_qdmulh)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
    state.SkipWithError("no NEON extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_qdmulh(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
118 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
119 
120 
121 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qs8_requantize_fp32__sse2 over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse2(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
128 
// Benchmarks xnn_qs8_requantize_fp32__sse4 on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSE4.1
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
135 
// Times xnn_qs8_requantize_gemmlowp__sse2 over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse2(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
142 
// Benchmarks xnn_qs8_requantize_gemmlowp__ssse3 on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSSE3
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
149 
// Benchmarks xnn_qs8_requantize_gemmlowp__sse4 on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSE4.1
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
156 
// Times xnn_qs8_requantize_rndna__sse2 over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse2(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
163 
// Benchmarks xnn_qs8_requantize_rndna__ssse3 on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSSE3
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
    state.SkipWithError("no SSSE3 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__ssse3(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
170 
// Benchmarks xnn_qs8_requantize_rndna__sse4 on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSE4.1
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse4(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
177 
// Benchmarks xnn_qs8_requantize_rndnu__sse4_sra on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSE4.1
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, rndnu__sse4_sra)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_sra(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
184 
// Benchmarks xnn_qs8_requantize_rndnu__sse4_srl on the fixture's buffers with
// scale 2^-12, zero point -1 and an output range of [-127, 126].
// The benchmark is skipped with an error when cpuinfo does not report SSE4.1
// support, rather than faulting on an illegal instruction.
BENCHMARK_F(Requantization, rndnu__sse4_srl)(benchmark::State& state) {
  // cpuinfo must be initialized before its feature queries are meaningful;
  // calling cpuinfo_initialize() again is harmless.
  if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
    state.SkipWithError("no SSE4.1 extension");
    return;
  }
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_srl(
        n(), input(), 0x1.0p-12f /* scale */, -1 /* zero point */, -127 /* qmin */, 126 /* qmax */, output());
  }
}
191 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
192 
193 
194 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Times xnn_qs8_requantize_fp32__wasmsimd over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__wasmsimd(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
201 
// Times xnn_qs8_requantize_gemmlowp__wasmsimd over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__wasmsimd(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
208 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
209 
210 
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
217 
// Times xnn_qs8_requantize_fp32__scalar_fmagic over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_fmagic(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
224 
// Times xnn_qs8_requantize_gemmlowp__scalar over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__scalar(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
231 
// Times xnn_qs8_requantize_rndna__scalar_signed64 over the fixture's n()
// elements. Requantization parameters: scale 2^-12, zero point -1, clamp to
// [-127, 126].
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_signed64(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
238 
// Times xnn_qs8_requantize_rndna__scalar_unsigned32 over the fixture's n()
// elements. Requantization parameters: scale 2^-12, zero point -1, clamp to
// [-127, 126].
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned32(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
245 
// Times xnn_qs8_requantize_rndna__scalar_unsigned64 over the fixture's n()
// elements. Requantization parameters: scale 2^-12, zero point -1, clamp to
// [-127, 126].
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned64(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
252 
// Times xnn_qs8_requantize_rndnu__scalar over the fixture's n() elements.
// Requantization parameters: scale 2^-12, zero point -1, clamp to [-127, 126].
BENCHMARK_F(Requantization, rndnu__scalar)(benchmark::State& state) {
  const size_t element_count = n();
  const int32_t* const src = input();
  int8_t* const dst = output();
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__scalar(
        element_count, src,
        0x1.0p-12f /* scale */,
        -1 /* zero point */,
        -127 /* qmin */,
        126 /* qmax */,
        dst);
  }
}
259 
260 
261 #ifndef XNNPACK_BENCHMARK_NO_MAIN
262 BENCHMARK_MAIN();
263 #endif
264