1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #include "bench/utils.h"
20
21 #include <xnnpack/aligned-allocator.h>
22 #include <xnnpack/common.h>
23 #include <xnnpack/requantization-stubs.h>
24
25
26 class Requantization : public benchmark::Fixture {
27 public:
Requantization()28 inline Requantization()
29 {
30 cpuinfo_initialize();
31 const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
32 const size_t l1d_reserve = 1024;
33 n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(int8_t));
34 n_ = n_ / 16 * 16;
35 }
36
SetUp(benchmark::State & state)37 virtual void SetUp(benchmark::State& state) override
38 {
39 std::random_device random_device;
40 auto rng = std::mt19937(random_device());
41 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(), std::ref(rng));
42
43 input_.resize(n());
44 std::generate(input_.begin(), input_.end(), std::ref(i32rng));
45 output_.resize(n());
46 std::fill(output_.begin(), output_.end(), 0xA5);
47
48 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49 if (cpu_frequency != 0) {
50 state.counters["cpufreq"] = cpu_frequency;
51 }
52 }
TearDown(benchmark::State & state)53 virtual void TearDown(benchmark::State& state) override
54 {
55 state.SetItemsProcessed(uint64_t(state.iterations()) * n());
56 state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(int8_t)));
57 input_.clear();
58 output_.clear();
59 }
60
input() const61 inline const int32_t* input() const
62 {
63 return input_.data();
64 }
65
output()66 inline int8_t* output()
67 {
68 return output_.data();
69 }
70
n() const71 inline size_t n() const
72 {
73 return n_;
74 }
75
76 protected:
77 std::vector<int32_t, AlignedAllocator<int32_t, 64>> input_;
78 std::vector<int8_t> output_;
79 size_t n_;
80 };
81
82
83 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Times xnn_qs8_requantize_fp32__neon over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__neon(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
90
// Times xnn_qs8_requantize_gemmlowp__neon over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__neon(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
97
// Times xnn_qs8_requantize_rndna__neon over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__neon)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__neon(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
104
// Times xnn_qs8_requantize_rndnu__neon_mull over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndnu__neon_mull)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_mull(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
111
// Times xnn_qs8_requantize_rndnu__neon_qdmulh over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndnu__neon_qdmulh)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__neon_qdmulh(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
118 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
119
120
121 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Times xnn_qs8_requantize_fp32__sse2 over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse2(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
128
// Times xnn_qs8_requantize_fp32__sse4 over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__sse4)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__sse4(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
135
// Times xnn_qs8_requantize_gemmlowp__sse2 over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse2(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
142
// Times xnn_qs8_requantize_gemmlowp__ssse3 over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__ssse3(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
149
// Times xnn_qs8_requantize_gemmlowp__sse4 over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__sse4(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
156
// Times xnn_qs8_requantize_rndna__sse2 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__sse2)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse2(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
163
// Times xnn_qs8_requantize_rndna__ssse3 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__ssse3)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__ssse3(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
170
// Times xnn_qs8_requantize_rndna__sse4 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__sse4)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__sse4(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
177
// Times xnn_qs8_requantize_rndnu__sse4_sra over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndnu__sse4_sra)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_sra(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
184
// Times xnn_qs8_requantize_rndnu__sse4_srl over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndnu__sse4_srl)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__sse4_srl(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
191 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
192
193
194 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Times xnn_qs8_requantize_fp32__wasmsimd over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__wasmsimd)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__wasmsimd(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
201
// Times xnn_qs8_requantize_gemmlowp__wasmsimd over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__wasmsimd)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__wasmsimd(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
208 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
209
210
// Times xnn_qs8_requantize_fp32__scalar_lrintf over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_lrintf(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
217
// Times xnn_qs8_requantize_fp32__scalar_fmagic over the fixture's n() input elements.
BENCHMARK_F(Requantization, fp32__scalar_fmagic)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_fp32__scalar_fmagic(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
224
// Times xnn_qs8_requantize_gemmlowp__scalar over the fixture's n() input elements.
BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_gemmlowp__scalar(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
231
// Times xnn_qs8_requantize_rndna__scalar_signed64 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__scalar_signed64)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_signed64(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
238
// Times xnn_qs8_requantize_rndna__scalar_unsigned32 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned32)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned32(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
245
// Times xnn_qs8_requantize_rndna__scalar_unsigned64 over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndna__scalar_unsigned64)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndna__scalar_unsigned64(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
252
// Times xnn_qs8_requantize_rndnu__scalar over the fixture's n() input elements.
BENCHMARK_F(Requantization, rndnu__scalar)(benchmark::State& state) {
  const float scale = 0x1.0p-12f;
  const int8_t zero_point = -1;
  const int8_t qmin = -127;
  const int8_t qmax = 126;
  for (auto _ : state) {
    xnn_qs8_requantize_rndnu__scalar(
        n(), input(), scale, zero_point, qmin, qmax, output());
  }
}
259
260
261 #ifndef XNNPACK_BENCHMARK_NO_MAIN
262 BENCHMARK_MAIN();
263 #endif
264