xref: /aosp_15_r20/external/XNNPACK/bench/qs8-vaddc.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/microparams-init.h>
20 #include <xnnpack/vadd.h>
21 
22 
qs8_vaddc(benchmark::State & state,xnn_qs8_vadd_minmax_ukernel_function vaddc,xnn_init_qs8_add_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void qs8_vaddc(
24   benchmark::State& state,
25   xnn_qs8_vadd_minmax_ukernel_function vaddc,
26   xnn_init_qs8_add_minmax_params_fn init_params,
27   benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29   if (isa_check && !isa_check(state)) {
30     return;
31   }
32 
33   const size_t num_elements = state.range(0);
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto i8rng = std::bind(
38     std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
39     std::ref(rng));
40 
41   std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
42   std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
43   std::generate(a.begin(), a.end(), std::ref(i8rng));
44   const int8_t b = i8rng();
45 
46   union xnn_qs8_add_minmax_params params;
47   init_params(&params,
48     1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
49     0.5f /* a-output scale */, 0.75f /* b-output scale */,
50     std::numeric_limits<int8_t>::min() + 1, std::numeric_limits<int8_t>::max() - 1);
51   for (auto _ : state) {
52     vaddc(num_elements * sizeof(int8_t), a.data(), &b, sum.data(), &params);
53   }
54 
55   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56   if (cpu_frequency != 0) {
57     state.counters["cpufreq"] = cpu_frequency;
58   }
59 
60   const size_t num_elements_per_iteration = num_elements;
61   state.counters["num_elements"] =
62     benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
63 
64   const size_t bytes_per_iteration = 2 * num_elements * sizeof(int8_t);
65   state.counters["bytes"] =
66     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67 }
68 
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON ld64 variants (ISA predicate: CheckNEON).
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld64_x8,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld64_x16,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld64_x24,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld64_x32,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // NEON ld128 variants (ISA predicate: CheckNEON).
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld128_x16,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, neon_ld128_x32,
      xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32,
      xnn_init_qs8_add_minmax_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
108 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512SKX mul32/ld128 variants (ISA predicate: CheckAVX512SKX).
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx512skx_mul32_ld128_x16,
      xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
      xnn_init_qs8_add_minmax_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx512skx_mul32_ld128_x32,
      xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x32,
      xnn_init_qs8_add_minmax_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // AVX2 mul32/ld64 variants (ISA predicate: CheckAVX2).
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx2_mul32_ld64_x8,
      xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x8,
      xnn_init_qs8_add_minmax_avx2_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx2_mul32_ld64_x16,
      xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
      xnn_init_qs8_add_minmax_avx2_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx2_mul32_ld64_x24,
      xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x24,
      xnn_init_qs8_add_minmax_avx2_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx2_mul32_ld64_x32,
      xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x32,
      xnn_init_qs8_add_minmax_avx2_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // XOP mul32/ld32 variants (ISA predicate: CheckXOP).
  BENCHMARK_CAPTURE(
      qs8_vaddc, xop_mul32_ld32_x8,
      xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, xop_mul32_ld32_x16,
      xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, xop_mul32_ld32_x24,
      xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, xop_mul32_ld32_x32,
      xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckXOP)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // AVX mul16/ld64 variants (ISA predicate: CheckAVX).
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul16_ld64_x8,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul16_ld64_x16,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul16_ld64_x24,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul16_ld64_x32,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // AVX mul32/ld32 variants (ISA predicate: CheckAVX).
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul32_ld32_x8,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul32_ld32_x16,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul32_ld32_x24,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, avx_mul32_ld32_x32,
      xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // SSE4.1 mul16/ld64 variants (ISA predicate: CheckSSE41).
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul16_ld64_x8,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul16_ld64_x16,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul16_ld64_x24,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul16_ld64_x32,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse4_mul16_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // SSE4.1 mul32/ld32 variants (ISA predicate: CheckSSE41).
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul32_ld32_x8,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul32_ld32_x16,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul32_ld32_x24,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x24,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse41_mul32_ld32_x32,
      xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x32,
      xnn_init_qs8_add_minmax_sse4_mul32_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();

  // SSE2 mul16/ld64 variants (registered without an ISA predicate).
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse2_mul16_ld64_x8,
      xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse2_mul16_ld64_x16,
      xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse2_mul16_ld64_x24,
      xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, sse2_mul16_ld64_x32,
      xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32,
      xnn_init_qs8_add_minmax_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
294 
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD variants (registered without an ISA predicate).
  BENCHMARK_CAPTURE(
      qs8_vaddc, wasmsimd_x8,
      xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, wasmsimd_x16,
      xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, wasmsimd_x24,
      xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qs8_vaddc, wasmsimd_x32,
      xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
      xnn_init_qs8_add_minmax_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
317 
318 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x1,
319                   xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
320                   xnn_init_qs8_add_minmax_scalar_params)
321   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
322   ->UseRealTime();
323 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x2,
324                   xnn_qs8_vaddc_minmax_ukernel__scalar_x2,
325                   xnn_init_qs8_add_minmax_scalar_params)
326   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
327   ->UseRealTime();
328 BENCHMARK_CAPTURE(qs8_vaddc, scalar_x4,
329                   xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
330                   xnn_init_qs8_add_minmax_scalar_params)
331   ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
332   ->UseRealTime();
333 
334 #ifndef XNNPACK_BENCHMARK_NO_MAIN
335 BENCHMARK_MAIN();
336 #endif
337