xref: /aosp_15_r20/external/XNNPACK/bench/f32-vsqrt.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/microparams-init.h>
20 #include <xnnpack/vunary.h>
21 
22 
f32_vsqrt(benchmark::State & state,xnn_f32_vsqrt_ukernel_function vsqrt,xnn_init_f32_sqrt_params_fn init_params=nullptr,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void f32_vsqrt(
24   benchmark::State& state,
25   xnn_f32_vsqrt_ukernel_function vsqrt,
26   xnn_init_f32_sqrt_params_fn init_params = nullptr,
27   benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29   if (isa_check && !isa_check(state)) {
30     return;
31   }
32 
33   const size_t num_elements = state.range(0);
34   std::vector<float, AlignedAllocator<float, 64>> input(num_elements);
35   std::vector<float, AlignedAllocator<float, 64>> output(num_elements);
36 
37   std::random_device random_device;
38   auto rng = std::mt19937(random_device());
39   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 10.0f), std::ref(rng));
40   std::generate(input.begin(), input.end(), std::ref(f32rng));
41   std::fill(output.begin(), output.end(), std::nanf(""));
42 
43   union xnn_f32_sqrt_params params;
44   if (init_params != nullptr) {
45     init_params(&params);
46   }
47   for (auto _ : state) {
48     vsqrt(num_elements * sizeof(float), input.data(), output.data(), &params);
49   }
50 
51   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
52   if (cpu_frequency != 0) {
53     state.counters["cpufreq"] = cpu_frequency;
54   }
55 
56   const size_t elements_per_iteration = num_elements;
57   state.counters["elements"] =
58     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
59 
60   const size_t bytes_per_iteration = 2 * num_elements * sizeof(float);
61   state.counters["bytes"] =
62     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
63 }
64 
#if XNN_ARCH_ARM64
  // NEON sqrt kernels: registered for ARM64 builds only, with no parameter
  // initializer and no runtime ISA check.
  BENCHMARK_CAPTURE(f32_vsqrt, neon_sqrt_x4, xnn_f32_vsqrt_ukernel__neon_sqrt_x4)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neon_sqrt_x8, xnn_f32_vsqrt_ukernel__neon_sqrt_x8)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
#endif  // XNN_ARCH_ARM64
75 
// NEON-FMA kernels are registered for both 32-bit and 64-bit ARM builds.
// Every entry passes benchmark::utils::CheckNEONFMA as its ISA check so the
// benchmark can be skipped at run time when the check fails (presumably only
// possible on 32-bit ARM CPUs - confirm against bench/utils).
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Variant "nr1rsqrts1fma1adj", unrolled from 4 to 40 elements per loop.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x4,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x12,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x12,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x16,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x20,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x20,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x28,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x36,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();

  // Variant "nr2fma1adj", unrolled from 4 to 40 elements per loop.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x4,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x12,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x20,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x28,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x36,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
199 
// x86 / x86-64 kernels, grouped by instruction set. Each AVX512F, FMA3 and AVX
// entry supplies a parameter initializer and a runtime ISA check; the SSE
// entries supply neither.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512F, variant "nr1fma1adj", 16 to 128 elements per loop.
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x48,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x64,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x80,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x96,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x112,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx512f_nr1fma1adj_x128,
                    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128,
                    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();

  // FMA3, variant "nr1fma1adj", 8 to 64 elements per loop.
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x48,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x56,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, fma3_nr1fma1adj_x64,
                    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64,
                    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();

  // AVX sqrt kernels.
  BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_x8,
                    xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
                    xnn_init_f32_sqrt_avx_params, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_x16,
                    xnn_f32_vsqrt_ukernel__avx_sqrt_x16,
                    xnn_init_f32_sqrt_avx_params, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();

  // SSE sqrt kernels: no parameter initializer and no runtime ISA check.
  BENCHMARK_CAPTURE(f32_vsqrt, sse_sqrt_x4, xnn_f32_vsqrt_ukernel__sse_sqrt_x4)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, sse_sqrt_x8, xnn_f32_vsqrt_ukernel__sse_sqrt_x8)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
321 
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD sqrt kernels: registered for both the SIMD and Relaxed SIMD
  // build flavours, with no parameter initializer and no runtime ISA check.
  BENCHMARK_CAPTURE(f32_vsqrt, wasmsimd_sqrt_x4, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x4)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, wasmsimd_sqrt_x8, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
332 
333 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x1,
334                   xnn_f32_vsqrt_ukernel__scalar_sqrt_x1)
335   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
336   ->UseRealTime();
337 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x2,
338                   xnn_f32_vsqrt_ukernel__scalar_sqrt_x2)
339   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
340   ->UseRealTime();
341 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x4,
342                   xnn_f32_vsqrt_ukernel__scalar_sqrt_x4)
343   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
344   ->UseRealTime();
345 
346 #ifndef XNNPACK_BENCHMARK_NO_MAIN
347 BENCHMARK_MAIN();
348 #endif
349