1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/microparams-init.h>
20 #include <xnnpack/vunary.h>
21
22
f32_vsqrt(benchmark::State & state,xnn_f32_vsqrt_ukernel_function vsqrt,xnn_init_f32_sqrt_params_fn init_params=nullptr,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void f32_vsqrt(
24 benchmark::State& state,
25 xnn_f32_vsqrt_ukernel_function vsqrt,
26 xnn_init_f32_sqrt_params_fn init_params = nullptr,
27 benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29 if (isa_check && !isa_check(state)) {
30 return;
31 }
32
33 const size_t num_elements = state.range(0);
34 std::vector<float, AlignedAllocator<float, 64>> input(num_elements);
35 std::vector<float, AlignedAllocator<float, 64>> output(num_elements);
36
37 std::random_device random_device;
38 auto rng = std::mt19937(random_device());
39 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 10.0f), std::ref(rng));
40 std::generate(input.begin(), input.end(), std::ref(f32rng));
41 std::fill(output.begin(), output.end(), std::nanf(""));
42
43 union xnn_f32_sqrt_params params;
44 if (init_params != nullptr) {
45 init_params(¶ms);
46 }
47 for (auto _ : state) {
48 vsqrt(num_elements * sizeof(float), input.data(), output.data(), ¶ms);
49 }
50
51 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
52 if (cpu_frequency != 0) {
53 state.counters["cpufreq"] = cpu_frequency;
54 }
55
56 const size_t elements_per_iteration = num_elements;
57 state.counters["elements"] =
58 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
59
60 const size_t bytes_per_iteration = 2 * num_elements * sizeof(float);
61 state.counters["bytes"] =
62 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
63 }
64
#if XNN_ARCH_ARM64
// neon_sqrt kernels, registered only when XNN_ARCH_ARM64 is set. Neither a
// parameter initializer nor an ISA check is passed, so f32_vsqrt falls back
// to its nullptr defaults for both.
BENCHMARK_CAPTURE(
    f32_vsqrt, neon_sqrt_x4,
    xnn_f32_vsqrt_ukernel__neon_sqrt_x4)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, neon_sqrt_x8,
    xnn_f32_vsqrt_ukernel__neon_sqrt_x8)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
#endif  // XNN_ARCH_ARM64
75
// NEON FMA kernels. The guard covers both 32-bit and 64-bit ARM builds, and
// every entry passes CheckNEONFMA so that f32_vsqrt can return early at run
// time when that check fails. No parameter initializer is supplied.
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Variants named nr1rsqrts1fma1adj, with x4 through x40 suffixes.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x4,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x4,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x8,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x12,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x12,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x16,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x20,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x20,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x24,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x28,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x28,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x32,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x36,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x36,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr1rsqrts1fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x40,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();

  // Variants named nr2fma1adj, with x4 through x40 suffixes.
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x4,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x4,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x8,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x8,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x12,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x12,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x16,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x16,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x20,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x20,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x24,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x24,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x28,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x28,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x32,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x32,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x36,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x36,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_vsqrt, neonfma_nr2fma1adj_x40,
                    xnn_f32_vsqrt_ukernel__neonfma_nr2fma1adj_x40,
                    nullptr /* init params */,
                    benchmark::utils::CheckNEONFMA)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
199
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// AVX512F kernels: parameters come from xnn_init_f32_sqrt_avx512_params and
// each run is gated on CheckAVX512F.
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x16,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x16,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x32,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x32,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x48,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x48,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x64,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x64,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x80,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x80,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x96,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x96,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x112,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x112,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx512f_nr1fma1adj_x128,
    xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_x128,
    xnn_init_f32_sqrt_avx512_params, benchmark::utils::CheckAVX512F)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();

// FMA3 kernels: parameters come from xnn_init_f32_sqrt_fma_params and each
// run is gated on CheckFMA3.
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x8,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x8,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x16,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x16,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x24,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x24,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x32,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x32,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x40,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x40,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x48,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x48,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x56,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x56,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, fma3_nr1fma1adj_x64,
    xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_x64,
    xnn_init_f32_sqrt_fma_params, benchmark::utils::CheckFMA3)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();

// AVX kernels: parameters come from xnn_init_f32_sqrt_avx_params and each
// run is gated on CheckAVX.
BENCHMARK_CAPTURE(
    f32_vsqrt, avx_sqrt_x8,
    xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
    xnn_init_f32_sqrt_avx_params, benchmark::utils::CheckAVX)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, avx_sqrt_x16,
    xnn_f32_vsqrt_ukernel__avx_sqrt_x16,
    xnn_init_f32_sqrt_avx_params, benchmark::utils::CheckAVX)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();

// SSE kernels: registered without a parameter initializer or an ISA check.
BENCHMARK_CAPTURE(
    f32_vsqrt, sse_sqrt_x4,
    xnn_f32_vsqrt_ukernel__sse_sqrt_x4)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, sse_sqrt_x8,
    xnn_f32_vsqrt_ukernel__sse_sqrt_x8)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
321
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// wasmsimd_sqrt kernels, registered for either WAsm SIMD build flavour.
// They are registered without a parameter initializer or an ISA check.
BENCHMARK_CAPTURE(
    f32_vsqrt, wasmsimd_sqrt_x4,
    xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x4)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
BENCHMARK_CAPTURE(
    f32_vsqrt, wasmsimd_sqrt_x8,
    xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
  ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
332
333 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x1,
334 xnn_f32_vsqrt_ukernel__scalar_sqrt_x1)
335 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
336 ->UseRealTime();
337 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x2,
338 xnn_f32_vsqrt_ukernel__scalar_sqrt_x2)
339 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
340 ->UseRealTime();
341 BENCHMARK_CAPTURE(f32_vsqrt, scalar_sqrt_x4,
342 xnn_f32_vsqrt_ukernel__scalar_sqrt_x4)
343 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
344 ->UseRealTime();
345
346 #ifndef XNNPACK_BENCHMARK_NO_MAIN
347 BENCHMARK_MAIN();
348 #endif
349