// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/vcvt.h>
22
23
f32_qs8_vcvt(benchmark::State & state,xnn_f32_qs8_vcvt_ukernel_function cvt,xnn_init_f32_qs8_cvt_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f32_qs8_vcvt(
25 benchmark::State& state,
26 xnn_f32_qs8_vcvt_ukernel_function cvt,
27 xnn_init_f32_qs8_cvt_params_fn init_params,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33
34 const size_t num_elements = state.range(0);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
39
40 std::vector<float, AlignedAllocator<float, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(float));
41 std::vector<int8_t, AlignedAllocator<int8_t, 64>> y(num_elements);
42 std::generate(x.begin(), x.end(), std::ref(f32rng));
43 std::fill(y.begin(), y.end(), INT8_C(0xA5));
44
45 xnn_f32_qs8_cvt_params params;
46 init_params(¶ms,
47 25.0f /* scale */,
48 1 /* output zero point */,
49 std::numeric_limits<int8_t>::min() + 1 /* output min */,
50 std::numeric_limits<int8_t>::max() - 1 /* output max */);
51 for (auto _ : state) {
52 cvt(num_elements * sizeof(int8_t), x.data(), y.data(), ¶ms);
53 }
54
55 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56 if (cpu_frequency != 0) {
57 state.counters["cpufreq"] = cpu_frequency;
58 }
59
60 const size_t elements_per_iteration = num_elements;
61 state.counters["elements"] =
62 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
63
64 const size_t bytes_per_iteration = num_elements * (sizeof(int8_t) + sizeof(float));
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67 }
68
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Registrations compiled only for ARM/ARM64 builds. Each benchmark takes its
  // arguments from benchmark::utils::UnaryElementwiseParameters<float, int8_t>
  // and is timed with UseRealTime().

  // neonv8 microkernels, registered with the CheckNEONV8 ISA check.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neonv8_x8,
                    xnn_f32_qs8_vcvt_ukernel__neonv8_x8,
                    xnn_init_f32_qs8_cvt_neonv8_params,
                    benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neonv8_x16,
                    xnn_f32_qs8_vcvt_ukernel__neonv8_x16,
                    xnn_init_f32_qs8_cvt_neonv8_params,
                    benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neonv8_x24,
                    xnn_f32_qs8_vcvt_ukernel__neonv8_x24,
                    xnn_init_f32_qs8_cvt_neonv8_params,
                    benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neonv8_x32,
                    xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
                    xnn_init_f32_qs8_cvt_neonv8_params,
                    benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // neon microkernels, registered with the CheckNEON ISA check.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neon_x8,
                    xnn_f32_qs8_vcvt_ukernel__neon_x8,
                    xnn_init_f32_qs8_cvt_neon_params,
                    benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neon_x16,
                    xnn_f32_qs8_vcvt_ukernel__neon_x16,
                    xnn_init_f32_qs8_cvt_neon_params,
                    benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neon_x24,
                    xnn_f32_qs8_vcvt_ukernel__neon_x24,
                    xnn_init_f32_qs8_cvt_neon_params,
                    benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, neon_x32,
                    xnn_f32_qs8_vcvt_ukernel__neon_x32,
                    xnn_init_f32_qs8_cvt_neon_params,
                    benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
120
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registrations compiled only for x86/x86-64 builds. Each benchmark takes
  // its arguments from benchmark::utils::UnaryElementwiseParameters<float, int8_t>
  // and is timed with UseRealTime().

  // avx512skx microkernels, registered with the CheckAVX512SKX ISA check.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx512skx_x32,
                    xnn_f32_qs8_vcvt_ukernel__avx512skx_x32,
                    xnn_init_f32_qs8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx512skx_x64,
                    xnn_f32_qs8_vcvt_ukernel__avx512skx_x64,
                    xnn_init_f32_qs8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx512skx_x96,
                    xnn_f32_qs8_vcvt_ukernel__avx512skx_x96,
                    xnn_init_f32_qs8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx512skx_x128,
                    xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
                    xnn_init_f32_qs8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // avx2 microkernels, registered with the CheckAVX2 ISA check.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx2_x16,
                    xnn_f32_qs8_vcvt_ukernel__avx2_x16,
                    xnn_init_f32_qs8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx2_x32,
                    xnn_f32_qs8_vcvt_ukernel__avx2_x32,
                    xnn_init_f32_qs8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx2_x48,
                    xnn_f32_qs8_vcvt_ukernel__avx2_x48,
                    xnn_init_f32_qs8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx2_x64,
                    xnn_f32_qs8_vcvt_ukernel__avx2_x64,
                    xnn_init_f32_qs8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // avx microkernels, registered with the CheckAVX ISA check.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x8,
                    xnn_f32_qs8_vcvt_ukernel__avx_x8,
                    xnn_init_f32_qs8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x16,
                    xnn_f32_qs8_vcvt_ukernel__avx_x16,
                    xnn_init_f32_qs8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x24,
                    xnn_f32_qs8_vcvt_ukernel__avx_x24,
                    xnn_init_f32_qs8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, avx_x32,
                    xnn_f32_qs8_vcvt_ukernel__avx_x32,
                    xnn_init_f32_qs8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // sse41 microkernels, registered with the CheckSSE41 ISA check; note that
  // they are paired with the xnn_init_f32_qs8_cvt_sse4_params initializer.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x8,
                    xnn_f32_qs8_vcvt_ukernel__sse41_x8,
                    xnn_init_f32_qs8_cvt_sse4_params,
                    benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x16,
                    xnn_f32_qs8_vcvt_ukernel__sse41_x16,
                    xnn_init_f32_qs8_cvt_sse4_params,
                    benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x24,
                    xnn_f32_qs8_vcvt_ukernel__sse41_x24,
                    xnn_init_f32_qs8_cvt_sse4_params,
                    benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse41_x32,
                    xnn_f32_qs8_vcvt_ukernel__sse41_x32,
                    xnn_init_f32_qs8_cvt_sse4_params,
                    benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // sse2 microkernels, registered without an ISA check (isa_check stays at
  // its nullptr default).
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse2_x8,
                    xnn_f32_qs8_vcvt_ukernel__sse2_x8,
                    xnn_init_f32_qs8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse2_x16,
                    xnn_f32_qs8_vcvt_ukernel__sse2_x16,
                    xnn_init_f32_qs8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse2_x24,
                    xnn_f32_qs8_vcvt_ukernel__sse2_x24,
                    xnn_init_f32_qs8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, sse2_x32,
                    xnn_f32_qs8_vcvt_ukernel__sse2_x32,
                    xnn_init_f32_qs8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
243
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // Registrations compiled only for WAsm SIMD / WAsm Relaxed SIMD builds, all
  // without an ISA check. Each benchmark takes its arguments from
  // benchmark::utils::UnaryElementwiseParameters<float, int8_t> and is timed
  // with UseRealTime().

  // wasmsimd_cvt microkernels.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_cvt_x8,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x8,
                    xnn_init_f32_qs8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_cvt_x16,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x16,
                    xnn_init_f32_qs8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_cvt_x24,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x24,
                    xnn_init_f32_qs8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_cvt_x32,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_cvt_x32,
                    xnn_init_f32_qs8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();

  // wasmsimd_magic microkernels.
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_magic_x8,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x8,
                    xnn_init_f32_qs8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_magic_x16,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x16,
                    xnn_init_f32_qs8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_magic_x24,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x24,
                    xnn_init_f32_qs8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasmsimd_magic_x32,
                    xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
                    xnn_init_f32_qs8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
287
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // wasm_fmagic microkernels, compiled for every WAsm flavour and registered
  // without an ISA check. They are paired with the
  // xnn_init_f32_qs8_cvt_scalar_fmagic_params initializer. Each benchmark
  // takes its arguments from
  // benchmark::utils::UnaryElementwiseParameters<float, int8_t> and is timed
  // with UseRealTime().
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasm_fmagic_x1,
                    xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_x1,
                    xnn_init_f32_qs8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasm_fmagic_x2,
                    xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_x2,
                    xnn_init_f32_qs8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasm_fmagic_x3,
                    xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_x3,
                    xnn_init_f32_qs8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qs8_vcvt, wasm_fmagic_x4,
                    xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_x4,
                    xnn_init_f32_qs8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
310
311 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_fmagic_x1,
312 xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_x1,
313 xnn_init_f32_qs8_cvt_scalar_fmagic_params)
314 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
315 ->UseRealTime();
316 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_fmagic_x2,
317 xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_x2,
318 xnn_init_f32_qs8_cvt_scalar_fmagic_params)
319 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
320 ->UseRealTime();
321 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_fmagic_x3,
322 xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_x3,
323 xnn_init_f32_qs8_cvt_scalar_fmagic_params)
324 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
325 ->UseRealTime();
326 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_fmagic_x4,
327 xnn_f32_qs8_vcvt_ukernel__scalar_fmagic_x4,
328 xnn_init_f32_qs8_cvt_scalar_fmagic_params)
329 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
330 ->UseRealTime();
331
332 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_imagic_x1,
333 xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x1,
334 xnn_init_f32_qs8_cvt_scalar_imagic_params)
335 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
336 ->UseRealTime();
337 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_imagic_x2,
338 xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x2,
339 xnn_init_f32_qs8_cvt_scalar_imagic_params)
340 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
341 ->UseRealTime();
342 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_imagic_x3,
343 xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x3,
344 xnn_init_f32_qs8_cvt_scalar_imagic_params)
345 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
346 ->UseRealTime();
347 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_imagic_x4,
348 xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
349 xnn_init_f32_qs8_cvt_scalar_imagic_params)
350 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
351 ->UseRealTime();
352
353 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_lrintf_x1,
354 xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x1,
355 xnn_init_f32_qs8_cvt_scalar_lrintf_params)
356 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
357 ->UseRealTime();
358 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_lrintf_x2,
359 xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x2,
360 xnn_init_f32_qs8_cvt_scalar_lrintf_params)
361 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
362 ->UseRealTime();
363 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_lrintf_x3,
364 xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x3,
365 xnn_init_f32_qs8_cvt_scalar_lrintf_params)
366 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
367 ->UseRealTime();
368 BENCHMARK_CAPTURE(f32_qs8_vcvt, scalar_lrintf_x4,
369 xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
370 xnn_init_f32_qs8_cvt_scalar_lrintf_params)
371 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
372 ->UseRealTime();
373
374 #ifndef XNNPACK_BENCHMARK_NO_MAIN
375 BENCHMARK_MAIN();
376 #endif
377