1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/vcvt.h>
22
23
f32_qu8_vcvt(benchmark::State & state,xnn_f32_qu8_vcvt_ukernel_function cvt,xnn_init_f32_qu8_cvt_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f32_qu8_vcvt(
25 benchmark::State& state,
26 xnn_f32_qu8_vcvt_ukernel_function cvt,
27 xnn_init_f32_qu8_cvt_params_fn init_params,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33
34 const size_t num_elements = state.range(0);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
39
40 std::vector<float, AlignedAllocator<float, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(float));
41 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> y(num_elements);
42 std::generate(x.begin(), x.end(), std::ref(f32rng));
43 std::fill(y.begin(), y.end(), UINT8_C(0xA5));
44
45 xnn_f32_qu8_cvt_params params;
46 init_params(¶ms,
47 25.0f /* scale */,
48 127 /* output zero point */,
49 std::numeric_limits<uint8_t>::min() + 1 /* output min */,
50 std::numeric_limits<uint8_t>::max() - 1 /* output max */);
51 for (auto _ : state) {
52 cvt(num_elements * sizeof(uint8_t), x.data(), y.data(), ¶ms);
53 }
54
55 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56 if (cpu_frequency != 0) {
57 state.counters["cpufreq"] = cpu_frequency;
58 }
59
60 const size_t elements_per_iteration = num_elements;
61 state.counters["elements"] =
62 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
63
64 const size_t bytes_per_iteration = num_elements * (sizeof(uint8_t) + sizeof(float));
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67 }
68
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // neonv8 kernel variants (x8, x16, x24, x32), each gated by CheckNEONV8.
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neonv8_x8,
      xnn_f32_qu8_vcvt_ukernel__neonv8_x8, xnn_init_f32_qu8_cvt_neonv8_params,
      benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neonv8_x16,
      xnn_f32_qu8_vcvt_ukernel__neonv8_x16, xnn_init_f32_qu8_cvt_neonv8_params,
      benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neonv8_x24,
      xnn_f32_qu8_vcvt_ukernel__neonv8_x24, xnn_init_f32_qu8_cvt_neonv8_params,
      benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neonv8_x32,
      xnn_f32_qu8_vcvt_ukernel__neonv8_x32, xnn_init_f32_qu8_cvt_neonv8_params,
      benchmark::utils::CheckNEONV8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();

  // neon kernel variants (x8, x16, x24, x32), each gated by CheckNEON.
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neon_x8,
      xnn_f32_qu8_vcvt_ukernel__neon_x8, xnn_init_f32_qu8_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neon_x16,
      xnn_f32_qu8_vcvt_ukernel__neon_x16, xnn_init_f32_qu8_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neon_x24,
      xnn_f32_qu8_vcvt_ukernel__neon_x24, xnn_init_f32_qu8_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, neon_x32,
      xnn_f32_qu8_vcvt_ukernel__neon_x32, xnn_init_f32_qu8_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
120
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // avx512skx kernel variants, each gated by CheckAVX512SKX.
  // The output element type is uint8_t, matching every other QU8 registration
  // in this file (int8_t has the same size but names the wrong output type).
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx512skx_x32,
                    xnn_f32_qu8_vcvt_ukernel__avx512skx_x32,
                    xnn_init_f32_qu8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx512skx_x64,
                    xnn_f32_qu8_vcvt_ukernel__avx512skx_x64,
                    xnn_init_f32_qu8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx512skx_x96,
                    xnn_f32_qu8_vcvt_ukernel__avx512skx_x96,
                    xnn_init_f32_qu8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx512skx_x128,
                    xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
                    xnn_init_f32_qu8_cvt_avx512_params,
                    benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();

  // avx2 kernel variants, each gated by CheckAVX2.
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx2_x16,
                    xnn_f32_qu8_vcvt_ukernel__avx2_x16,
                    xnn_init_f32_qu8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx2_x32,
                    xnn_f32_qu8_vcvt_ukernel__avx2_x32,
                    xnn_init_f32_qu8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx2_x48,
                    xnn_f32_qu8_vcvt_ukernel__avx2_x48,
                    xnn_init_f32_qu8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx2_x64,
                    xnn_f32_qu8_vcvt_ukernel__avx2_x64,
                    xnn_init_f32_qu8_cvt_avx2_params,
                    benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();

  // avx kernel variants, each gated by CheckAVX.
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x8,
                    xnn_f32_qu8_vcvt_ukernel__avx_x8,
                    xnn_init_f32_qu8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x16,
                    xnn_f32_qu8_vcvt_ukernel__avx_x16,
                    xnn_init_f32_qu8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x24,
                    xnn_f32_qu8_vcvt_ukernel__avx_x24,
                    xnn_init_f32_qu8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, avx_x32,
                    xnn_f32_qu8_vcvt_ukernel__avx_x32,
                    xnn_init_f32_qu8_cvt_avx_params,
                    benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();

  // sse2 kernel variants; no ISA check is passed for these.
  BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x8,
                    xnn_f32_qu8_vcvt_ukernel__sse2_x8,
                    xnn_init_f32_qu8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x16,
                    xnn_f32_qu8_vcvt_ukernel__sse2_x16,
                    xnn_init_f32_qu8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x24,
                    xnn_f32_qu8_vcvt_ukernel__sse2_x24,
                    xnn_init_f32_qu8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x32,
                    xnn_f32_qu8_vcvt_ukernel__sse2_x32,
                    xnn_init_f32_qu8_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
218
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // wasmsimd_cvt kernel variants (x8, x16, x24, x32); no ISA check is passed.
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_cvt_x8,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x8, xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_cvt_x16,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x16, xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_cvt_x24,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x24, xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_cvt_x32,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x32, xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();

  // wasmsimd_magic kernel variants (x8, x16, x24, x32); no ISA check is passed.
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_magic_x8,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x8, xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_magic_x16,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x16, xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_magic_x24,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x24, xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasmsimd_magic_x32,
      xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32, xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
262
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // wasm_fmagic kernel variants (x1..x4). They are initialized with the
  // scalar_fmagic parameter initializer and registered without an ISA check.
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasm_fmagic_x1,
      xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x1, xnn_init_f32_qu8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasm_fmagic_x2,
      xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x2, xnn_init_f32_qu8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasm_fmagic_x3,
      xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x3, xnn_init_f32_qu8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      f32_qu8_vcvt, wasm_fmagic_x4,
      xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4, xnn_init_f32_qu8_cvt_scalar_fmagic_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
285
286 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_fmagic_x1,
287 xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_x1,
288 xnn_init_f32_qu8_cvt_scalar_fmagic_params)
289 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
290 ->UseRealTime();
291 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_fmagic_x2,
292 xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_x2,
293 xnn_init_f32_qu8_cvt_scalar_fmagic_params)
294 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
295 ->UseRealTime();
296 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_fmagic_x3,
297 xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_x3,
298 xnn_init_f32_qu8_cvt_scalar_fmagic_params)
299 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
300 ->UseRealTime();
301 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_fmagic_x4,
302 xnn_f32_qu8_vcvt_ukernel__scalar_fmagic_x4,
303 xnn_init_f32_qu8_cvt_scalar_fmagic_params)
304 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
305 ->UseRealTime();
306
307 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_imagic_x1,
308 xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
309 xnn_init_f32_qu8_cvt_scalar_imagic_params)
310 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
311 ->UseRealTime();
312 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_imagic_x2,
313 xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x2,
314 xnn_init_f32_qu8_cvt_scalar_imagic_params)
315 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
316 ->UseRealTime();
317 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_imagic_x3,
318 xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x3,
319 xnn_init_f32_qu8_cvt_scalar_imagic_params)
320 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
321 ->UseRealTime();
322 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_imagic_x4,
323 xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
324 xnn_init_f32_qu8_cvt_scalar_imagic_params)
325 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
326 ->UseRealTime();
327
328 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_lrintf_x1,
329 xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x1,
330 xnn_init_f32_qu8_cvt_scalar_lrintf_params)
331 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
332 ->UseRealTime();
333 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_lrintf_x2,
334 xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x2,
335 xnn_init_f32_qu8_cvt_scalar_lrintf_params)
336 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
337 ->UseRealTime();
338 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_lrintf_x3,
339 xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x3,
340 xnn_init_f32_qu8_cvt_scalar_lrintf_params)
341 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
342 ->UseRealTime();
343 BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_lrintf_x4,
344 xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
345 xnn_init_f32_qu8_cvt_scalar_lrintf_params)
346 ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
347 ->UseRealTime();
348
349 #ifndef XNNPACK_BENCHMARK_NO_MAIN
350 BENCHMARK_MAIN();
351 #endif
352