1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/vcvt.h>
22
23
qu8_f32_vcvt(benchmark::State & state,xnn_qu8_f32_vcvt_ukernel_function cvt,xnn_init_qu8_f32_cvt_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void qu8_f32_vcvt(
25 benchmark::State& state,
26 xnn_qu8_f32_vcvt_ukernel_function cvt,
27 xnn_init_qu8_f32_cvt_params_fn init_params,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33
34 const size_t num_elements = state.range(0);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto u8rng = std::bind(
39 std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
40 std::ref(rng));
41
42 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(uint8_t));
43 std::vector<float, AlignedAllocator<float, 64>> y(num_elements);
44 std::generate(x.begin(), x.end(), std::ref(u8rng));
45 std::fill(y.begin(), y.end(), std::nanf(""));
46
47 xnn_qu8_f32_cvt_params params;
48 init_params(¶ms,
49 0.25f /* scale */,
50 127 /* output zero point */);
51 for (auto _ : state) {
52 cvt(num_elements * sizeof(uint8_t), x.data(), y.data(), ¶ms);
53 }
54
55 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
56 if (cpu_frequency != 0) {
57 state.counters["cpufreq"] = cpu_frequency;
58 }
59
60 const size_t elements_per_iteration = num_elements;
61 state.counters["elements"] =
62 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
63
64 const size_t bytes_per_iteration = num_elements * (sizeof(uint8_t) + sizeof(float));
65 state.counters["bytes"] =
66 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
67 }
68
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // NEON kernel variants. Each registration passes CheckNEON as the ISA
  // predicate, so qu8_f32_vcvt returns early when that check fails.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, neon_x8,
      xnn_qu8_f32_vcvt_ukernel__neon_x8,
      xnn_init_qu8_f32_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, neon_x16,
      xnn_qu8_f32_vcvt_ukernel__neon_x16,
      xnn_init_qu8_f32_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, neon_x24,
      xnn_qu8_f32_vcvt_ukernel__neon_x24,
      xnn_init_qu8_f32_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, neon_x32,
      xnn_qu8_f32_vcvt_ukernel__neon_x32,
      xnn_init_qu8_f32_cvt_neon_params,
      benchmark::utils::CheckNEON)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
95
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512SKX kernel variants, gated by CheckAVX512SKX.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx512skx_x16,
      xnn_qu8_f32_vcvt_ukernel__avx512skx_x16,
      xnn_init_qu8_f32_cvt_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx512skx_x32,
      xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
      xnn_init_qu8_f32_cvt_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx512skx_x48,
      xnn_qu8_f32_vcvt_ukernel__avx512skx_x48,
      xnn_init_qu8_f32_cvt_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx512skx_x64,
      xnn_qu8_f32_vcvt_ukernel__avx512skx_x64,
      xnn_init_qu8_f32_cvt_avx512_params,
      benchmark::utils::CheckAVX512SKX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();

  // AVX2 kernel variants, gated by CheckAVX2. They use the same parameter
  // initializer (xnn_init_qu8_f32_cvt_avx_params) as the AVX variants below.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx2_x8,
      xnn_qu8_f32_vcvt_ukernel__avx2_x8,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx2_x16,
      xnn_qu8_f32_vcvt_ukernel__avx2_x16,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx2_x24,
      xnn_qu8_f32_vcvt_ukernel__avx2_x24,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx2_x32,
      xnn_qu8_f32_vcvt_ukernel__avx2_x32,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX2)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();

  // AVX kernel variants, gated by CheckAVX.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx_x8,
      xnn_qu8_f32_vcvt_ukernel__avx_x8,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx_x16,
      xnn_qu8_f32_vcvt_ukernel__avx_x16,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx_x24,
      xnn_qu8_f32_vcvt_ukernel__avx_x24,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, avx_x32,
      xnn_qu8_f32_vcvt_ukernel__avx_x32,
      xnn_init_qu8_f32_cvt_avx_params,
      benchmark::utils::CheckAVX)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();

  // SSE4.1 kernel variants, gated by CheckSSE41.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse41_x8,
      xnn_qu8_f32_vcvt_ukernel__sse41_x8,
      xnn_init_qu8_f32_cvt_sse4_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse41_x16,
      xnn_qu8_f32_vcvt_ukernel__sse41_x16,
      xnn_init_qu8_f32_cvt_sse4_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse41_x24,
      xnn_qu8_f32_vcvt_ukernel__sse41_x24,
      xnn_init_qu8_f32_cvt_sse4_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse41_x32,
      xnn_qu8_f32_vcvt_ukernel__sse41_x32,
      xnn_init_qu8_f32_cvt_sse4_params,
      benchmark::utils::CheckSSE41)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();

  // SSE2 kernel variants. No ISA predicate is passed, so isa_check keeps its
  // nullptr default and these always run on x86 builds.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse2_x8,
      xnn_qu8_f32_vcvt_ukernel__sse2_x8,
      xnn_init_qu8_f32_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse2_x16,
      xnn_qu8_f32_vcvt_ukernel__sse2_x16,
      xnn_init_qu8_f32_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse2_x24,
      xnn_qu8_f32_vcvt_ukernel__sse2_x24,
      xnn_init_qu8_f32_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, sse2_x32,
      xnn_qu8_f32_vcvt_ukernel__sse2_x32,
      xnn_init_qu8_f32_cvt_sse2_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
218
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // WAsm SIMD kernel variants. No ISA predicate is passed, so isa_check keeps
  // its nullptr default and no runtime check is performed.
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, wasmsimd_x8,
      xnn_qu8_f32_vcvt_ukernel__wasmsimd_x8,
      xnn_init_qu8_f32_cvt_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, wasmsimd_x16,
      xnn_qu8_f32_vcvt_ukernel__wasmsimd_x16,
      xnn_init_qu8_f32_cvt_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, wasmsimd_x24,
      xnn_qu8_f32_vcvt_ukernel__wasmsimd_x24,
      xnn_init_qu8_f32_cvt_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
  BENCHMARK_CAPTURE(
      qu8_f32_vcvt, wasmsimd_x32,
      xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
      xnn_init_qu8_f32_cvt_wasmsimd_params)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
241
242 BENCHMARK_CAPTURE(qu8_f32_vcvt, scalar_x1,
243 xnn_qu8_f32_vcvt_ukernel__scalar_x1,
244 xnn_init_qu8_f32_cvt_scalar_params)
245 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
246 ->UseRealTime();
247 BENCHMARK_CAPTURE(qu8_f32_vcvt, scalar_x2,
248 xnn_qu8_f32_vcvt_ukernel__scalar_x2,
249 xnn_init_qu8_f32_cvt_scalar_params)
250 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
251 ->UseRealTime();
252 BENCHMARK_CAPTURE(qu8_f32_vcvt, scalar_x3,
253 xnn_qu8_f32_vcvt_ukernel__scalar_x3,
254 xnn_init_qu8_f32_cvt_scalar_params)
255 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
256 ->UseRealTime();
257 BENCHMARK_CAPTURE(qu8_f32_vcvt, scalar_x4,
258 xnn_qu8_f32_vcvt_ukernel__scalar_x4,
259 xnn_init_qu8_f32_cvt_scalar_params)
260 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
261 ->UseRealTime();
262
263 #ifndef XNNPACK_BENCHMARK_NO_MAIN
264 BENCHMARK_MAIN();
265 #endif
266