1 // Copyright 2022 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/vunary.h>
22
23
f16_vsigmoid(benchmark::State & state,xnn_f16_vsigmoid_ukernel_function sigmoid,xnn_init_f16_sigmoid_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f16_vsigmoid(
25 benchmark::State& state,
26 xnn_f16_vsigmoid_ukernel_function sigmoid,
27 xnn_init_f16_sigmoid_params_fn init_params,
28 benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30 if (isa_check && !isa_check(state)) {
31 return;
32 }
33
34 const size_t num_elements = state.range(0);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
39 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
40
41 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
42 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
43 std::generate(x.begin(), x.end(), std::ref(f16rng));
44 std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
45
46 xnn_f16_sigmoid_params params;
47 init_params(¶ms);
48 for (auto _ : state) {
49 sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms);
50 }
51
52 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
53 if (cpu_frequency != 0) {
54 state.counters["cpufreq"] = cpu_frequency;
55 }
56
57 const size_t elements_per_iteration = num_elements;
58 state.counters["elements"] =
59 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
60
61 const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
62 state.counters["bytes"] =
63 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
64 }
65
#if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64
  // Registers one `rr2_p2_div` kernel under the benchmark name
  // "f16_vsigmoid/<name>". All of these kernels share the same parameter
  // initializer, ISA check, size sweep and real-time measurement.
  #define XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(name, ukernel)               \
    BENCHMARK_CAPTURE(f16_vsigmoid, name, ukernel,                              \
                      xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params,         \
                      benchmark::utils::CheckNEONFP16ARITH)                     \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>) \
      ->UseRealTime()

  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x8, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x8);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x16, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x16);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x24, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x24);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x32, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x32);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x40, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x40);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x48, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x48);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x56, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x56);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(
    neonfp16arith_rr2_p2_div_x64, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_div_x64);

  #undef XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV
#endif  // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64
116
#if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // Registers one `rr2_p2_nr1fma` / `rr2_p2_nr1recps` kernel under the
  // benchmark name "f16_vsigmoid/<name>". All of these kernels share the same
  // parameter initializer, ISA check, size sweep and real-time measurement.
  #define XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(name, ukernel)               \
    BENCHMARK_CAPTURE(f16_vsigmoid, name, ukernel,                              \
                      xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params,         \
                      benchmark::utils::CheckNEONFP16ARITH)                     \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>) \
      ->UseRealTime()

  // nr1fma variants.
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x8, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x8);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x16, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x16);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x24, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x24);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x32, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x32);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x40, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x48, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x48);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x56, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x56);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1fma_x64, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x64);

  // nr1recps variants.
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x8, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x8);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x16, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x16);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x24, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x24);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x32, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x32);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x40, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x40);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x48, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x48);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x56, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x56);
  XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(
    neonfp16arith_rr2_p2_nr1recps_x64, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_x64);

  #undef XNN_BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1
#endif  // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
216
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers one AVX2 `rr1_p2_div` / `rr1_p2_rcp` kernel under the benchmark
  // name "f16_vsigmoid/<name>". All of these kernels share the same parameter
  // initializer, ISA check, size sweep and real-time measurement.
  #define XNN_BENCH_F16_VSIGMOID_AVX2(name, ukernel)                            \
    BENCHMARK_CAPTURE(f16_vsigmoid, name, ukernel,                              \
                      xnn_init_f16_sigmoid_avx2_rr1_p2_params,                  \
                      benchmark::utils::CheckAVX2)                              \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>) \
      ->UseRealTime()

  // div variants.
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x8, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x8);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x16, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x16);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x24, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x24);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x32, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x32);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x40, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x40);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x48, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x48);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x56, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x56);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x64, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_x64);

  // rcp variants.
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x8, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x8);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x16, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x16);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x24, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x24);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x32, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x40, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x40);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x48, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x48);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x56, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x56);
  XNN_BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x64, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x64);

  #undef XNN_BENCH_F16_VSIGMOID_AVX2
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
316
317 #ifndef XNNPACK_BENCHMARK_NO_MAIN
318 BENCHMARK_MAIN();
319 #endif
320