// xref: /aosp_15_r20/external/XNNPACK/bench/f16-vsigmoid.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include <fp16/fp16.h>
14 #include "bench/utils.h"
15 
16 #include <xnnpack.h>
17 #include <xnnpack/aligned-allocator.h>
18 #include <xnnpack/common.h>
19 #include <xnnpack/microfnptr.h>
20 #include <xnnpack/microparams-init.h>
21 #include <xnnpack/vunary.h>
22 
23 
f16_vsigmoid(benchmark::State & state,xnn_f16_vsigmoid_ukernel_function sigmoid,xnn_init_f16_sigmoid_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f16_vsigmoid(
25   benchmark::State& state,
26   xnn_f16_vsigmoid_ukernel_function sigmoid,
27   xnn_init_f16_sigmoid_params_fn init_params,
28   benchmark::utils::IsaCheckFunction isa_check = nullptr)
29 {
30   if (isa_check && !isa_check(state)) {
31     return;
32   }
33 
34   const size_t num_elements = state.range(0);
35 
36   std::random_device random_device;
37   auto rng = std::mt19937(random_device());
38   auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
39   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
40 
41   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> x(num_elements);
42   std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> y(num_elements);
43   std::generate(x.begin(), x.end(), std::ref(f16rng));
44   std::fill(y.begin(), y.end(), UINT16_C(0x7E00) /* NaN */);
45 
46   xnn_f16_sigmoid_params params;
47   init_params(&params);
48   for (auto _ : state) {
49     sigmoid(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
50   }
51 
52   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
53   if (cpu_frequency != 0) {
54     state.counters["cpufreq"] = cpu_frequency;
55   }
56 
57   const size_t elements_per_iteration = num_elements;
58   state.counters["elements"] =
59     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
60 
61   const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint16_t);
62   state.counters["bytes"] =
63     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
64 }
65 
#if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64
// Registers f16_vsigmoid for one NEON FP16-arithmetic kernel variant. The
// benchmark is named "f16_vsigmoid/<variant>" and runs the kernel
// xnn_f16_vsigmoid_ukernel__<variant>. CheckNEONFP16ARITH is passed as the
// ISA check, arguments come from UnaryElementwiseParameters, and real
// (wall-clock) time is measured.
#define BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(variant)                          \
  BENCHMARK_CAPTURE(f16_vsigmoid, variant,                                     \
                    xnn_f16_vsigmoid_ukernel__##variant,                       \
                    xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params,          \
                    benchmark::utils::CheckNEONFP16ARITH)                      \
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)  \
    ->UseRealTime()

  // Division-based variants.
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x8);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x16);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x24);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x32);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x40);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x48);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x56);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV(neonfp16arith_rr2_p2_div_x64);

#undef BENCH_F16_VSIGMOID_NEONFP16ARITH_DIV
#endif  // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64
116 
#if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
// Registers f16_vsigmoid for one NEON FP16-arithmetic kernel variant. The
// benchmark is named "f16_vsigmoid/<variant>" and runs the kernel
// xnn_f16_vsigmoid_ukernel__<variant>. CheckNEONFP16ARITH is passed as the
// ISA check, arguments come from UnaryElementwiseParameters, and real
// (wall-clock) time is measured.
#define BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(variant)                          \
  BENCHMARK_CAPTURE(f16_vsigmoid, variant,                                     \
                    xnn_f16_vsigmoid_ukernel__##variant,                       \
                    xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params,          \
                    benchmark::utils::CheckNEONFP16ARITH)                      \
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)  \
    ->UseRealTime()

  // nr1fma variants.
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x8);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x16);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x24);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x32);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x40);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x48);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x56);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1fma_x64);

  // nr1recps variants.
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x8);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x16);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x24);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x32);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x40);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x48);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x56);
  BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1(neonfp16arith_rr2_p2_nr1recps_x64);

#undef BENCH_F16_VSIGMOID_NEONFP16ARITH_NR1
#endif  // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
216 
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers f16_vsigmoid for one AVX2 kernel variant. The benchmark is named
// "f16_vsigmoid/<variant>" and runs the kernel
// xnn_f16_vsigmoid_ukernel__<variant>. CheckAVX2 is passed as the ISA check,
// arguments come from UnaryElementwiseParameters, and real (wall-clock) time
// is measured.
#define BENCH_F16_VSIGMOID_AVX2(variant)                                       \
  BENCHMARK_CAPTURE(f16_vsigmoid, variant,                                     \
                    xnn_f16_vsigmoid_ukernel__##variant,                       \
                    xnn_init_f16_sigmoid_avx2_rr1_p2_params,                   \
                    benchmark::utils::CheckAVX2)                               \
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)  \
    ->UseRealTime()

  // div variants.
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x8);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x16);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x24);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x32);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x40);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x48);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x56);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_div_x64);

  // rcp variants.
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x8);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x16);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x24);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x32);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x40);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x48);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x56);
  BENCH_F16_VSIGMOID_AVX2(avx2_rr1_p2_rcp_x64);

#undef BENCH_F16_VSIGMOID_AVX2
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
316 
317 #ifndef XNNPACK_BENCHMARK_NO_MAIN
318 BENCHMARK_MAIN();
319 #endif
320