xref: /aosp_15_r20/external/XNNPACK/bench/f32-velu.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/microparams-init.h>
20 #include <xnnpack/vunary.h>
21 
22 
f32_velu(benchmark::State & state,xnn_f32_velu_ukernel_function elu,xnn_init_f32_elu_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)23 static void f32_velu(
24   benchmark::State& state,
25   xnn_f32_velu_ukernel_function elu,
26   xnn_init_f32_elu_params_fn init_params,
27   benchmark::utils::IsaCheckFunction isa_check = nullptr)
28 {
29   if (isa_check && !isa_check(state)) {
30     return;
31   }
32 
33   const size_t num_elements = state.range(0);
34 
35   std::random_device random_device;
36   auto rng = std::mt19937(random_device());
37   auto f32rng = std::bind(std::uniform_real_distribution<float>(-20.0f, 10.0f), std::ref(rng));
38 
39   std::vector<float, AlignedAllocator<float, 64>> x(num_elements);
40   std::vector<float, AlignedAllocator<float, 64>> y(num_elements);
41   std::generate(x.begin(), x.end(), std::ref(f32rng));
42   std::fill(y.begin(), y.end(), std::nanf(""));
43 
44   union xnn_f32_elu_params params;
45   init_params(&params, 1.0f /* prescale */, 1.0f /* alpha */, 1.0f /* beta */);
46   for (auto _ : state) {
47     elu(num_elements * sizeof(float), x.data(), y.data(), &params);
48   }
49 
50   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
51   if (cpu_frequency != 0) {
52     state.counters["cpufreq"] = cpu_frequency;
53   }
54 
55   const size_t elements_per_iteration = num_elements;
56   state.counters["elements"] =
57     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
58 
59   const size_t bytes_per_iteration = 2 * num_elements * sizeof(float);
60   state.counters["bytes"] =
61     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
62 }
63 
64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x{4,8,12,16,20,24} kernels
// (inside the XNN_ARCH_ARM || XNN_ARCH_ARM64 guard).
// Every entry uses xnn_init_f32_elu_neonfma_rr1_lut16_p3_params to set up
// kernel parameters, passes benchmark::utils::CheckNEONFMA as the ISA check,
// takes its arguments from UnaryElementwiseParameters<float, float>, and is
// reported in real time (UseRealTime).
// The registered label omits the "rr1" part of the kernel name.
65   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x4,
66                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x4,
67                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
68                     benchmark::utils::CheckNEONFMA)
69     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
70     ->UseRealTime();
71   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x8,
72                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x8,
73                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
74                     benchmark::utils::CheckNEONFMA)
75     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
76     ->UseRealTime();
77   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x12,
78                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x12,
79                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
80                     benchmark::utils::CheckNEONFMA)
81     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
82     ->UseRealTime();
83   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x16,
84                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
85                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
86                     benchmark::utils::CheckNEONFMA)
87     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
88     ->UseRealTime();
89   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x20,
90                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x20,
91                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
92                     benchmark::utils::CheckNEONFMA)
93     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
94     ->UseRealTime();
95   BENCHMARK_CAPTURE(f32_velu, neonfma_lut16_p3_x24,
96                     xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x24,
97                     xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
98                     benchmark::utils::CheckNEONFMA)
99     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
100     ->UseRealTime();
101 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__neonfma_rr1_p6_x{4,8,12,16,20,24} kernels
// (inside the XNN_ARCH_ARM || XNN_ARCH_ARM64 guard).
// Every entry uses xnn_init_f32_elu_neonfma_rr1_p6_params, is gated by
// benchmark::utils::CheckNEONFMA, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" part of the kernel name.
102   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x4,
103                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x4,
104                     xnn_init_f32_elu_neonfma_rr1_p6_params,
105                     benchmark::utils::CheckNEONFMA)
106     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
107     ->UseRealTime();
108   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x8,
109                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
110                     xnn_init_f32_elu_neonfma_rr1_p6_params,
111                     benchmark::utils::CheckNEONFMA)
112     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
113     ->UseRealTime();
114   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x12,
115                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x12,
116                     xnn_init_f32_elu_neonfma_rr1_p6_params,
117                     benchmark::utils::CheckNEONFMA)
118     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
119     ->UseRealTime();
120   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x16,
121                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x16,
122                     xnn_init_f32_elu_neonfma_rr1_p6_params,
123                     benchmark::utils::CheckNEONFMA)
124     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
125     ->UseRealTime();
126   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x20,
127                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x20,
128                     xnn_init_f32_elu_neonfma_rr1_p6_params,
129                     benchmark::utils::CheckNEONFMA)
130     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
131     ->UseRealTime();
132   BENCHMARK_CAPTURE(f32_velu, neonfma_p6_x24,
133                     xnn_f32_velu_ukernel__neonfma_rr1_p6_x24,
134                     xnn_init_f32_elu_neonfma_rr1_p6_params,
135                     benchmark::utils::CheckNEONFMA)
136     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
137     ->UseRealTime();
138 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x{4,8,12,16,20,24} kernels
// (inside the XNN_ARCH_ARM || XNN_ARCH_ARM64 guard).
// Every entry uses xnn_init_f32_elu_neon_rr2_lut16_p3_params, is gated by
// benchmark::utils::CheckNEON, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr2" part of the kernel name.
139   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x4,
140                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x4,
141                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
142                     benchmark::utils::CheckNEON)
143     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
144     ->UseRealTime();
145   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x8,
146                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
147                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
148                     benchmark::utils::CheckNEON)
149     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
150     ->UseRealTime();
151   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x12,
152                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x12,
153                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
154                     benchmark::utils::CheckNEON)
155     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
156     ->UseRealTime();
157   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x16,
158                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x16,
159                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
160                     benchmark::utils::CheckNEON)
161     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
162     ->UseRealTime();
163   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x20,
164                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x20,
165                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
166                     benchmark::utils::CheckNEON)
167     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
168     ->UseRealTime();
169   BENCHMARK_CAPTURE(f32_velu, neon_lut16_p3_x24,
170                     xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x24,
171                     xnn_init_f32_elu_neon_rr2_lut16_p3_params,
172                     benchmark::utils::CheckNEON)
173     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
174     ->UseRealTime();
175 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__neon_rr2_p6_x{4,8,12,16,20,24} kernels
// (inside the XNN_ARCH_ARM || XNN_ARCH_ARM64 guard).
// Every entry uses xnn_init_f32_elu_neon_rr2_p6_params, is gated by
// benchmark::utils::CheckNEON, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr2" part of the kernel name.
176   BENCHMARK_CAPTURE(f32_velu, neon_p6_x4,
177                     xnn_f32_velu_ukernel__neon_rr2_p6_x4,
178                     xnn_init_f32_elu_neon_rr2_p6_params,
179                     benchmark::utils::CheckNEON)
180     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
181     ->UseRealTime();
182   BENCHMARK_CAPTURE(f32_velu, neon_p6_x8,
183                     xnn_f32_velu_ukernel__neon_rr2_p6_x8,
184                     xnn_init_f32_elu_neon_rr2_p6_params,
185                     benchmark::utils::CheckNEON)
186     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
187     ->UseRealTime();
188   BENCHMARK_CAPTURE(f32_velu, neon_p6_x12,
189                     xnn_f32_velu_ukernel__neon_rr2_p6_x12,
190                     xnn_init_f32_elu_neon_rr2_p6_params,
191                     benchmark::utils::CheckNEON)
192     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
193     ->UseRealTime();
194   BENCHMARK_CAPTURE(f32_velu, neon_p6_x16,
195                     xnn_f32_velu_ukernel__neon_rr2_p6_x16,
196                     xnn_init_f32_elu_neon_rr2_p6_params,
197                     benchmark::utils::CheckNEON)
198     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
199     ->UseRealTime();
200   BENCHMARK_CAPTURE(f32_velu, neon_p6_x20,
201                     xnn_f32_velu_ukernel__neon_rr2_p6_x20,
202                     xnn_init_f32_elu_neon_rr2_p6_params,
203                     benchmark::utils::CheckNEON)
204     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
205     ->UseRealTime();
206   BENCHMARK_CAPTURE(f32_velu, neon_p6_x24,
207                     xnn_f32_velu_ukernel__neon_rr2_p6_x24,
208                     xnn_init_f32_elu_neon_rr2_p6_params,
209                     benchmark::utils::CheckNEON)
210     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
211     ->UseRealTime();
212 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
213 
214 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x{16,32,...,128} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx512_rr1_lut16_p3_params, is gated by
// benchmark::utils::CheckAVX512F, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" and "perm" parts of the kernel name.
215   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x16,
216                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x16,
217                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
218                     benchmark::utils::CheckAVX512F)
219     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
220     ->UseRealTime();
221   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x32,
222                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x32,
223                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
224                     benchmark::utils::CheckAVX512F)
225     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
226     ->UseRealTime();
227   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x48,
228                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x48,
229                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
230                     benchmark::utils::CheckAVX512F)
231     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
232     ->UseRealTime();
233   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x64,
234                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
235                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
236                     benchmark::utils::CheckAVX512F)
237     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
238     ->UseRealTime();
239   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x80,
240                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x80,
241                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
242                     benchmark::utils::CheckAVX512F)
243     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
244     ->UseRealTime();
245   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x96,
246                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x96,
247                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
248                     benchmark::utils::CheckAVX512F)
249     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
250     ->UseRealTime();
251   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x112,
252                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x112,
253                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
254                     benchmark::utils::CheckAVX512F)
255     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
256     ->UseRealTime();
257   BENCHMARK_CAPTURE(f32_velu, avx512f_lut16_p3_x128,
258                     xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x128,
259                     xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
260                     benchmark::utils::CheckAVX512F)
261     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
262     ->UseRealTime();
263 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx512f_rr1_p6_x{16,32,...,128} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx512_rr1_p6_params, is gated by
// benchmark::utils::CheckAVX512F, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" part of the kernel name.
264   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x16,
265                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x16,
266                     xnn_init_f32_elu_avx512_rr1_p6_params,
267                     benchmark::utils::CheckAVX512F)
268     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
269     ->UseRealTime();
270   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x32,
271                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x32,
272                     xnn_init_f32_elu_avx512_rr1_p6_params,
273                     benchmark::utils::CheckAVX512F)
274     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
275     ->UseRealTime();
276   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x48,
277                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x48,
278                     xnn_init_f32_elu_avx512_rr1_p6_params,
279                     benchmark::utils::CheckAVX512F)
280     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
281     ->UseRealTime();
282   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x64,
283                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x64,
284                     xnn_init_f32_elu_avx512_rr1_p6_params,
285                     benchmark::utils::CheckAVX512F)
286     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
287     ->UseRealTime();
288   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x80,
289                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x80,
290                     xnn_init_f32_elu_avx512_rr1_p6_params,
291                     benchmark::utils::CheckAVX512F)
292     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
293     ->UseRealTime();
294   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x96,
295                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x96,
296                     xnn_init_f32_elu_avx512_rr1_p6_params,
297                     benchmark::utils::CheckAVX512F)
298     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
299     ->UseRealTime();
300   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x112,
301                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x112,
302                     xnn_init_f32_elu_avx512_rr1_p6_params,
303                     benchmark::utils::CheckAVX512F)
304     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
305     ->UseRealTime();
306   BENCHMARK_CAPTURE(f32_velu, avx512f_p6_x128,
307                     xnn_f32_velu_ukernel__avx512f_rr1_p6_x128,
308                     xnn_init_f32_elu_avx512_rr1_p6_params,
309                     benchmark::utils::CheckAVX512F)
310     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
311     ->UseRealTime();
312 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x{8,16,...,80} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx2_rr1_lut4_p4_params, is gated by
// benchmark::utils::CheckAVX2, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" and "perm" parts of the kernel name.
313   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x8,
314                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x8,
315                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
316                     benchmark::utils::CheckAVX2)
317     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
318     ->UseRealTime();
319   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x16,
320                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x16,
321                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
322                     benchmark::utils::CheckAVX2)
323     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
324     ->UseRealTime();
325   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x24,
326                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x24,
327                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
328                     benchmark::utils::CheckAVX2)
329     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
330     ->UseRealTime();
331   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x32,
332                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x32,
333                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
334                     benchmark::utils::CheckAVX2)
335     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
336     ->UseRealTime();
337   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x40,
338                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x40,
339                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
340                     benchmark::utils::CheckAVX2)
341     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
342     ->UseRealTime();
343   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x48,
344                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x48,
345                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
346                     benchmark::utils::CheckAVX2)
347     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
348     ->UseRealTime();
349   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x56,
350                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
351                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
352                     benchmark::utils::CheckAVX2)
353     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
354     ->UseRealTime();
355   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x64,
356                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x64,
357                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
358                     benchmark::utils::CheckAVX2)
359     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
360     ->UseRealTime();
361   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x72,
362                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x72,
363                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
364                     benchmark::utils::CheckAVX2)
365     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
366     ->UseRealTime();
367   BENCHMARK_CAPTURE(f32_velu, avx2_lut4_p4_x80,
368                     xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x80,
369                     xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
370                     benchmark::utils::CheckAVX2)
371     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
372     ->UseRealTime();
373 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x{8,16,...,80} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx2_rr1_lut8_p4_params, is gated by
// benchmark::utils::CheckAVX2, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" and "perm" parts of the kernel name.
374   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x8,
375                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x8,
376                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
377                     benchmark::utils::CheckAVX2)
378     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
379     ->UseRealTime();
380   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x16,
381                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x16,
382                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
383                     benchmark::utils::CheckAVX2)
384     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
385     ->UseRealTime();
386   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x24,
387                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x24,
388                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
389                     benchmark::utils::CheckAVX2)
390     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
391     ->UseRealTime();
392   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x32,
393                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x32,
394                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
395                     benchmark::utils::CheckAVX2)
396     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
397     ->UseRealTime();
398   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x40,
399                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x40,
400                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
401                     benchmark::utils::CheckAVX2)
402     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
403     ->UseRealTime();
404   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x48,
405                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x48,
406                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
407                     benchmark::utils::CheckAVX2)
408     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
409     ->UseRealTime();
410   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x56,
411                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x56,
412                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
413                     benchmark::utils::CheckAVX2)
414     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
415     ->UseRealTime();
416   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x64,
417                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x64,
418                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
419                     benchmark::utils::CheckAVX2)
420     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
421     ->UseRealTime();
422   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x72,
423                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x72,
424                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
425                     benchmark::utils::CheckAVX2)
426     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
427     ->UseRealTime();
428   BENCHMARK_CAPTURE(f32_velu, avx2_lut8_p4_x80,
429                     xnn_f32_velu_ukernel__avx2_rr1_lut8_p4_perm_x80,
430                     xnn_init_f32_elu_avx2_rr1_lut8_p4_params,
431                     benchmark::utils::CheckAVX2)
432     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
433     ->UseRealTime();
434 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x{8,16,...,80} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx2_rr1_lut16_p3_params, is gated by
// benchmark::utils::CheckAVX2, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" and "gather" parts of the kernel name.
435   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x8,
436                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x8,
437                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
438                     benchmark::utils::CheckAVX2)
439     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
440     ->UseRealTime();
441   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x16,
442                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x16,
443                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
444                     benchmark::utils::CheckAVX2)
445     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
446     ->UseRealTime();
447   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x24,
448                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x24,
449                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
450                     benchmark::utils::CheckAVX2)
451     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
452     ->UseRealTime();
453   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x32,
454                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x32,
455                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
456                     benchmark::utils::CheckAVX2)
457     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
458     ->UseRealTime();
459   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x40,
460                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x40,
461                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
462                     benchmark::utils::CheckAVX2)
463     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
464     ->UseRealTime();
465   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x48,
466                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x48,
467                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
468                     benchmark::utils::CheckAVX2)
469     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
470     ->UseRealTime();
471   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x56,
472                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x56,
473                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
474                     benchmark::utils::CheckAVX2)
475     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
476     ->UseRealTime();
477   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x64,
478                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x64,
479                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
480                     benchmark::utils::CheckAVX2)
481     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
482     ->UseRealTime();
483   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x72,
484                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x72,
485                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
486                     benchmark::utils::CheckAVX2)
487     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
488     ->UseRealTime();
489   BENCHMARK_CAPTURE(f32_velu, avx2_lut16_p3_x80,
490                     xnn_f32_velu_ukernel__avx2_rr1_lut16_p3_gather_x80,
491                     xnn_init_f32_elu_avx2_rr1_lut16_p3_params,
492                     benchmark::utils::CheckAVX2)
493     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
494     ->UseRealTime();
495 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx2_rr1_p6_x{8,16,...,80} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx2_rr1_p6_params, is gated by
// benchmark::utils::CheckAVX2, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr1" part of the kernel name.
496   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x8,
497                     xnn_f32_velu_ukernel__avx2_rr1_p6_x8,
498                     xnn_init_f32_elu_avx2_rr1_p6_params,
499                     benchmark::utils::CheckAVX2)
500     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
501     ->UseRealTime();
502   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x16,
503                     xnn_f32_velu_ukernel__avx2_rr1_p6_x16,
504                     xnn_init_f32_elu_avx2_rr1_p6_params,
505                     benchmark::utils::CheckAVX2)
506     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
507     ->UseRealTime();
508   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x24,
509                     xnn_f32_velu_ukernel__avx2_rr1_p6_x24,
510                     xnn_init_f32_elu_avx2_rr1_p6_params,
511                     benchmark::utils::CheckAVX2)
512     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
513     ->UseRealTime();
514   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x32,
515                     xnn_f32_velu_ukernel__avx2_rr1_p6_x32,
516                     xnn_init_f32_elu_avx2_rr1_p6_params,
517                     benchmark::utils::CheckAVX2)
518     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
519     ->UseRealTime();
520   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x40,
521                     xnn_f32_velu_ukernel__avx2_rr1_p6_x40,
522                     xnn_init_f32_elu_avx2_rr1_p6_params,
523                     benchmark::utils::CheckAVX2)
524     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
525     ->UseRealTime();
526   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x48,
527                     xnn_f32_velu_ukernel__avx2_rr1_p6_x48,
528                     xnn_init_f32_elu_avx2_rr1_p6_params,
529                     benchmark::utils::CheckAVX2)
530     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
531     ->UseRealTime();
532   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x56,
533                     xnn_f32_velu_ukernel__avx2_rr1_p6_x56,
534                     xnn_init_f32_elu_avx2_rr1_p6_params,
535                     benchmark::utils::CheckAVX2)
536     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
537     ->UseRealTime();
538   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x64,
539                     xnn_f32_velu_ukernel__avx2_rr1_p6_x64,
540                     xnn_init_f32_elu_avx2_rr1_p6_params,
541                     benchmark::utils::CheckAVX2)
542     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
543     ->UseRealTime();
544   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x72,
545                     xnn_f32_velu_ukernel__avx2_rr1_p6_x72,
546                     xnn_init_f32_elu_avx2_rr1_p6_params,
547                     benchmark::utils::CheckAVX2)
548     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
549     ->UseRealTime();
550   BENCHMARK_CAPTURE(f32_velu, avx2_p6_x80,
551                     xnn_f32_velu_ukernel__avx2_rr1_p6_x80,
552                     xnn_init_f32_elu_avx2_rr1_p6_params,
553                     benchmark::utils::CheckAVX2)
554     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
555     ->UseRealTime();
556 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x{8,16,24,32,40,48} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx_rr2_lut4_p4_params, is gated by
// benchmark::utils::CheckAVX, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr2" and "perm" parts of the kernel name.
557   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x8,
558                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x8,
559                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
560                     benchmark::utils::CheckAVX)
561     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
562     ->UseRealTime();
563   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x16,
564                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x16,
565                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
566                     benchmark::utils::CheckAVX)
567     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
568     ->UseRealTime();
569   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x24,
570                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x24,
571                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
572                     benchmark::utils::CheckAVX)
573     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
574     ->UseRealTime();
575   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x32,
576                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
577                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
578                     benchmark::utils::CheckAVX)
579     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
580     ->UseRealTime();
581   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x40,
582                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x40,
583                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
584                     benchmark::utils::CheckAVX)
585     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
586     ->UseRealTime();
587   BENCHMARK_CAPTURE(f32_velu, avx_lut4_p4_x48,
588                     xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x48,
589                     xnn_init_f32_elu_avx_rr2_lut4_p4_params,
590                     benchmark::utils::CheckAVX)
591     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
592     ->UseRealTime();
593 
// Registers f32_velu benchmarks for the
// xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x{8,16,24,32,40,48} kernels
// (inside the XNN_ARCH_X86 || XNN_ARCH_X86_64 guard).
// Every entry uses xnn_init_f32_elu_avx_rr2_lut16_p3_params, is gated by
// benchmark::utils::CheckAVX, takes its arguments from
// UnaryElementwiseParameters<float, float>, and is reported in real time.
// The registered label omits the "rr2" part of the kernel name.
594   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x8,
595                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x8,
596                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
597                     benchmark::utils::CheckAVX)
598     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
599     ->UseRealTime();
600   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x16,
601                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x16,
602                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
603                     benchmark::utils::CheckAVX)
604     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
605     ->UseRealTime();
606   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x24,
607                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x24,
608                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
609                     benchmark::utils::CheckAVX)
610     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
611     ->UseRealTime();
612   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x32,
613                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x32,
614                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
615                     benchmark::utils::CheckAVX)
616     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
617     ->UseRealTime();
618   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x40,
619                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x40,
620                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
621                     benchmark::utils::CheckAVX)
622     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
623     ->UseRealTime();
624   BENCHMARK_CAPTURE(f32_velu, avx_lut16_p3_x48,
625                     xnn_f32_velu_ukernel__avx_rr2_lut16_p3_x48,
626                     xnn_init_f32_elu_avx_rr2_lut16_p3_params,
627                     benchmark::utils::CheckAVX)
628     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
629     ->UseRealTime();
630 
// Registers f32_velu benchmarks for the AVX rr2_p6 ELU micro-kernels
// (x8 through x48 variants), each paired with
// xnn_init_f32_elu_avx_rr2_p6_params. benchmark::utils::CheckAVX is passed as
// isa_check; f32_velu returns before benchmarking when that check fails.
// NOTE(review): "p6" presumably denotes a table-free degree-6 polynomial
// approximation; confirm against the kernel sources.
631   BENCHMARK_CAPTURE(f32_velu, avx_p6_x8,
632                     xnn_f32_velu_ukernel__avx_rr2_p6_x8,
633                     xnn_init_f32_elu_avx_rr2_p6_params,
634                     benchmark::utils::CheckAVX)
635     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
636     ->UseRealTime();
637   BENCHMARK_CAPTURE(f32_velu, avx_p6_x16,
638                     xnn_f32_velu_ukernel__avx_rr2_p6_x16,
639                     xnn_init_f32_elu_avx_rr2_p6_params,
640                     benchmark::utils::CheckAVX)
641     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
642     ->UseRealTime();
643   BENCHMARK_CAPTURE(f32_velu, avx_p6_x24,
644                     xnn_f32_velu_ukernel__avx_rr2_p6_x24,
645                     xnn_init_f32_elu_avx_rr2_p6_params,
646                     benchmark::utils::CheckAVX)
647     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
648     ->UseRealTime();
649   BENCHMARK_CAPTURE(f32_velu, avx_p6_x32,
650                     xnn_f32_velu_ukernel__avx_rr2_p6_x32,
651                     xnn_init_f32_elu_avx_rr2_p6_params,
652                     benchmark::utils::CheckAVX)
653     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
654     ->UseRealTime();
655   BENCHMARK_CAPTURE(f32_velu, avx_p6_x40,
656                     xnn_f32_velu_ukernel__avx_rr2_p6_x40,
657                     xnn_init_f32_elu_avx_rr2_p6_params,
658                     benchmark::utils::CheckAVX)
659     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
660     ->UseRealTime();
661   BENCHMARK_CAPTURE(f32_velu, avx_p6_x48,
662                     xnn_f32_velu_ukernel__avx_rr2_p6_x48,
663                     xnn_init_f32_elu_avx_rr2_p6_params,
664                     benchmark::utils::CheckAVX)
665     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
666     ->UseRealTime();
667 
// Registers f32_velu benchmarks for the SSE4.1 rr2_lut16_p3 ELU micro-kernels
// (x4 through x24 variants). benchmark::utils::CheckSSE41 is passed as
// isa_check; f32_velu returns before benchmarking when that check fails.
// Note that these kernels are paired with the *sse2* params initializer
// (xnn_init_f32_elu_sse2_rr2_lut16_p3_params).
// NOTE(review): presumably the SSE2 and SSE4.1 kernels share one parameter
// layout, which would make this pairing intentional; confirm.
668   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x4,
669                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x4,
670                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
671                     benchmark::utils::CheckSSE41)
672     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
673     ->UseRealTime();
674   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x8,
675                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x8,
676                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
677                     benchmark::utils::CheckSSE41)
678     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
679     ->UseRealTime();
680   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x12,
681                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x12,
682                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
683                     benchmark::utils::CheckSSE41)
684     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
685     ->UseRealTime();
686   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x16,
687                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x16,
688                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
689                     benchmark::utils::CheckSSE41)
690     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
691     ->UseRealTime();
692   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x20,
693                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x20,
694                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
695                     benchmark::utils::CheckSSE41)
696     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
697     ->UseRealTime();
698   BENCHMARK_CAPTURE(f32_velu, sse41_lut16_p3_x24,
699                     xnn_f32_velu_ukernel__sse41_rr2_lut16_p3_x24,
700                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
701                     benchmark::utils::CheckSSE41)
702     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
703     ->UseRealTime();
704 
// Registers f32_velu benchmarks for the SSE4.1 rr2_p6 ELU micro-kernels
// (x4 through x24 variants). benchmark::utils::CheckSSE41 is passed as
// isa_check; f32_velu returns before benchmarking when that check fails.
// Note that these kernels are paired with the *sse2* params initializer
// (xnn_init_f32_elu_sse2_rr2_p6_params).
// NOTE(review): presumably the SSE2 and SSE4.1 kernels share one parameter
// layout, which would make this pairing intentional; confirm.
705   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x4,
706                     xnn_f32_velu_ukernel__sse41_rr2_p6_x4,
707                     xnn_init_f32_elu_sse2_rr2_p6_params,
708                     benchmark::utils::CheckSSE41)
709     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
710     ->UseRealTime();
711   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x8,
712                     xnn_f32_velu_ukernel__sse41_rr2_p6_x8,
713                     xnn_init_f32_elu_sse2_rr2_p6_params,
714                     benchmark::utils::CheckSSE41)
715     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
716     ->UseRealTime();
717   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x12,
718                     xnn_f32_velu_ukernel__sse41_rr2_p6_x12,
719                     xnn_init_f32_elu_sse2_rr2_p6_params,
720                     benchmark::utils::CheckSSE41)
721     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
722     ->UseRealTime();
723   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x16,
724                     xnn_f32_velu_ukernel__sse41_rr2_p6_x16,
725                     xnn_init_f32_elu_sse2_rr2_p6_params,
726                     benchmark::utils::CheckSSE41)
727     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
728     ->UseRealTime();
729   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x20,
730                     xnn_f32_velu_ukernel__sse41_rr2_p6_x20,
731                     xnn_init_f32_elu_sse2_rr2_p6_params,
732                     benchmark::utils::CheckSSE41)
733     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
734     ->UseRealTime();
735   BENCHMARK_CAPTURE(f32_velu, sse41_p6_x24,
736                     xnn_f32_velu_ukernel__sse41_rr2_p6_x24,
737                     xnn_init_f32_elu_sse2_rr2_p6_params,
738                     benchmark::utils::CheckSSE41)
739     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
740     ->UseRealTime();
741 
// Registers f32_velu benchmarks for the SSE2 rr2_lut16_p3 ELU micro-kernels
// (x4 through x24 variants), each paired with
// xnn_init_f32_elu_sse2_rr2_lut16_p3_params. No isa_check argument is given,
// so f32_velu's default (nullptr) applies and no ISA check is performed.
// NOTE(review): presumably SSE2 is treated as always available on the x86
// targets this section is compiled for; confirm.
742   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x4,
743                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x4,
744                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
745     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
746     ->UseRealTime();
747   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x8,
748                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x8,
749                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
750     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
751     ->UseRealTime();
752   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x12,
753                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
754                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
755     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
756     ->UseRealTime();
757   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x16,
758                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x16,
759                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
760     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
761     ->UseRealTime();
762   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x20,
763                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x20,
764                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
765     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
766     ->UseRealTime();
767   BENCHMARK_CAPTURE(f32_velu, sse2_lut16_p3_x24,
768                     xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x24,
769                     xnn_init_f32_elu_sse2_rr2_lut16_p3_params)
770     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
771     ->UseRealTime();
772 
// Registers f32_velu benchmarks for the SSE2 rr2_p6 ELU micro-kernels
// (x4 through x24 variants), each paired with
// xnn_init_f32_elu_sse2_rr2_p6_params. No isa_check argument is given, so
// f32_velu's default (nullptr) applies and no ISA check is performed.
// NOTE(review): presumably SSE2 is treated as always available on the x86
// targets this section is compiled for; confirm.
773   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x4,
774                     xnn_f32_velu_ukernel__sse2_rr2_p6_x4,
775                     xnn_init_f32_elu_sse2_rr2_p6_params)
776     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
777     ->UseRealTime();
778   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x8,
779                     xnn_f32_velu_ukernel__sse2_rr2_p6_x8,
780                     xnn_init_f32_elu_sse2_rr2_p6_params)
781     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
782     ->UseRealTime();
783   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x12,
784                     xnn_f32_velu_ukernel__sse2_rr2_p6_x12,
785                     xnn_init_f32_elu_sse2_rr2_p6_params)
786     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
787     ->UseRealTime();
788   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x16,
789                     xnn_f32_velu_ukernel__sse2_rr2_p6_x16,
790                     xnn_init_f32_elu_sse2_rr2_p6_params)
791     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
792     ->UseRealTime();
793   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x20,
794                     xnn_f32_velu_ukernel__sse2_rr2_p6_x20,
795                     xnn_init_f32_elu_sse2_rr2_p6_params)
796     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
797     ->UseRealTime();
798   BENCHMARK_CAPTURE(f32_velu, sse2_p6_x24,
799                     xnn_f32_velu_ukernel__sse2_rr2_p6_x24,
800                     xnn_init_f32_elu_sse2_rr2_p6_params)
801     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
802     ->UseRealTime();
803 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
804 
805 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Registers f32_velu benchmarks for the WAsm SIMD "arm" rr2_lut16_p3 ELU
// micro-kernels (x4 through x24 variants), each paired with
// xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params. No isa_check argument is
// given, so f32_velu's default (nullptr) applies and no ISA check is performed.
// NOTE(review): the "arm" tag presumably marks the kernel flavour tuned for
// WAsm engines running on ARM hosts; confirm against the kernel sources.
806   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x4,
807                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x4,
808                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
809     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
810     ->UseRealTime();
811   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x8,
812                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x8,
813                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
814     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
815     ->UseRealTime();
816   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x12,
817                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x12,
818                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
819     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
820     ->UseRealTime();
821   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x16,
822                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x16,
823                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
824     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
825     ->UseRealTime();
826   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x20,
827                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x20,
828                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
829     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
830     ->UseRealTime();
831   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_lut16_p3_x24,
832                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x24,
833                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
834     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
835     ->UseRealTime();
836 
// Registers f32_velu benchmarks for the WAsm SIMD "x86" rr2_lut16_p3 ELU
// micro-kernels (x4 through x24 variants), each paired with
// xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params (the same initializer the
// "arm" flavour uses). No isa_check argument is given, so f32_velu's default
// (nullptr) applies and no ISA check is performed.
// NOTE(review): the "x86" tag presumably marks the kernel flavour tuned for
// WAsm engines running on x86 hosts; confirm against the kernel sources.
837   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x4,
838                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x4,
839                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
840     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
841     ->UseRealTime();
842   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x8,
843                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x8,
844                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
845     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
846     ->UseRealTime();
847   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x12,
848                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x12,
849                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
850     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
851     ->UseRealTime();
852   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x16,
853                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x16,
854                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
855     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
856     ->UseRealTime();
857   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x20,
858                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x20,
859                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
860     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
861     ->UseRealTime();
862   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_lut16_p3_x24,
863                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_lut16_p3_x24,
864                     xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params)
865     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
866     ->UseRealTime();
867 
// Registers f32_velu benchmarks for the WAsm SIMD "arm" rr2_p6 ELU
// micro-kernels (x4 through x24 variants), each paired with
// xnn_init_f32_elu_wasmsimd_rr2_p6_params. No isa_check argument is given, so
// f32_velu's default (nullptr) applies and no ISA check is performed.
// NOTE(review): the "arm" tag presumably marks the kernel flavour tuned for
// WAsm engines running on ARM hosts; confirm against the kernel sources.
868   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x4,
869                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x4,
870                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
871     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
872     ->UseRealTime();
873   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x8,
874                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x8,
875                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
876     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
877     ->UseRealTime();
878   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x12,
879                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x12,
880                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
881     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
882     ->UseRealTime();
883   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x16,
884                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x16,
885                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
886     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
887     ->UseRealTime();
888   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x20,
889                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
890                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
891     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
892     ->UseRealTime();
893   BENCHMARK_CAPTURE(f32_velu, wasmsimd_arm_p6_x24,
894                     xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x24,
895                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
896     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
897     ->UseRealTime();
898 
// Registers f32_velu benchmarks for the WAsm SIMD "x86" rr2_p6 ELU
// micro-kernels (x4 through x24 variants), each paired with
// xnn_init_f32_elu_wasmsimd_rr2_p6_params (the same initializer the "arm"
// flavour uses). No isa_check argument is given, so f32_velu's default
// (nullptr) applies and no ISA check is performed.
// NOTE(review): the "x86" tag presumably marks the kernel flavour tuned for
// WAsm engines running on x86 hosts; confirm against the kernel sources.
899   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x4,
900                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x4,
901                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
902     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
903     ->UseRealTime();
904   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x8,
905                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x8,
906                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
907     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
908     ->UseRealTime();
909   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x12,
910                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x12,
911                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
912     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
913     ->UseRealTime();
914   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x16,
915                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x16,
916                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
917     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
918     ->UseRealTime();
919   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x20,
920                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
921                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
922     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
923     ->UseRealTime();
924   BENCHMARK_CAPTURE(f32_velu, wasmsimd_x86_p6_x24,
925                     xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x24,
926                     xnn_init_f32_elu_wasmsimd_rr2_p6_params)
927     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
928     ->UseRealTime();
929 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
930 
931 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Registers f32_velu benchmarks for the (non-SIMD) WAsm rr2_lut16_p3 ELU
// micro-kernels (x1 through x6 variants). These kernels are paired with the
// *scalar* params initializer (xnn_init_f32_elu_scalar_rr2_lut16_p3_params).
// No isa_check argument is given, so f32_velu's default (nullptr) applies and
// no ISA check is performed.
// NOTE(review): presumably the wasm kernels consume the scalar parameter
// layout, which would make this pairing intentional; confirm.
932   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x1,
933                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x1,
934                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
935     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
936     ->UseRealTime();
937   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x2,
938                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x2,
939                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
940     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
941     ->UseRealTime();
942   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x3,
943                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x3,
944                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
945     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
946     ->UseRealTime();
947   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x4,
948                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x4,
949                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
950     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
951     ->UseRealTime();
952   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x5,
953                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x5,
954                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
955     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
956     ->UseRealTime();
957   BENCHMARK_CAPTURE(f32_velu, wasm_lut16_p3_x6,
958                     xnn_f32_velu_ukernel__wasm_rr2_lut16_p3_x6,
959                     xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
960     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
961     ->UseRealTime();
962 
// Registers f32_velu benchmarks for the (non-SIMD) WAsm rr2_p6 ELU
// micro-kernels (x1 through x6 variants). These kernels are paired with the
// *scalar* params initializer (xnn_init_f32_elu_scalar_rr2_p6_params).
// No isa_check argument is given, so f32_velu's default (nullptr) applies and
// no ISA check is performed.
// NOTE(review): presumably the wasm kernels consume the scalar parameter
// layout, which would make this pairing intentional; confirm.
963   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x1,
964                     xnn_f32_velu_ukernel__wasm_rr2_p6_x1,
965                     xnn_init_f32_elu_scalar_rr2_p6_params)
966     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
967     ->UseRealTime();
968   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x2,
969                     xnn_f32_velu_ukernel__wasm_rr2_p6_x2,
970                     xnn_init_f32_elu_scalar_rr2_p6_params)
971     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
972     ->UseRealTime();
973   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x3,
974                     xnn_f32_velu_ukernel__wasm_rr2_p6_x3,
975                     xnn_init_f32_elu_scalar_rr2_p6_params)
976     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
977     ->UseRealTime();
978   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x4,
979                     xnn_f32_velu_ukernel__wasm_rr2_p6_x4,
980                     xnn_init_f32_elu_scalar_rr2_p6_params)
981     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
982     ->UseRealTime();
983   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x5,
984                     xnn_f32_velu_ukernel__wasm_rr2_p6_x5,
985                     xnn_init_f32_elu_scalar_rr2_p6_params)
986     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
987     ->UseRealTime();
988   BENCHMARK_CAPTURE(f32_velu, wasm_p6_x6,
989                     xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
990                     xnn_init_f32_elu_scalar_rr2_p6_params)
991     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
992     ->UseRealTime();
993 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
994 
// Registers f32_velu benchmarks for the portable scalar rr2_lut16_p3 ELU
// micro-kernels (x1 through x6 variants), each paired with
// xnn_init_f32_elu_scalar_rr2_lut16_p3_params. These registrations sit outside
// any architecture #if, so they are compiled for every target. No isa_check
// argument is given, so f32_velu's default (nullptr) applies.
995 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x1,
996                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x1,
997                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
998   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
999   ->UseRealTime();
1000 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x2,
1001                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
1002                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
1003   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1004   ->UseRealTime();
1005 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x3,
1006                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x3,
1007                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
1008   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1009   ->UseRealTime();
1010 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x4,
1011                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1012                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
1013   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1014   ->UseRealTime();
1015 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x5,
1016                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x5,
1017                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
1018   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1019   ->UseRealTime();
1020 BENCHMARK_CAPTURE(f32_velu, scalar_lut16_p3_x6,
1021                   xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x6,
1022                   xnn_init_f32_elu_scalar_rr2_lut16_p3_params)
1023   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1024   ->UseRealTime();
1025 
// Registers f32_velu benchmarks for the portable scalar rr2_p6 ELU
// micro-kernels (x1 through x6 variants), each paired with
// xnn_init_f32_elu_scalar_rr2_p6_params. These registrations sit outside any
// architecture #if, so they are compiled for every target. No isa_check
// argument is given, so f32_velu's default (nullptr) applies.
1026 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x1,
1027                   xnn_f32_velu_ukernel__scalar_rr2_p6_x1,
1028                   xnn_init_f32_elu_scalar_rr2_p6_params)
1029   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1030   ->UseRealTime();
1031 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x2,
1032                   xnn_f32_velu_ukernel__scalar_rr2_p6_x2,
1033                   xnn_init_f32_elu_scalar_rr2_p6_params)
1034   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1035   ->UseRealTime();
1036 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x3,
1037                   xnn_f32_velu_ukernel__scalar_rr2_p6_x3,
1038                   xnn_init_f32_elu_scalar_rr2_p6_params)
1039   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1040   ->UseRealTime();
1041 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x4,
1042                   xnn_f32_velu_ukernel__scalar_rr2_p6_x4,
1043                   xnn_init_f32_elu_scalar_rr2_p6_params)
1044   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1045   ->UseRealTime();
1046 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x5,
1047                   xnn_f32_velu_ukernel__scalar_rr2_p6_x5,
1048                   xnn_init_f32_elu_scalar_rr2_p6_params)
1049   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1050   ->UseRealTime();
1051 BENCHMARK_CAPTURE(f32_velu, scalar_p6_x6,
1052                   xnn_f32_velu_ukernel__scalar_rr2_p6_x6,
1053                   xnn_init_f32_elu_scalar_rr2_p6_params)
1054   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
1055   ->UseRealTime();
1056 
// Expands BENCHMARK_MAIN() (Google Benchmark's main() definition) unless
// XNNPACK_BENCHMARK_NO_MAIN is defined.
// NOTE(review): presumably the opt-out exists so this file can be linked into
// a combined benchmark binary that supplies its own main(); confirm.
1057 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1058 BENCHMARK_MAIN();
1059 #endif
1060