xref: /aosp_15_r20/external/XNNPACK/bench/f32-raddstoreexpminusmax.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <benchmark/benchmark.h>
13 #include "bench/utils.h"
14 
15 #include <xnnpack.h>
16 #include <xnnpack/aligned-allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/microfnptr.h>
19 #include <xnnpack/microparams-init.h>
20 #include <xnnpack/raddstoreexpminusmax.h>
21 #include <xnnpack/rmax.h>
22 
23 
f32_raddstoreexpminusmax(benchmark::State & state,xnn_f32_rmax_ukernel_function rmax,xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,xnn_init_f32_expminus_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)24 static void f32_raddstoreexpminusmax(
25   benchmark::State& state,
26   xnn_f32_rmax_ukernel_function rmax,
27   xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax,
28   xnn_init_f32_expminus_params_fn init_params,
29   benchmark::utils::IsaCheckFunction isa_check = nullptr)
30 {
31   if (isa_check && !isa_check(state)) {
32     return;
33   }
34 
35   const size_t elements = state.range(0);
36   const size_t cache_line_size_max = 128;
37   const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
38 
39   std::random_device random_device;
40   auto rng = std::mt19937(random_device());
41   auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
42 
43   const size_t num_buffers = 1 +
44     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
45   std::vector<float, AlignedAllocator<float, 64>> x(elements);
46   std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
47 
48   std::generate(x.begin(), x.end(), std::ref(f32rng));
49 
50   benchmark::utils::DisableDenormals();
51 
52   xnn_f32_expminus_params params;
53   init_params(&params);
54 
55   size_t buffer_index = 0;
56   for (auto _ : state) {
57     state.PauseTiming();
58     float x_max = nanf("");
59     rmax(elements * sizeof(float), x.data(), &x_max);
60     if (++buffer_index == num_buffers) {
61       buffer_index = 0;
62     }
63     state.ResumeTiming();
64 
65     float y_sum = nanf("");
66     raddstoreexpminusmax(elements * sizeof(float), x.data(), &x_max, y.data() + buffer_index * packed_elements, &y_sum, &params);
67   }
68 
69   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
70   if (cpu_frequency != 0) {
71     state.counters["cpufreq"] = cpu_frequency;
72   }
73 
74   const size_t elements_per_iteration = elements;
75   state.counters["elements"] =
76     benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
77 
78   const size_t bytes_per_iteration = 2 * elements * sizeof(float);
79   state.counters["bytes"] =
80     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
81 }
82 
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Registers f32_raddstoreexpminusmax for the micro-kernel
  // xnn_f32_raddstoreexpminusmax_ukernel__<variant>, using <variant> as the
  // benchmark case name. Every ARM registration uses
  // xnn_f32_rmax_ukernel__neon for the untimed max reduction.
  #define BENCH_F32_RADDSTOREEXPMINUSMAX_ARM(variant, init_params_fn, isa_check_fn) \
    BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, variant,                            \
                      xnn_f32_rmax_ukernel__neon,                                   \
                      xnn_f32_raddstoreexpminusmax_ukernel__##variant,              \
                      init_params_fn,                                               \
                      isa_check_fn)                                                 \
      ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)           \
      ->UseRealTime()

  // One shorthand per kernel family: each fixes the params initializer and the
  // ISA check shared by all variants of that family.
  #define BENCH_NEON_RR2_P5(variant)                                        \
    BENCH_F32_RADDSTOREEXPMINUSMAX_ARM(                                     \
      variant, xnn_init_f32_expminus_neon_rr2_p5_params, benchmark::utils::CheckNEON)
  #define BENCH_NEON_RR2_LUT64_P2(variant)                                  \
    BENCH_F32_RADDSTOREEXPMINUSMAX_ARM(                                     \
      variant, xnn_init_f32_expminus_neon_rr2_lut64_p2_params, benchmark::utils::CheckNEON)
  #define BENCH_NEONFMA_RR1_P5(variant)                                     \
    BENCH_F32_RADDSTOREEXPMINUSMAX_ARM(                                     \
      variant, xnn_init_f32_expminus_neonfma_rr1_p5_params, benchmark::utils::CheckNEONFMA)
  #define BENCH_NEONFMA_RR1_LUT64_P2(variant)                               \
    BENCH_F32_RADDSTOREEXPMINUSMAX_ARM(                                     \
      variant, xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params, benchmark::utils::CheckNEONFMA)

  // Keep one registration per source line: the benchmark library derives the
  // name of each registration object from a per-invocation unique id.
  BENCH_NEON_RR2_P5(neon_rr2_p5_x4);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x8);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x8_acc2);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x12);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x12_acc2);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x12_acc3);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x16);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x16_acc2);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x16_acc4);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x20);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x20_acc2);
  BENCH_NEON_RR2_P5(neon_rr2_p5_x20_acc5);

  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x4);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x8);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x8_acc2);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x12);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x12_acc2);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x12_acc3);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x16);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x16_acc2);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x16_acc4);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x20);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x20_acc2);
  BENCH_NEON_RR2_LUT64_P2(neon_rr2_lut64_p2_x20_acc5);

  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x4);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x8);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x8_acc2);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x12);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x12_acc2);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x12_acc3);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x16);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x16_acc2);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x16_acc4);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x20);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x20_acc2);
  BENCH_NEONFMA_RR1_P5(neonfma_rr1_p5_x20_acc5);

  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x4);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x8);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x8_acc2);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x12);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x12_acc2);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x12_acc3);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x16);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x16_acc2);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x16_acc4);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x20);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x20_acc2);
  BENCH_NEONFMA_RR1_LUT64_P2(neonfma_rr1_lut64_p2_x20_acc5);

  #undef BENCH_NEONFMA_RR1_LUT64_P2
  #undef BENCH_NEONFMA_RR1_P5
  #undef BENCH_NEON_RR2_LUT64_P2
  #undef BENCH_NEON_RR2_P5
  #undef BENCH_F32_RADDSTOREEXPMINUSMAX_ARM
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
424 
425 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
426   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x128,
427                     xnn_f32_rmax_ukernel__avx,
428                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128,
429                     xnn_init_f32_expminus_avx512_rr1_p5_params,
430                     benchmark::utils::CheckAVX512F)
431     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
432     ->UseRealTime();
433   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x128_acc2,
434                     xnn_f32_rmax_ukernel__avx,
435                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2,
436                     xnn_init_f32_expminus_avx512_rr1_p5_params,
437                     benchmark::utils::CheckAVX512F)
438     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
439     ->UseRealTime();
440   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x128_acc4,
441                     xnn_f32_rmax_ukernel__avx,
442                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc4,
443                     xnn_init_f32_expminus_avx512_rr1_p5_params,
444                     benchmark::utils::CheckAVX512F)
445     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
446     ->UseRealTime();
447 
448   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x144,
449                     xnn_f32_rmax_ukernel__avx,
450                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x144,
451                     xnn_init_f32_expminus_avx512_rr1_p5_params,
452                     benchmark::utils::CheckAVX512F)
453     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
454     ->UseRealTime();
455   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x144_acc3,
456                     xnn_f32_rmax_ukernel__avx,
457                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x144_acc3,
458                     xnn_init_f32_expminus_avx512_rr1_p5_params,
459                     benchmark::utils::CheckAVX512F)
460     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
461     ->UseRealTime();
462 
463   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x160,
464                     xnn_f32_rmax_ukernel__avx,
465                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160,
466                     xnn_init_f32_expminus_avx512_rr1_p5_params,
467                     benchmark::utils::CheckAVX512F)
468     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
469     ->UseRealTime();
470   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x160_acc2,
471                     xnn_f32_rmax_ukernel__avx,
472                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160_acc2,
473                     xnn_init_f32_expminus_avx512_rr1_p5_params,
474                     benchmark::utils::CheckAVX512F)
475     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
476     ->UseRealTime();
477   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x160_acc5,
478                     xnn_f32_rmax_ukernel__avx,
479                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160_acc5,
480                     xnn_init_f32_expminus_avx512_rr1_p5_params,
481                     benchmark::utils::CheckAVX512F)
482     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
483     ->UseRealTime();
484 
485   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x192,
486                     xnn_f32_rmax_ukernel__avx,
487                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192,
488                     xnn_init_f32_expminus_avx512_rr1_p5_params,
489                     benchmark::utils::CheckAVX512F)
490     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
491     ->UseRealTime();
492   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x192_acc2,
493                     xnn_f32_rmax_ukernel__avx,
494                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc2,
495                     xnn_init_f32_expminus_avx512_rr1_p5_params,
496                     benchmark::utils::CheckAVX512F)
497     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
498     ->UseRealTime();
499   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x192_acc3,
500                     xnn_f32_rmax_ukernel__avx,
501                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc3,
502                     xnn_init_f32_expminus_avx512_rr1_p5_params,
503                     benchmark::utils::CheckAVX512F)
504     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
505     ->UseRealTime();
506   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx512f_rr1_p5_scalef_x192_acc6,
507                     xnn_f32_rmax_ukernel__avx,
508                     xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc6,
509                     xnn_init_f32_expminus_avx512_rr1_p5_params,
510                     benchmark::utils::CheckAVX512F)
511     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
512     ->UseRealTime();
513 
514   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x64,
515                     xnn_f32_rmax_ukernel__avx,
516                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64,
517                     xnn_init_f32_expminus_avx2_rr1_p5_params,
518                     benchmark::utils::CheckAVX2)
519     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
520     ->UseRealTime();
521   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x64_acc2,
522                     xnn_f32_rmax_ukernel__avx,
523                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc2,
524                     xnn_init_f32_expminus_avx2_rr1_p5_params,
525                     benchmark::utils::CheckAVX2)
526     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
527     ->UseRealTime();
528   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x64_acc4,
529                     xnn_f32_rmax_ukernel__avx,
530                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x64_acc4,
531                     xnn_init_f32_expminus_avx2_rr1_p5_params,
532                     benchmark::utils::CheckAVX2)
533     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
534     ->UseRealTime();
535 
536   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x72,
537                     xnn_f32_rmax_ukernel__avx,
538                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x72,
539                     xnn_init_f32_expminus_avx2_rr1_p5_params,
540                     benchmark::utils::CheckAVX2)
541     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
542     ->UseRealTime();
543   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x72_acc3,
544                     xnn_f32_rmax_ukernel__avx,
545                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x72_acc3,
546                     xnn_init_f32_expminus_avx2_rr1_p5_params,
547                     benchmark::utils::CheckAVX2)
548     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
549     ->UseRealTime();
550 
// Registers the avx2_rr1_p5_x80 kernel and its _acc2 / _acc5 variants as cases
// of the f32_raddstoreexpminusmax benchmark. All three use the AVX rmax
// kernel, the avx2_rr1_p5 params initializer and CheckAVX2 as isa_check (the
// benchmark function returns early when that check fails).
551   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x80,
552                     xnn_f32_rmax_ukernel__avx,
553                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x80,
554                     xnn_init_f32_expminus_avx2_rr1_p5_params,
555                     benchmark::utils::CheckAVX2)
556     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
557     ->UseRealTime();
558   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x80_acc2,
559                     xnn_f32_rmax_ukernel__avx,
560                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x80_acc2,
561                     xnn_init_f32_expminus_avx2_rr1_p5_params,
562                     benchmark::utils::CheckAVX2)
563     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
564     ->UseRealTime();
565   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x80_acc5,
566                     xnn_f32_rmax_ukernel__avx,
567                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x80_acc5,
568                     xnn_init_f32_expminus_avx2_rr1_p5_params,
569                     benchmark::utils::CheckAVX2)
570     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
571     ->UseRealTime();
572 
// Registers the avx2_rr1_p5_x96 kernel and its _acc2 / _acc3 / _acc6 variants
// as cases of the f32_raddstoreexpminusmax benchmark. All four use the AVX
// rmax kernel, the avx2_rr1_p5 params initializer and CheckAVX2 as isa_check
// (the benchmark function returns early when that check fails).
573   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x96,
574                     xnn_f32_rmax_ukernel__avx,
575                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x96,
576                     xnn_init_f32_expminus_avx2_rr1_p5_params,
577                     benchmark::utils::CheckAVX2)
578     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
579     ->UseRealTime();
580   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x96_acc2,
581                     xnn_f32_rmax_ukernel__avx,
582                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x96_acc2,
583                     xnn_init_f32_expminus_avx2_rr1_p5_params,
584                     benchmark::utils::CheckAVX2)
585     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
586     ->UseRealTime();
587   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x96_acc3,
588                     xnn_f32_rmax_ukernel__avx,
589                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x96_acc3,
590                     xnn_init_f32_expminus_avx2_rr1_p5_params,
591                     benchmark::utils::CheckAVX2)
592     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
593     ->UseRealTime();
594   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, avx2_rr1_p5_x96_acc6,
595                     xnn_f32_rmax_ukernel__avx,
596                     xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x96_acc6,
597                     xnn_init_f32_expminus_avx2_rr1_p5_params,
598                     benchmark::utils::CheckAVX2)
599     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
600     ->UseRealTime();
601 
// Registers the sse2_rr2_p5 kernels (x4, x8, x12, x16, x20 and their _accN
// variants) as cases of the f32_raddstoreexpminusmax benchmark. Every case
// uses xnn_f32_rmax_ukernel__sse as the rmax kernel and
// xnn_init_f32_expminus_sse2_rr2_p5_params as init_params.
// No isa_check argument is passed, so the parameter keeps its nullptr default
// and the benchmark function never skips these cases at run time.
// NOTE(review): presumably SSE2 is treated as always available on the x86
// targets this section is compiled for -- confirm against the build config.
602   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x4,
603                     xnn_f32_rmax_ukernel__sse,
604                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x4,
605                     xnn_init_f32_expminus_sse2_rr2_p5_params)
606     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
607     ->UseRealTime();
608   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x8,
609                     xnn_f32_rmax_ukernel__sse,
610                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x8,
611                     xnn_init_f32_expminus_sse2_rr2_p5_params)
612     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
613     ->UseRealTime();
614   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x8_acc2,
615                     xnn_f32_rmax_ukernel__sse,
616                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x8_acc2,
617                     xnn_init_f32_expminus_sse2_rr2_p5_params)
618     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
619     ->UseRealTime();
620   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x12,
621                     xnn_f32_rmax_ukernel__sse,
622                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x12,
623                     xnn_init_f32_expminus_sse2_rr2_p5_params)
624     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
625     ->UseRealTime();
626   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x12_acc2,
627                     xnn_f32_rmax_ukernel__sse,
628                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x12_acc2,
629                     xnn_init_f32_expminus_sse2_rr2_p5_params)
630     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
631     ->UseRealTime();
632   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x12_acc3,
633                     xnn_f32_rmax_ukernel__sse,
634                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x12_acc3,
635                     xnn_init_f32_expminus_sse2_rr2_p5_params)
636     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
637     ->UseRealTime();
638   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x16,
639                     xnn_f32_rmax_ukernel__sse,
640                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x16,
641                     xnn_init_f32_expminus_sse2_rr2_p5_params)
642     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
643     ->UseRealTime();
644   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x16_acc2,
645                     xnn_f32_rmax_ukernel__sse,
646                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x16_acc2,
647                     xnn_init_f32_expminus_sse2_rr2_p5_params)
648     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
649     ->UseRealTime();
650   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x16_acc4,
651                     xnn_f32_rmax_ukernel__sse,
652                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x16_acc4,
653                     xnn_init_f32_expminus_sse2_rr2_p5_params)
654     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
655     ->UseRealTime();
656   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x20,
657                     xnn_f32_rmax_ukernel__sse,
658                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20,
659                     xnn_init_f32_expminus_sse2_rr2_p5_params)
660     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
661     ->UseRealTime();
662   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x20_acc2,
663                     xnn_f32_rmax_ukernel__sse,
664                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
665                     xnn_init_f32_expminus_sse2_rr2_p5_params)
666     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
667     ->UseRealTime();
668   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, sse2_rr2_p5_x20_acc5,
669                     xnn_f32_rmax_ukernel__sse,
670                     xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc5,
671                     xnn_init_f32_expminus_sse2_rr2_p5_params)
672     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
673     ->UseRealTime();
674 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
675 
// WebAssembly SIMD section: compiled only when XNN_ARCH_WASMSIMD or
// XNN_ARCH_WASMRELAXEDSIMD is set. Registers the wasmsimd_rr2_p5 kernels
// (x4, x8, x12, x16, x20 and their _accN variants) as cases of the
// f32_raddstoreexpminusmax benchmark. Every case uses
// xnn_f32_rmax_ukernel__wasmsimd_arm as the rmax kernel and
// xnn_init_f32_expminus_wasmsimd_rr2_p5_params as init_params; no isa_check is
// passed, so the nullptr default applies and no run-time ISA gating is done.
// NOTE(review): only the "_arm" flavour of the wasmsimd rmax kernel is used
// here -- presumably a deliberate single choice for the max pre-pass; confirm.
676 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
677   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x4,
678                     xnn_f32_rmax_ukernel__wasmsimd_arm,
679                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x4,
680                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
681     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
682     ->UseRealTime();
683   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x8,
684                     xnn_f32_rmax_ukernel__wasmsimd_arm,
685                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x8,
686                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
687     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
688     ->UseRealTime();
689   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x8_acc2,
690                     xnn_f32_rmax_ukernel__wasmsimd_arm,
691                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x8_acc2,
692                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
693     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
694     ->UseRealTime();
695   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x12,
696                     xnn_f32_rmax_ukernel__wasmsimd_arm,
697                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x12,
698                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
699     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
700     ->UseRealTime();
701   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x12_acc2,
702                     xnn_f32_rmax_ukernel__wasmsimd_arm,
703                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x12_acc2,
704                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
705     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
706     ->UseRealTime();
707   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x12_acc3,
708                     xnn_f32_rmax_ukernel__wasmsimd_arm,
709                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x12_acc3,
710                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
711     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
712     ->UseRealTime();
713   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x16,
714                     xnn_f32_rmax_ukernel__wasmsimd_arm,
715                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16,
716                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
717     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
718     ->UseRealTime();
719   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x16_acc2,
720                     xnn_f32_rmax_ukernel__wasmsimd_arm,
721                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
722                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
723     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
724     ->UseRealTime();
725   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x16_acc4,
726                     xnn_f32_rmax_ukernel__wasmsimd_arm,
727                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc4,
728                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
729     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
730     ->UseRealTime();
731   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x20,
732                     xnn_f32_rmax_ukernel__wasmsimd_arm,
733                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x20,
734                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
735     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
736     ->UseRealTime();
737   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x20_acc2,
738                     xnn_f32_rmax_ukernel__wasmsimd_arm,
739                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x20_acc2,
740                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
741     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
742     ->UseRealTime();
743   BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, wasmsimd_rr2_p5_x20_acc5,
744                     xnn_f32_rmax_ukernel__wasmsimd_arm,
745                     xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x20_acc5,
746                     xnn_init_f32_expminus_wasmsimd_rr2_p5_params)
747     ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
748     ->UseRealTime();
749 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
750 
// Registers the scalar_rr2_lut64_p2 kernels (x1, x2, x4 and their _accN
// variants) as cases of the f32_raddstoreexpminusmax benchmark. These
// registrations sit outside any architecture #if, so they are compiled for
// every target. Every case uses xnn_f32_rmax_ukernel__scalar as the rmax
// kernel and xnn_init_f32_expminus_scalar_rr2_lut64_p2_params as init_params;
// no isa_check is passed, so the nullptr default applies.
// NOTE(review): "lut64_p2" presumably names a 64-entry lookup table combined
// with a degree-2 polynomial -- confirm against the kernel sources.
751 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x1,
752                   xnn_f32_rmax_ukernel__scalar,
753                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x1,
754                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
755   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
756   ->UseRealTime();
757 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x2,
758                   xnn_f32_rmax_ukernel__scalar,
759                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x2,
760                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
761   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
762   ->UseRealTime();
763 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x2_acc2,
764                   xnn_f32_rmax_ukernel__scalar,
765                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x2_acc2,
766                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
767   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
768   ->UseRealTime();
769 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x4,
770                   xnn_f32_rmax_ukernel__scalar,
771                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x4,
772                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
773   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
774   ->UseRealTime();
775 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x4_acc2,
776                   xnn_f32_rmax_ukernel__scalar,
777                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x4_acc2,
778                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
779   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
780   ->UseRealTime();
781 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_lut64_p2_x4_acc4,
782                   xnn_f32_rmax_ukernel__scalar,
783                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_lut64_p2_x4_acc4,
784                   xnn_init_f32_expminus_scalar_rr2_lut64_p2_params)
785   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
786   ->UseRealTime();
787 
// Registers the scalar_rr2_p5 kernels (x1, x2, x4 and their _accN variants) as
// cases of the f32_raddstoreexpminusmax benchmark. Like the other scalar
// registrations they are not guarded by an architecture #if. Every case uses
// xnn_f32_rmax_ukernel__scalar as the rmax kernel and
// xnn_init_f32_expminus_scalar_rr2_p5_params as init_params; no isa_check is
// passed, so the nullptr default applies.
// NOTE(review): "p5" presumably names a degree-5 polynomial approximation --
// confirm against the kernel sources.
788 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x1,
789                   xnn_f32_rmax_ukernel__scalar,
790                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x1,
791                   xnn_init_f32_expminus_scalar_rr2_p5_params)
792   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
793   ->UseRealTime();
794 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x2,
795                   xnn_f32_rmax_ukernel__scalar,
796                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x2,
797                   xnn_init_f32_expminus_scalar_rr2_p5_params)
798   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
799   ->UseRealTime();
800 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x2_acc2,
801                   xnn_f32_rmax_ukernel__scalar,
802                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x2_acc2,
803                   xnn_init_f32_expminus_scalar_rr2_p5_params)
804   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
805   ->UseRealTime();
806 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x4,
807                   xnn_f32_rmax_ukernel__scalar,
808                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4,
809                   xnn_init_f32_expminus_scalar_rr2_p5_params)
810   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
811   ->UseRealTime();
812 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x4_acc2,
813                   xnn_f32_rmax_ukernel__scalar,
814                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
815                   xnn_init_f32_expminus_scalar_rr2_p5_params)
816   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
817   ->UseRealTime();
818 BENCHMARK_CAPTURE(f32_raddstoreexpminusmax, scalar_rr2_p5_x4_acc4,
819                   xnn_f32_rmax_ukernel__scalar,
820                   xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc4,
821                   xnn_init_f32_expminus_scalar_rr2_p5_params)
822   ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
823   ->UseRealTime();
824 
// Expand Google Benchmark's BENCHMARK_MAIN() (which defines main()) unless the
// build defines XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): presumably that macro is set when this file is linked into a
// binary that already provides its own main() -- confirm in the build files.
825 #ifndef XNNPACK_BENCHMARK_NO_MAIN
826 BENCHMARK_MAIN();
827 #endif
828