// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/conv.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/igemm.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>


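// Benchmarks an f32 IGEMM (indirect GEMM) minmax microkernel on the convolution shape described
// by the benchmark arguments: weights are packed, an indirection buffer is built for the input,
// and the microkernel is then timed over the whole output.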
static void f32_igemm(benchmark::State& state,
  xnn_f32_igemm_minmax_ukernel_function igemm,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(float));

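  // Size one copy of the packed weights, indirection buffer, and output, then allocate enough
  // copies to exceed the last-level cache; rotating through them between iterations keeps the
  // measurement from running against an artificially warm cache.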
  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

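  // The indirection buffer stores one input pointer per (output pixel, kernel tap) pair, pointing
  // either into the input tensor or at the zero buffer where the tap falls into padding. A stub
  // xnn_operator is populated just enough for xnn_indirection_init_conv2d() to fill it in.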
  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
  convolution_op.input                = a.data();
  convolution_op.input_pixel_stride   = input_pixel_stride;
  convolution_op.zero_buffer          = z.data();
  convolution_op.groups               = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size           = 1;
  convolution_op.input_height         = input_height;
  convolution_op.input_width          = input_width;
  convolution_op.output_height        = output_height;
  convolution_op.output_width         = output_width;
  convolution_op.kernel_height        = kernel_height;
  convolution_op.kernel_width         = kernel_width;
  convolution_op.stride_height        = subsampling;
  convolution_op.stride_width         = subsampling;
  convolution_op.dilation_height      = dilation;
  convolution_op.dilation_width       = dilation;
  convolution_op.padding_top          = padding_top;
  convolution_op.padding_left         = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

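  // Timed loop: input prefetch and buffer rotation happen outside the measured region; each
  // iteration tiles the output pixels in blocks of up to mr rows per microkernel call.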
  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

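  // FLOPS counts two operations (multiply and add) for every MAC in the convolution:
  // output pixels x output channels x input channels x kernel taps.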
150   state.counters["FLOPS"] = benchmark::Counter(
151     uint64_t(state.iterations()) * 2 *
152       output_height * output_width *
153       group_input_channels * group_output_channels *
154       kernel_height * kernel_width,
155     benchmark::Counter::kIsRate);
156 }
157 
158 #if XNN_PLATFORM_JIT
f32_igemm(benchmark::State & state,xnn_jit_igemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_f32_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)159   static void f32_igemm(benchmark::State& state,
160     xnn_jit_igemm_code_generator_function generator,
161     size_t mr, size_t nr, size_t kr, size_t sr,
162     xnn_init_f32_minmax_params_fn init_params,
163     benchmark::utils::IsaCheckFunction isa_check = nullptr)
164 {
165   if (isa_check && !isa_check(state)) {
166     return;
167   }
168 
169   const size_t input_height = state.range(0);
170   const size_t input_width = state.range(1);
171   const size_t kernel_height = state.range(2);
172   const size_t kernel_width = state.range(3);
173   const size_t kernel_size = kernel_height * kernel_width;
174   const size_t padding_height = state.range(4);
175   const size_t padding_width = state.range(5);
176   const size_t subsampling = state.range(6);
177   const size_t dilation = state.range(7);
178   const size_t group_input_channels = state.range(8);
179   const size_t group_output_channels = state.range(9);
180 
181   std::random_device random_device;
182   auto rng = std::mt19937(random_device());
183   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
184 
185   const size_t output_pixel_stride = group_output_channels;
186   const size_t input_pixel_stride = group_input_channels;
187   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
188   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
189   const size_t padding_left = padding_width / 2;
190   const size_t padding_top = padding_height / 2;
191   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
192   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
193   const size_t output_size = output_height * output_width;
194 
195   const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
196   const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
197   const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);
198 
199   std::vector<float> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
200   std::generate(a.begin(), a.end(), std::ref(f32rng));
201   std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
202   std::generate(k.begin(), k.end(), std::ref(f32rng));
203   std::vector<float> b(group_output_channels);
204   std::generate(b.begin(), b.end(), std::ref(f32rng));
205 
206   std::vector<float> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(float));
207 
208   const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
209   const size_t i_elements = mc_stride * kernel_size;
210   const size_t c_elements = output_height * output_width * output_pixel_stride;
211   const size_t num_buffers = 1 +
212     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
213       sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);
214 
215   std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
216   std::fill(w.begin(), w.end(), 0.0f);
217   xnn_pack_f32_conv_goki_w(
218     1 /* groups */, group_output_channels, kernel_size, group_input_channels,
219     nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
220   for (size_t n = 1; n < num_buffers; n++) {
221     std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
222   }
223 
224   std::vector<const float*> i(i_elements * num_buffers);
225   xnn_operator convolution_op = { };
226   convolution_op.indirection_buffer   = reinterpret_cast<const void**>(i.data());
227   convolution_op.input                = a.data();
228   convolution_op.input_pixel_stride   = input_pixel_stride;
229   convolution_op.zero_buffer          = z.data();
230   convolution_op.groups               = 1;
231   convolution_op.group_input_channels = group_input_channels;
232   convolution_op.batch_size           = 1;
233   convolution_op.input_height         = input_height;
234   convolution_op.input_width          = input_width;
235   convolution_op.output_height        = output_height;
236   convolution_op.output_width         = output_width;
237   convolution_op.kernel_height        = kernel_height;
238   convolution_op.kernel_width         = kernel_width;
239   convolution_op.stride_height        = subsampling;
240   convolution_op.stride_width         = subsampling;
241   convolution_op.dilation_height      = dilation;
242   convolution_op.dilation_width       = dilation;
243   convolution_op.padding_top          = padding_top;
244   convolution_op.padding_left         = padding_left;
245   xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
246   for (size_t n = 1; n < num_buffers; n++) {
247     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
248   }
249 
250   std::vector<float> c(c_elements * num_buffers);
251   std::fill(c.begin(), c.end(), std::nanf(""));
252 
253   xnn_f32_minmax_params params;
254   init_params(&params,
255     -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
256 
257   jit_gemm_params jit_params = {
258     .f32_minmax = {
259       .min = -std::numeric_limits<float>::infinity(),
260       .max = +std::numeric_limits<float>::infinity()
261     }
262   };
263 
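  // JIT path: allocate an executable code buffer, let the generator emit a kernel specialized for
  // this tile shape, finalize the buffer, and invoke the result through the same
  // xnn_f32_igemm_minmax_ukernel_function signature as the prebuilt microkernels.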
  xnn_initialize(/*allocator=*/nullptr);
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  generator(&code_buffer,
            mr,
            group_output_channels % nr,
            group_input_channels * sizeof(float),
            kernel_size * mr * sizeof(void *),
            &jit_params);
  xnn_finalize_code_memory(&code_buffer);
  auto igemm = reinterpret_cast<xnn_f32_igemm_minmax_ukernel_function>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

}
#endif  // XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  static void jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_ld128)

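  // Exercise the single "upto 6x8" code generator at every tile height from 1 to 6 rows, so its
  // generated kernels can be compared against the fixed-mr variants above.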
#define BENCHMARK_UPTO_MR_IGEMM(name, max_mr, nr)                                \
  static void name(benchmark::State &state, const char *net) {                   \
    f32_igemm(                                                                   \
        state,                                                                   \
        xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75, \
        max_mr, nr, 1, 1, xnn_init_f32_minmax_scalar_params);                    \
  }                                                                              \
  BENCHMARK_CONV(name)
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8);
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75, 2, 8);
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75, 3, 8);
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8);
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8);
  BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8);
#undef BENCHMARK_UPTO_MR_IGEMM

#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  static void jit_f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void jit_f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64, 6, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_igemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x2__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x2__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64, 6, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8__neon_dup_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x2__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld128)
  BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
  BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
  BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld128)

  BENCHMARK_CONV(f32_igemm_1x8s4__neon)
  BENCHMARK_CONV(f32_igemm_4x8s4__neon)
  BENCHMARK_CONV(f32_igemm_6x8s4__neon)
  BENCHMARK_CONV(f32_igemm_8x8s4__neon)
  BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_4x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
  BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
739 
740 
741 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
742   static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
743     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
744       xnn_init_f32_minmax_sse_params);
745   }
f32_igemm_3x8__sse_load1(benchmark::State & state,const char * net)746   static void f32_igemm_3x8__sse_load1(benchmark::State& state, const char* net) {
747     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
748       xnn_init_f32_minmax_sse_params);
749   }
f32_igemm_4x8__sse_load1(benchmark::State & state,const char * net)750   static void f32_igemm_4x8__sse_load1(benchmark::State& state, const char* net) {
751     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
752       xnn_init_f32_minmax_sse_params);
753   }
f32_igemm_5x8__sse_load1(benchmark::State & state,const char * net)754   static void f32_igemm_5x8__sse_load1(benchmark::State& state, const char* net) {
755     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
756       xnn_init_f32_minmax_sse_params);
757   }
758 
f32_igemm_1x8__sse_dup(benchmark::State & state,const char * net)759   static void f32_igemm_1x8__sse_dup(benchmark::State& state, const char* net) {
760     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
761       xnn_init_f32_minmax_sse_params);
762   }
f32_igemm_3x8__sse_dup(benchmark::State & state,const char * net)763   static void f32_igemm_3x8__sse_dup(benchmark::State& state, const char* net) {
764     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
765       xnn_init_f32_minmax_sse_params);
766   }
f32_igemm_4x8__sse_dup(benchmark::State & state,const char * net)767   static void f32_igemm_4x8__sse_dup(benchmark::State& state, const char* net) {
768     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
769       xnn_init_f32_minmax_sse_params);
770   }
f32_igemm_5x8__sse_dup(benchmark::State & state,const char * net)771   static void f32_igemm_5x8__sse_dup(benchmark::State& state, const char* net) {
772     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
773       xnn_init_f32_minmax_sse_params);
774   }
775 
f32_igemm_1x8s4__sse(benchmark::State & state,const char * net)776   static void f32_igemm_1x8s4__sse(benchmark::State& state, const char* net) {
777     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
778       xnn_init_f32_minmax_sse_params);
779   }
f32_igemm_3x8s4__sse(benchmark::State & state,const char * net)780   static void f32_igemm_3x8s4__sse(benchmark::State& state, const char* net) {
781     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
782       xnn_init_f32_minmax_sse_params);
783   }
f32_igemm_4x8s4__sse(benchmark::State & state,const char * net)784   static void f32_igemm_4x8s4__sse(benchmark::State& state, const char* net) {
785     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
786       xnn_init_f32_minmax_sse_params);
787   }
f32_igemm_5x8s4__sse(benchmark::State & state,const char * net)788   static void f32_igemm_5x8s4__sse(benchmark::State& state, const char* net) {
789     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
790       xnn_init_f32_minmax_sse_params);
791   }
792 
f32_igemm_1x8__sse2_dup(benchmark::State & state,const char * net)793   static void f32_igemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
794     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
795       xnn_init_f32_minmax_sse_params);
796   }
f32_igemm_3x8__sse2_dup(benchmark::State & state,const char * net)797   static void f32_igemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
798     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
799       xnn_init_f32_minmax_sse_params);
800   }
f32_igemm_4x8__sse2_dup(benchmark::State & state,const char * net)801   static void f32_igemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
802     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
803       xnn_init_f32_minmax_sse_params);
804   }
f32_igemm_5x8__sse2_dup(benchmark::State & state,const char * net)805   static void f32_igemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
806     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
807       xnn_init_f32_minmax_sse_params);
808   }
809 
f32_igemm_1x8__avx_broadcast(benchmark::State & state,const char * net)810   static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
811     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
812       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
813   }
f32_igemm_4x8__avx_broadcast(benchmark::State & state,const char * net)814   static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
815     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
816       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
817   }
f32_igemm_5x8__avx_broadcast(benchmark::State & state,const char * net)818   static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
819     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
820       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
821   }
f32_igemm_6x8__avx_broadcast(benchmark::State & state,const char * net)822   static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
823     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
824       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
825   }
f32_igemm_7x8__avx_broadcast(benchmark::State & state,const char * net)826   static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
827     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
828       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
829   }
830 
f32_igemm_1x8__fma3_broadcast(benchmark::State & state,const char * net)831   static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
832     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
833       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
834   }
f32_igemm_4x8__fma3_broadcast(benchmark::State & state,const char * net)835   static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
836     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
837       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
838   }
f32_igemm_5x8__fma3_broadcast(benchmark::State & state,const char * net)839   static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
840     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
841       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
842   }
f32_igemm_6x8__fma3_broadcast(benchmark::State & state,const char * net)843   static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
844     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
845       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
846   }
f32_igemm_7x8__fma3_broadcast(benchmark::State & state,const char * net)847   static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
848     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
849       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
850   }
f32_igemm_8x8__fma3_broadcast(benchmark::State & state,const char * net)851   static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
852     f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
853       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
854   }
855 
  static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }

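  // Register every x86/x86-64 wrapper above with the convolution shapes
  // supplied by BENCHMARK_CONV (see bench/conv.h).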
  BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
  BENCHMARK_CONV(f32_igemm_3x8__sse_load1)
  BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
  BENCHMARK_CONV(f32_igemm_5x8__sse_load1)

  BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
  BENCHMARK_CONV(f32_igemm_3x8__sse_dup)
  BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
  BENCHMARK_CONV(f32_igemm_5x8__sse_dup)

  BENCHMARK_CONV(f32_igemm_1x8s4__sse)
  BENCHMARK_CONV(f32_igemm_3x8s4__sse)
  BENCHMARK_CONV(f32_igemm_4x8s4__sse)
  BENCHMARK_CONV(f32_igemm_5x8s4__sse)

  BENCHMARK_CONV(f32_igemm_1x8__sse2_dup)
  BENCHMARK_CONV(f32_igemm_3x8__sse2_dup)
  BENCHMARK_CONV(f32_igemm_4x8__sse2_dup)
  BENCHMARK_CONV(f32_igemm_5x8__sse2_dup)

  BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)
  BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
  BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
  BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
  BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)

  BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)
  BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
  BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
  BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
  BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
  BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)

  BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast)
  BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast)
  BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast)
  BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast)
  BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast)
  BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
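  // WAsm SIMD microkernels. The arm/x86 suffixes are tuning variants built for
  // the same WAsm SIMD target (not build-time restrictions); the s4 variants
  // use a shuffled weight layout and are benchmarked with sr == 4.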
  static void f32_igemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_loadsplat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_loadsplat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_splat)
  BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_splat)
  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
  BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
  BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


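// Portable scalar microkernels; these are compiled unconditionally and serve
// as the baseline for the SIMD variants above.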
static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_2x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x4__scalar)
BENCHMARK_CONV(f32_igemm_2x4__scalar)
BENCHMARK_CONV(f32_igemm_4x4__scalar)

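// Skip emitting main() when XNNPACK_BENCHMARK_NO_MAIN is defined, e.g. when
// this file is linked into a combined benchmark binary that supplies its own.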
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif