// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/ppmm.h>

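// Each GEMMBenchmark run exercises one f32 GEMM microkernel in isolation: the
// kernel computes C (mc x nc) = A (mc x kc) * K^T + bias, with the weights K
// (nc x kc) packed up front into the layout the kernel expects. mr and nr are
// the kernel's row and column tile sizes; kr and sr describe how the reduction
// dimension is packed. Kernels are registered below via BENCHMARK_GEMM and can
// be selected with Google Benchmark's standard filter flag, e.g. (the binary
// name depends on the build setup):
//   ./f32-gemm-bench --benchmark_filter=f32_gemm_4x8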
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_minmax_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

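  // Packed weights hold nc_stride x kc_stride kernel elements plus nc_stride
  // bias values per copy. num_buffers is sized so that the set of W and C
  // copies exceeds the last-level cache, forcing each iteration onto a cold
  // copy.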
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

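  // The inner loop tiles M in steps of mr; each microkernel call produces an
  // mb x nc slab of C. The W offset buffer_index * nc_stride * (kc_stride + 1)
  // equals buffer_index * w_elements, i.e. the start of this iteration's copy.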
  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

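// The FLOPS counter above (and in the harnesses below) credits 2*mc*nc*kc
// floating-point operations per iteration: one multiply and one add per
// multiply-accumulate.
//
// The PPMM harnesses benchmark pre-packed GEMM: an x32 packx microkernel
// first gathers an mr-row panel of A into a contiguous interleaved buffer t,
// and the ppmm microkernel then multiplies that panel by the packed weights.
// The "1P" (unipass) variant packs each panel immediately before using it.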
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mr * kc);

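  // t holds a single packed mr x kc panel of A; the unipass harness reuses it
  // for every row tile, so only one panel is live at a time.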
  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

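// The "2P" (twopass) variant packs all of A into an mc_stride x kc buffer in
// a first pass, then runs every ppmm tile in a second pass, so the timed
// region covers one full packing sweep plus the multiplies.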
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

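// Ruy serves as an external baseline. XNNPACK's row-major C (mc x nc) is
// produced here as ruy_c (nc x mc, column-major), which has the same memory
// layout: ruy computes C^T = K * A^T, with K as the nc x kc row-major
// left-hand side and A reinterpreted as a kc x mc column-major right-hand
// side.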
#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());

  ruy::MulParams<float, float> mul_params;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY

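// On platforms with a JIT, an overload of GEMMBenchmark takes a code
// generator instead of a precompiled microkernel: it emits the kernel into an
// executable buffer at run time and then measures it exactly like the static
// variant above.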
#if XNN_PLATFORM_JIT
static void GEMMBenchmark(benchmark::State& state,
  xnn_jit_gemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

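  // Runtime code generation: initialize XNNPACK, allocate a code buffer, emit
  // a kernel specialized for this mr, nc % nr remainder, and kc, make the
  // buffer executable, and cast its entry point to the ukernel type.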
  xnn_initialize(/*allocator=*/nullptr);
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };
  generator(&code_buffer, mr, nc % nr, kc * sizeof(float), &jit_params);
  xnn_finalize_code_memory(&code_buffer);
  xnn_f32_gemm_minmax_ukernel_function gemm = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
#endif  // XNN_PLATFORM_JIT

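// The remainder of the file registers per-kernel benchmark wrappers. Names
// follow the XNNPACK convention MRxNR[sSR]__arch_variant: e.g.
// f32_gemm_6x8__aarch64_neonfma_cortex_a53 is a 6x8 tile implemented in
// AArch64 NEON-FMA assembly and tuned for Cortex-A53.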
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64, 6, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }
  static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x2__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x2__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP);
  }

  static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a53)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, 6, 2, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }
  static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
  }

  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x2__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x2__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neon)

  BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
  BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
  static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a53)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a55)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_ld64)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a7)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
  static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  static void jit_f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net)
  {
    GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
  BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_ld128)

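// The upto6x8 generator emits a kernel for any max_mr from 1 to 6 from a
// single template; this macro instantiates and registers one benchmark per
// max_mr so the generated variants can be compared with the fixed-mr kernels
// above.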
#define BENCHMARK_UPTO_MR_GEMM(name, max_mr, nr)                                \
  static void name(benchmark::State &state, const char *net) {                  \
    GEMMBenchmark(                                                              \
        state,                                                                  \
        xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75, \
        max_mr, nr, 1, 1, xnn_init_f32_minmax_scalar_params,                    \
        benchmark::utils::CheckNEON);                                           \
  }                                                                             \
  BENCHMARK_GEMM(name)
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8);
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75, 2, 8);
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75, 3, 8);
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8);
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8);
  BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8);
#undef BENCHMARK_UPTO_MR_GEMM

#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }
  static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
      xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
  }

  static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,const char * net)946   static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
947     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 4, 16, 1, 1,
948       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
949   }
f32_gemm_4x16__fma3_broadcast(benchmark::State & state,const char * net)950   static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
951     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1,
952       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
953   }
f32_gemm_5x16__fma3_broadcast(benchmark::State & state,const char * net)954   static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
955     GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1,
956       xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
957   }
958 
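  // The s4 suffix denotes the shuffled kernels (sr = 4): weights are packed
  // in a shuffled layout so that one vector load of A can be rotated and
  // reused across four multiply-adds instead of reloading or broadcasting.
  // That layout is selected by the final tiling argument, hence 4 below
  // rather than 1.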
  static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }
  static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
  }

  static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 3, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }
  static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1,
      xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
  }

  static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
      xnn_init_f32_minmax_sse_params);
  }

  static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
      xnn_init_f32_minmax_sse_params);
  }

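  // The PPMM benchmarks measure pre-packed GEMM: an x32-packx microkernel
  // first repacks the A panel, then the ppmm kernel consumes the packed
  // panel. PPMM1PBenchmark ("unipass") and PPMM2PBenchmark ("twopass") are
  // defined earlier in this file; the intent is that the unipass variant
  // interleaves packing with computation per panel, while the twopass
  // variant packs all of A before any arithmetic runs.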
  static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
      xnn_init_f32_minmax_sse_params);
  }
  static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
      xnn_init_f32_minmax_sse_params);
  }

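  // BENCHMARK_GEMM (defined in bench/gemm.h) registers each wrapper above
  // against a set of representative per-network GEMM shapes, roughly along
  // the lines of the standard Google Benchmark registration below (the
  // network name and argument generator here are illustrative, not the
  // macro's exact expansion):
  //   BENCHMARK_CAPTURE(f32_gemm_4x8__fma3_broadcast, mobilenet_v1, "MobileNet v1")
  //       ->Apply(MobileNetV1GemmArguments)->UseRealTime();
  // The net parameter on each wrapper receives that model name so results
  // can be grouped per network.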
  BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
  BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

  BENCHMARK_GEMM(f32_gemm_1x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse2_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse2_dup)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_load1)

  BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_3x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
  BENCHMARK_GEMM(f32_gemm_5x8__sse_dup)

  BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_3x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
  BENCHMARK_GEMM(f32_gemm_5x8s4__sse)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


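// WAsm Relaxed SIMD kernels come in four flavors: loadsplat (broadcast one
// element of A with a scalar load), splat (load a full vector of A and splat
// each lane in turn), and _fma versions of both, which are assumed to map to
// the relaxed fused multiply-add instruction rather than separate multiply
// and add.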
#if XNN_ARCH_WASMRELAXEDSIMD
  static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 3, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 5, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 6, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 3, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 5, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 6, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }

  BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_loadsplat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_splat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_fma_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_fma_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_fma_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_fma_splat)

  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmrelaxedsimd)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmrelaxedsimd)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmrelaxedsimd)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmrelaxedsimd)

  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmrelaxedsimd_fma)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmrelaxedsimd_fma)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmrelaxedsimd_fma)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmrelaxedsimd_fma)
#endif  // XNN_ARCH_WASMRELAXEDSIMD


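// The _arm/_x86 suffixes below do not restrict where a kernel may run; every
// wasmsimd kernel is portable. They differ in how the min/max clamp is
// lowered: the _arm variants are assumed to use the native f32x4.min/max
// instructions, which are fast on ARM engines, while the _x86 variants use a
// compare-and-select sequence that avoids the slower min/max lowering of
// WAsm engines on x86.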
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
      xnn_init_f32_minmax_wasmsimd_params);
  }

  static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
      xnn_init_f32_minmax_wasmsimd_params);
  }

  static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
      xnn_init_f32_minmax_wasmsimd_params);
  }
  static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
    PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
      xnn_init_f32_minmax_wasmsimd_params);
  }

  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_loadsplat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_loadsplat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_loadsplat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_splat)

  BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_splat)
  BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_splat)

  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)

  BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
  BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)

  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_x86_splat)

  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_arm_splat)
  BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_x86_splat)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


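// Scalar kernels are the portable baseline: no SIMD, small tiles (1x4, 2x4,
// 4x4), available on every architecture. They provide a useful lower bound
// when judging the speedups of the vectorized kernels above.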
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY

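// BENCHMARK_MAIN() supplies the entry point unless the embedder defines
// XNNPACK_BENCHMARK_NO_MAIN to link this translation unit into a larger
// suite. A subset of kernels can then be run with the standard Google
// Benchmark filter flag, e.g. (the binary name here is hypothetical):
//   ./f32-gemm-bench --benchmark_filter='f32_gemm_.*__fma3_broadcast'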
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif