xref: /aosp_15_r20/external/XNNPACK/bench/f32-gemm-e2e.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12 
13 #include <xnnpack.h>
14 
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25 
26 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_f32_gemm_minmax_ukernel_function gemm_minmax,xnn_f32_igemm_minmax_ukernel_function igemm_minmax,xnn_f32_gemm_minmax_ukernel_function gemm1_minmax,xnn_f32_igemm_minmax_ukernel_function igemm1_minmax,xnn_f32_gemm_relu_ukernel_function gemm_relu,xnn_f32_igemm_relu_ukernel_function igemm_relu,xnn_f32_gemm_relu_ukernel_function gemm1_relu,xnn_f32_igemm_relu_ukernel_function igemm1_relu,xnn_f32_gemm_ukernel_function gemm,xnn_f32_igemm_ukernel_function igemm,xnn_f32_gemm_ukernel_function gemm1,xnn_f32_igemm_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28   benchmark::State& state,
29   models::ExecutionPlanFactory model_factory,
30   xnn_f32_gemm_minmax_ukernel_function gemm_minmax,
31   xnn_f32_igemm_minmax_ukernel_function igemm_minmax,
32   xnn_f32_gemm_minmax_ukernel_function gemm1_minmax,
33   xnn_f32_igemm_minmax_ukernel_function igemm1_minmax,
34   xnn_f32_gemm_relu_ukernel_function gemm_relu,
35   xnn_f32_igemm_relu_ukernel_function igemm_relu,
36   xnn_f32_gemm_relu_ukernel_function gemm1_relu,
37   xnn_f32_igemm_relu_ukernel_function igemm1_relu,
38   xnn_f32_gemm_ukernel_function gemm,
39   xnn_f32_igemm_ukernel_function igemm,
40   xnn_f32_gemm_ukernel_function gemm1,
41   xnn_f32_igemm_ukernel_function igemm1,
42   xnn_init_f32_minmax_params_fn init_params,
43   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
44   benchmark::utils::IsaCheckFunction isa_check = nullptr)
45 {
46   if (isa_check && !isa_check(state)) {
47     return;
48   }
49   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
50     state.SkipWithError("failed to initialize XNNPACK");
51     return;
52   }
53 
54   // Override microkernels chosen in xnn_initialize
55   // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
56   std::memset(&xnn_params.f32.gemm, 0, sizeof(xnn_params.f32.gemm));
57   std::memset(&xnn_params.f32.gemm2, 0, sizeof(xnn_params.f32.gemm2));
58   xnn_params.f32.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm_minmax));
59   xnn_params.f32.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm_minmax));
60   xnn_params.f32.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1_minmax));
61   xnn_params.f32.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1_minmax));
62   xnn_params.f32.gemm.relu.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm_relu));
63   xnn_params.f32.gemm.relu.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm_relu));
64   xnn_params.f32.gemm.relu.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1_relu));
65   xnn_params.f32.gemm.relu.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1_relu));
66   xnn_params.f32.gemm.linear.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
67   xnn_params.f32.gemm.linear.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
68   xnn_params.f32.gemm.linear.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
69   xnn_params.f32.gemm.linear.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
70   xnn_params.f32.gemm.init.f32 = init_params;
71   xnn_params.f32.gemm.mr = mr;
72   xnn_params.f32.gemm.nr = nr;
73   xnn_params.f32.gemm.log2_kr = log2_kr;
74   xnn_params.f32.gemm.log2_sr = log2_sr;
75 
76   #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
77     // If JIT is enabled, we want to make sure that we are still benchmarking
78     // non-JIT microkernels, so nullify the pointers to generators.
79     xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(nullptr);
80     xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(nullptr);
81     xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(nullptr);
82     xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(nullptr);
83   #endif  // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
84 
85   auto execution_plan = model_factory(nullptr);
86   if (execution_plan.empty()) {
87     state.SkipWithError("failed to create a model");
88     return;
89   }
90 
91   for (auto _ : state) {
92     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
93       xnn_status status = xnn_run_operator(op.get(), nullptr);
94       if (status != xnn_status_success) {
95         state.SkipWithError("failed to run a model");
96         return;
97       }
98     }
99   }
100 
101   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
102   if (cpu_frequency != 0) {
103     state.counters["cpufreq"] = cpu_frequency;
104   }
105 }
106 
107 #if XNN_PLATFORM_JIT
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_jit_gemm_code_generator_function gemm_generator,xnn_jit_gemm_code_generator_function gemm1_generator,xnn_jit_igemm_code_generator_function igemm_generator,xnn_jit_igemm_code_generator_function igemm1_generator,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)108 static void GEMMEnd2EndBenchmark(
109   benchmark::State& state,
110   models::ExecutionPlanFactory model_factory,
111   xnn_jit_gemm_code_generator_function gemm_generator,
112   xnn_jit_gemm_code_generator_function gemm1_generator,
113   xnn_jit_igemm_code_generator_function igemm_generator,
114   xnn_jit_igemm_code_generator_function igemm1_generator,
115   xnn_init_f32_minmax_params_fn init_params,
116   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
117   benchmark::utils::IsaCheckFunction isa_check = nullptr)
118 {
119   if (isa_check && !isa_check(state)) {
120     return;
121   }
122   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
123     state.SkipWithError("failed to initialize XNNPACK");
124     return;
125   }
126 
127   // Nullify the microkernels to ensure we run JIT kernels.
128   for (size_t i = 0; i < XNN_MAX_MR; i++) {
129     xnn_params.f32.gemm.minmax.gemm[i] = xnn_init_hmp_gemm_ukernel(nullptr);
130     xnn_params.f32.gemm.minmax.igemm[i] = xnn_init_hmp_igemm_ukernel(nullptr);
131   }
132   xnn_params.f32.gemm.init.f32 = init_params;
133   xnn_params.f32.gemm.mr = mr;
134   xnn_params.f32.gemm.nr = nr;
135   xnn_params.f32.gemm.log2_kr = log2_kr;
136   xnn_params.f32.gemm.log2_sr = log2_sr;
137 
138   xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(gemm_generator);
139   xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(gemm1_generator);
140   xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(igemm_generator);
141   xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(igemm1_generator);
142 
143   auto execution_plan = model_factory(nullptr);
144   if (execution_plan.empty()) {
145     state.SkipWithError("failed to create a model");
146     return;
147   }
148 
149   for (auto _ : state) {
150     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
151       xnn_status status = xnn_run_operator(op.get(), nullptr);
152       if (status != xnn_status_success) {
153         state.SkipWithError("failed to run a model");
154         return;
155       }
156     }
157   }
158 
159   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
160   if (cpu_frequency != 0) {
161     state.counters["cpufreq"] = cpu_frequency;
162   }
163 }
164 #endif  // XNN_PLATFORM_JIT
165 
166 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)167   static void f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
168     GEMMEnd2EndBenchmark(state, model,
169       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
170       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
171       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
172       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
173       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
174       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
175       xnn_init_f32_minmax_scalar_params,
176       4 /* mr */, 2 /* nr */);
177   }
f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)178   static void f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
179     GEMMEnd2EndBenchmark(state, model,
180       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
181       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
182       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
183       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
184       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
185       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
186       xnn_init_f32_minmax_scalar_params,
187       4 /* mr */, 2 /* nr */);
188   }
f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)189   static void f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
190     GEMMEnd2EndBenchmark(state, model,
191       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
192       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
193       xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
194       xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
195       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
196       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
197       xnn_init_f32_minmax_scalar_params,
198       4 /* mr */, 2 /* nr */);
199   }
f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)200   static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
201     GEMMEnd2EndBenchmark(state, model,
202       xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
203       xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
204       xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
205       xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
206       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
207       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
208       xnn_init_f32_minmax_scalar_params,
209       4 /* mr */, 12 /* nr */);
210   }
f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)211   static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
212     GEMMEnd2EndBenchmark(state, model,
213       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
214       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
215       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
216       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
217       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
218       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
219       xnn_init_f32_minmax_scalar_params,
220       4 /* mr */, 8 /* nr */);
221   }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)222   static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
223     GEMMEnd2EndBenchmark(state, model,
224       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53,
225       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53,
226       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
227       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
228       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
229       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
230       xnn_init_f32_minmax_scalar_params,
231       4 /* mr */, 8 /* nr */);
232   }
f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)233   static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
234     GEMMEnd2EndBenchmark(state, model,
235       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
236       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
237       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
238       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
239       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
240       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
241       xnn_init_f32_minmax_scalar_params,
242       4 /* mr */, 8 /* nr */);
243   }
f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)244   static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
245     GEMMEnd2EndBenchmark(state, model,
246       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
247       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
248       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
249       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
250       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
251       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
252       xnn_init_f32_minmax_scalar_params,
253       4 /* mr */, 8 /* nr */);
254   }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)255   static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
256     GEMMEnd2EndBenchmark(state, model,
257       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
258       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
259       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
260       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
261       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
262       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
263       xnn_init_f32_minmax_scalar_params,
264       4 /* mr */, 8 /* nr */);
265   }
f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)266   static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
267     GEMMEnd2EndBenchmark(state, model,
268       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
269       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
270       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
271       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
272       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
273       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
274       xnn_init_f32_minmax_scalar_params,
275       4 /* mr */, 8 /* nr */);
276   }
f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)277   static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
278     GEMMEnd2EndBenchmark(state, model,
279       xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
280       xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
281       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
282       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
283       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
284       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
285       xnn_init_f32_minmax_scalar_params,
286       4 /* mr */, 8 /* nr */);
287   }
f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)288   static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
289     GEMMEnd2EndBenchmark(state, model,
290       xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
291       xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
292       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
293       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
294       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
295       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
296       xnn_init_f32_minmax_scalar_params,
297       5 /* mr */, 8 /* nr */);
298   }
f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)299   static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
300     GEMMEnd2EndBenchmark(state, model,
301       xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
302       xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
303       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
304       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
305       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
306       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
307       xnn_init_f32_minmax_scalar_params,
308       5 /* mr */, 8 /* nr */);
309   }
f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)310   static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
311     GEMMEnd2EndBenchmark(state, model,
312       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
313       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
314       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
315       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
316       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
317       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
318       xnn_init_f32_minmax_scalar_params,
319       6 /* mr */, 8 /* nr */);
320   }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)321   static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
322     GEMMEnd2EndBenchmark(state, model,
323       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53,
324       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53,
325       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
326       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
327       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
328       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
329       xnn_init_f32_minmax_scalar_params,
330       6 /* mr */, 8 /* nr */);
331   }
f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)332   static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
333     GEMMEnd2EndBenchmark(state, model,
334       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
335       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
336       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
337       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
338       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
339       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
340       xnn_init_f32_minmax_scalar_params,
341       6 /* mr */, 8 /* nr */);
342   }
f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State & state,models::ExecutionPlanFactory model)343   static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
344     GEMMEnd2EndBenchmark(state, model,
345       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
346       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
347       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
348       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
349       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
350       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
351       xnn_init_f32_minmax_scalar_params,
352       6 /* mr */, 8 /* nr */);
353   }
f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)354   static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
355     GEMMEnd2EndBenchmark(state, model,
356       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
357       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
358       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
359       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
360       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
361       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
362       xnn_init_f32_minmax_scalar_params,
363       6 /* mr */, 8 /* nr */);
364   }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)365   static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
366     GEMMEnd2EndBenchmark(state, model,
367       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
368       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
369       xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
370       xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
371       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
372       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
373       xnn_init_f32_minmax_scalar_params,
374       6 /* mr */, 8 /* nr */);
375   }
f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)376   static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
377     GEMMEnd2EndBenchmark(state, model,
378       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
379       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
380       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
381       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
382       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
383       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
384       xnn_init_f32_minmax_scalar_params,
385       6 /* mr */, 8 /* nr */);
386   }
f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)387   static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
388     GEMMEnd2EndBenchmark(state, model,
389       xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
390       xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
391       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
392       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
393       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
394       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
395       xnn_init_f32_minmax_scalar_params,
396       6 /* mr */, 8 /* nr */);
397   }
f32_gemm_4x2__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)398   static void f32_gemm_4x2__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
399     GEMMEnd2EndBenchmark(state, model,
400       xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64,
401       xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64,
402       xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64,
403       xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64,
404       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
405       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
406       xnn_init_f32_minmax_scalar_params,
407       4 /* mr */, 2 /* nr */);
408   }
f32_gemm_6x2__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)409   static void f32_gemm_6x2__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
410     GEMMEnd2EndBenchmark(state, model,
411       xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64,
412       xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64,
413       xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64,
414       xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64,
415       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
416       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
417       xnn_init_f32_minmax_scalar_params,
418       6 /* mr */, 2 /* nr */);
419   }
f32_gemm_4x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)420   static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
421     GEMMEnd2EndBenchmark(state, model,
422       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64,
423       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64,
424       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
425       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
426       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
427       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
428       xnn_init_f32_minmax_scalar_params,
429       4 /* mr */, 8 /* nr */);
430   }
f32_gemm_4x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)431   static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
432     GEMMEnd2EndBenchmark(state, model,
433       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128,
434       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128,
435       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
436       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
437       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
438       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
439       xnn_init_f32_minmax_scalar_params,
440       4 /* mr */, 8 /* nr */);
441   }
f32_gemm_6x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)442   static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
443     GEMMEnd2EndBenchmark(state, model,
444       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64,
445       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64,
446       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
447       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
448       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
449       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
450       xnn_init_f32_minmax_scalar_params,
451       6 /* mr */, 8 /* nr */);
452   }
f32_gemm_6x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)453   static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
454     GEMMEnd2EndBenchmark(state, model,
455       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128,
456       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128,
457       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
458       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
459       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
460       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
461       xnn_init_f32_minmax_scalar_params,
462       6 /* mr */, 8 /* nr */);
463   }
464 
465   BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_cortex_a75)
466   BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75)
467   BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_ld64)
468   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
469   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
470   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
471   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
472   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
473   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53)
474   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
475   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
476   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
477   BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
478   BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75);
479   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
480   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53);
481   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a55);
482   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
483   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
484   BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
485   BENCHMARK_FP32_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53);
486 
487   BENCHMARK_FP32_END2END(f32_gemm_4x2__neonfma_lane_ld64);
488   BENCHMARK_FP32_END2END(f32_gemm_6x2__neonfma_lane_ld64);
489 
490   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld64);
491   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld128);
492 
493   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld64);
494   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld128);
495 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
496 
497 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)498   static void jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(
499       benchmark::State &state, models::ExecutionPlanFactory model) {
500     GEMMEnd2EndBenchmark(
501         state, model,
502         xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
503         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
504         xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
505         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
506         xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
507   }
jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)508   static void jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(
509       benchmark::State &state, models::ExecutionPlanFactory model) {
510     GEMMEnd2EndBenchmark(
511         state, model,
512         xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
513         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
514         xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
515         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
516         xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
517   }
jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)518   static void jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75(
519       benchmark::State &state, models::ExecutionPlanFactory model) {
520     GEMMEnd2EndBenchmark(
521         state, model,
522         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
523         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
524         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
525         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
526         xnn_init_f32_minmax_scalar_params, 1 /* mr */, 8 /* nr */);
527   }
jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)528   static void jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75(
529       benchmark::State &state, models::ExecutionPlanFactory model) {
530     GEMMEnd2EndBenchmark(
531         state, model,
532         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
533         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
534         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
535         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
536         xnn_init_f32_minmax_scalar_params, 2 /* mr */, 8 /* nr */);
537   }
jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)538   static void jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75(
539       benchmark::State &state, models::ExecutionPlanFactory model) {
540     GEMMEnd2EndBenchmark(
541         state, model,
542         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
543         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
544         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
545         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
546         xnn_init_f32_minmax_scalar_params, 3 /* mr */, 8 /* nr */);
547   }
jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)548   static void jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75(
549       benchmark::State &state, models::ExecutionPlanFactory model) {
550     GEMMEnd2EndBenchmark(
551         state, model,
552         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
553         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
554         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
555         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
556         xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
557   }
jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)558   static void jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75(
559       benchmark::State &state, models::ExecutionPlanFactory model) {
560     GEMMEnd2EndBenchmark(
561         state, model,
562         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
563         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
564         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
565         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
566         xnn_init_f32_minmax_scalar_params, 5 /* mr */, 8 /* nr */);
567   }
jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)568   static void jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75(
569       benchmark::State &state, models::ExecutionPlanFactory model) {
570     GEMMEnd2EndBenchmark(
571         state, model,
572         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
573         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
574         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
575         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
576         xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
577   }
jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)578   static void jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75(
579       benchmark::State &state, models::ExecutionPlanFactory model) {
580     GEMMEnd2EndBenchmark(
581         state, model,
582         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
583         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
584         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
585         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
586         xnn_init_f32_minmax_scalar_params, 1 /* mr */, 8 /* nr */);
587   }
jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)588   static void jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75(
589       benchmark::State &state, models::ExecutionPlanFactory model) {
590     GEMMEnd2EndBenchmark(
591         state, model,
592         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
593         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
594         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
595         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
596         xnn_init_f32_minmax_scalar_params, 2 /* mr */, 8 /* nr */);
597   }
jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)598   static void jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75(
599       benchmark::State &state, models::ExecutionPlanFactory model) {
600     GEMMEnd2EndBenchmark(
601         state, model,
602         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
603         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
604         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
605         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
606         xnn_init_f32_minmax_scalar_params, 3 /* mr */, 8 /* nr */);
607   }
jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)608   static void jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75(
609       benchmark::State &state, models::ExecutionPlanFactory model) {
610     GEMMEnd2EndBenchmark(
611         state, model,
612         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
613         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
614         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
615         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
616         xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
617   }
jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)618   static void jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75(
619       benchmark::State &state, models::ExecutionPlanFactory model) {
620     GEMMEnd2EndBenchmark(
621         state, model,
622         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
623         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
624         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
625         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
626         xnn_init_f32_minmax_scalar_params, 5 /* mr */, 8 /* nr */);
627   }
jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)628   static void jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75(
629       benchmark::State &state, models::ExecutionPlanFactory model) {
630     GEMMEnd2EndBenchmark(
631         state, model,
632         xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
633         xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
634         xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
635         xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
636         xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
637   }
638 
639 BENCHMARK_FP32_END2END(jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75);
640 BENCHMARK_FP32_END2END(jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75);
641 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75);
642 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75);
643 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75);
644 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75);
645 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75);
646 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75);
647 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75);
648 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75);
649 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75);
650 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75);
651 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75);
652 BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75);
653 
654 #endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
655 
656 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
f32_gemm_4x8__aarch32_neon_ld64(benchmark::State & state,models::ExecutionPlanFactory model)657   static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
658     GEMMEnd2EndBenchmark(state, model,
659       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64,
660       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64,
661       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
662       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
663       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
664       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
665       xnn_init_f32_minmax_scalar_params,
666       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
667       benchmark::utils::CheckNEON);
668   }
f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)669   static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
670     GEMMEnd2EndBenchmark(state, model,
671       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
672       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
673       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
674       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
675       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
676       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
677       xnn_init_f32_minmax_scalar_params,
678       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
679       benchmark::utils::CheckNEON);
680   }
f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)681   static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
682     GEMMEnd2EndBenchmark(state, model,
683       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
684       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
685       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
686       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
687       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
688       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
689       xnn_init_f32_minmax_scalar_params,
690       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
691       benchmark::utils::CheckNEON);
692   }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)693   static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
694     GEMMEnd2EndBenchmark(state, model,
695       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
696       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
697       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
698       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
699       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
700       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
701       xnn_init_f32_minmax_scalar_params,
702       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
703       benchmark::utils::CheckNEON);
704   }
f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)705   static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
706     GEMMEnd2EndBenchmark(state, model,
707       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
708       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
709       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
710       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
711       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
712       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
713       xnn_init_f32_minmax_scalar_params,
714       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
715       benchmark::utils::CheckNEON);
716   }
f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)717   static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
718     GEMMEnd2EndBenchmark(state, model,
719       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
720       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
721       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
722       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
723       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
724       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
725       xnn_init_f32_minmax_scalar_params,
726       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
727       benchmark::utils::CheckNEON);
728   }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)729   static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
730     GEMMEnd2EndBenchmark(state, model,
731       xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
732       xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
733       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
734       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
735       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
736       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
737       xnn_init_f32_minmax_scalar_params,
738       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
739       benchmark::utils::CheckNEON);
740   }
741 
742   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_ld64);
743   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a7);
744   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
745   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a53);
746   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
747   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
748   BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75);
749 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
750 
751 
752 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
f32_gemm_4x2__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)753   static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
754     GEMMEnd2EndBenchmark(state, model,
755       xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64,
756       xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64,
757       xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64,
758       xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64,
759       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
760       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
761       xnn_init_f32_minmax_scalar_params,
762       4 /* mr */, 2 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
763       benchmark::utils::CheckNEON);
764   }
765 
f32_gemm_6x2__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)766   static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
767     GEMMEnd2EndBenchmark(state, model,
768       xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64,
769       xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64,
770       xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64,
771       xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64,
772       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
773       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
774       xnn_init_f32_minmax_scalar_params,
775       6 /* mr */, 2 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
776       benchmark::utils::CheckNEON);
777   }
778 
f32_gemm_4x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)779   static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
780     GEMMEnd2EndBenchmark(state, model,
781       xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
782       xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64,
783       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
784       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
785       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
786       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
787       xnn_init_f32_minmax_scalar_params,
788       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
789       benchmark::utils::CheckNEON);
790   }
791 
f32_gemm_4x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)792   static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
793     GEMMEnd2EndBenchmark(state, model,
794       xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128,
795       xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128,
796       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
797       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
798       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
799       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
800       xnn_init_f32_minmax_scalar_params,
801       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
802       benchmark::utils::CheckNEON);
803   }
804 
f32_gemm_6x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)805   static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
806     GEMMEnd2EndBenchmark(state, model,
807       xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64,
808       xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64,
809       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
810       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
811       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
812       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
813       xnn_init_f32_minmax_scalar_params,
814       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
815       benchmark::utils::CheckNEON);
816   }
817 
f32_gemm_6x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)818   static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
819     GEMMEnd2EndBenchmark(state, model,
820       xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
821       xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128,
822       xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
823       xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
824       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
825       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
826       xnn_init_f32_minmax_scalar_params,
827       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
828       benchmark::utils::CheckNEON);
829   }
830 
f32_gemm_4x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)831   static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
832     GEMMEnd2EndBenchmark(state, model,
833       xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64,
834       xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64,
835       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
836       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
837       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
838       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
839       xnn_init_f32_minmax_scalar_params,
840       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
841       benchmark::utils::CheckNEON);
842   }
843 
f32_gemm_4x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)844   static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
845     GEMMEnd2EndBenchmark(state, model,
846       xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
847       xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128,
848       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
849       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
850       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
851       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
852       xnn_init_f32_minmax_scalar_params,
853       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
854       benchmark::utils::CheckNEON);
855   }
856 
f32_gemm_6x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)857   static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
858     GEMMEnd2EndBenchmark(state, model,
859       xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
860       xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64,
861       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
862       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
863       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
864       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
865       xnn_init_f32_minmax_scalar_params,
866       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
867       benchmark::utils::CheckNEON);
868   }
869 
f32_gemm_6x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)870   static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
871     GEMMEnd2EndBenchmark(state, model,
872       xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128,
873       xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128,
874       xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
875       xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
876       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
877       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
878       xnn_init_f32_minmax_scalar_params,
879       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
880       benchmark::utils::CheckNEON);
881   }
882 
f32_gemm_4x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)883   static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
884     GEMMEnd2EndBenchmark(state, model,
885       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64,
886       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64,
887       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
888       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
889       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
890       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
891       xnn_init_f32_minmax_scalar_params,
892       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
893       benchmark::utils::CheckNEONFMA);
894   }
895 
f32_gemm_4x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)896   static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
897     GEMMEnd2EndBenchmark(state, model,
898       xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128,
899       xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128,
900       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
901       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
902       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
903       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
904       xnn_init_f32_minmax_scalar_params,
905       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
906       benchmark::utils::CheckNEONFMA);
907   }
908 
f32_gemm_6x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)909   static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
910     GEMMEnd2EndBenchmark(state, model,
911       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64,
912       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64,
913       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
914       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
915       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
916       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
917       xnn_init_f32_minmax_scalar_params,
918       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
919       benchmark::utils::CheckNEONFMA);
920   }
921 
f32_gemm_6x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)922   static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
923     GEMMEnd2EndBenchmark(state, model,
924       xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128,
925       xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128,
926       xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
927       xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
928       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
929       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
930       xnn_init_f32_minmax_scalar_params,
931       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
932       benchmark::utils::CheckNEONFMA);
933   }
934 
f32_gemm_4x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)935   static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
936     GEMMEnd2EndBenchmark(state, model,
937       xnn_f32_gemm_minmax_ukernel_4x8s4__neon,
938       xnn_f32_igemm_minmax_ukernel_4x8s4__neon,
939       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
940       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
941       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
942       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
943       xnn_init_f32_minmax_scalar_params,
944       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
945       benchmark::utils::CheckNEON);
946   }
947 
f32_gemm_4x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)948   static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
949     GEMMEnd2EndBenchmark(state, model,
950       xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma,
951       xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma,
952       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
953       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
954       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
955       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
956       xnn_init_f32_minmax_scalar_params,
957       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
958       benchmark::utils::CheckNEONFMA);
959   }
960 
f32_gemm_6x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)961   static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
962     GEMMEnd2EndBenchmark(state, model,
963       xnn_f32_gemm_minmax_ukernel_6x8s4__neon,
964       xnn_f32_igemm_minmax_ukernel_6x8s4__neon,
965       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
966       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
967       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
968       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
969       xnn_init_f32_minmax_scalar_params,
970       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
971       benchmark::utils::CheckNEON);
972   }
973 
f32_gemm_6x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)974   static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
975     GEMMEnd2EndBenchmark(state, model,
976       xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
977       xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma,
978       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
979       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
980       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
981       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
982       xnn_init_f32_minmax_scalar_params,
983       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
984       benchmark::utils::CheckNEONFMA);
985   }
986 
f32_gemm_8x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)987   static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
988     GEMMEnd2EndBenchmark(state, model,
989       xnn_f32_gemm_minmax_ukernel_8x8s4__neon,
990       xnn_f32_igemm_minmax_ukernel_8x8s4__neon,
991       xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
992       xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
993       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
994       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
995       xnn_init_f32_minmax_scalar_params,
996       8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
997       benchmark::utils::CheckNEON);
998   }
999 
f32_gemm_8x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)1000   static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
1001     GEMMEnd2EndBenchmark(state, model,
1002       xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
1003       xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma,
1004       xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
1005       xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
1006       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1007       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1008       xnn_init_f32_minmax_scalar_params,
1009       8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
1010       benchmark::utils::CheckNEONFMA);
1011   }
1012 
  // Applies BENCHMARK_FP32_END2END to each ARM wrapper function in this section.
  // NOTE(review): the macro is defined outside this chunk; presumably it registers
  // the wrapper against the end-to-end model set -- confirm against its definition.
  // Order of invocation is kept as-is.

  // neon_lane_ld64 kernels with nr = 2.
1013   BENCHMARK_FP32_END2END(f32_gemm_4x2__neon_lane_ld64);
1014   BENCHMARK_FP32_END2END(f32_gemm_6x2__neon_lane_ld64);
1015 
  // neon_lane kernels with nr = 8 (ld64 and ld128 variants).
1016   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld64);
1017   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld128);
1018   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld64);
1019   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld128);
1020 
  // neon_dup kernels (ld64 and ld128 variants).
1021   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld64);
1022   BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld128);
1023   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld64);
1024   BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld128);
1025 
  // neonfma_dup kernels (ld64 and ld128 variants).
1026   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld64);
1027   BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld128);
1028   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld64);
1029   BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld128);
1030 
  // s4 neon kernels.
1031   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neon);
1032   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neon);
1033   BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neon);
1034 
  // s4 neonfma kernels.
1035   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neonfma);
1036   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neonfma);
1037   BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neonfma);
1038 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1039 
1040 
1041 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
f32_gemm_4x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1042   static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1043     GEMMEnd2EndBenchmark(state, model,
1044       xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast,
1045       xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast,
1046       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1047       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1048       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1049       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1050       xnn_init_f32_minmax_scalar_params,
1051       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1052       benchmark::utils::CheckAVX512F);
1053   }
f32_gemm_5x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1054   static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1055     GEMMEnd2EndBenchmark(state, model,
1056       xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast,
1057       xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast,
1058       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1059       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1060       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1061       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1062       xnn_init_f32_minmax_scalar_params,
1063       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1064       benchmark::utils::CheckAVX512F);
1065   }
f32_gemm_6x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1066   static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1067     GEMMEnd2EndBenchmark(state, model,
1068       xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast,
1069       xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast,
1070       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1071       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1072       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1073       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1074       xnn_init_f32_minmax_scalar_params,
1075       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1076       benchmark::utils::CheckAVX512F);
1077   }
f32_gemm_7x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1078   static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1079     GEMMEnd2EndBenchmark(state, model,
1080       xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast,
1081       xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast,
1082       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1083       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1084       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1085       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1086       xnn_init_f32_minmax_scalar_params,
1087       7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1088       benchmark::utils::CheckAVX512F);
1089   }
f32_gemm_8x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1090   static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1091     GEMMEnd2EndBenchmark(state, model,
1092       xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast,
1093       xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast,
1094       xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1095       xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1096       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1097       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1098       xnn_init_f32_minmax_scalar_params,
1099       8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1100       benchmark::utils::CheckAVX512F);
1101   }
1102 
f32_gemm_4x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1103   static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1104     GEMMEnd2EndBenchmark(state, model,
1105       xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
1106       xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast,
1107       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1108       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1109       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1110       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1111       xnn_init_f32_minmax_avx_params,
1112       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1113       benchmark::utils::CheckFMA3);
1114   }
f32_gemm_5x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1115   static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1116     GEMMEnd2EndBenchmark(state, model,
1117       xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
1118       xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast,
1119       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1120       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1121       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1122       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1123       xnn_init_f32_minmax_avx_params,
1124       5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1125       benchmark::utils::CheckFMA3);
1126   }
f32_gemm_6x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1127   static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1128     GEMMEnd2EndBenchmark(state, model,
1129       xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast,
1130       xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast,
1131       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1132       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1133       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1134       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1135       xnn_init_f32_minmax_avx_params,
1136       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1137       benchmark::utils::CheckFMA3);
1138   }
f32_gemm_7x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1139   static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1140     GEMMEnd2EndBenchmark(state, model,
1141       xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast,
1142       xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast,
1143       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1144       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1145       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1146       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1147       xnn_init_f32_minmax_avx_params,
1148       7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1149       benchmark::utils::CheckFMA3);
1150   }
f32_gemm_8x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1151   static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1152     GEMMEnd2EndBenchmark(state, model,
1153       xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast,
1154       xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast,
1155       xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1156       xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1157       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1158       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1159       xnn_init_f32_minmax_avx_params,
1160       8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1161       benchmark::utils::CheckFMA3);
1162   }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1163   static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1164     GEMMEnd2EndBenchmark(state, model,
1165       xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
1166       xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast,
1167       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1168       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1169       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1170       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1171       xnn_init_f32_minmax_avx_params,
1172       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1173       benchmark::utils::CheckFMA3);
1174   }
f32_gemm_4x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1175   static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1176     GEMMEnd2EndBenchmark(state, model,
1177       xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast,
1178       xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast,
1179       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1180       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1181       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1182       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1183       xnn_init_f32_minmax_avx_params,
1184       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1185       benchmark::utils::CheckFMA3);
1186   }
f32_gemm_5x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1187   static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1188     GEMMEnd2EndBenchmark(state, model,
1189       xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast,
1190       xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast,
1191       xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1192       xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1193       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1194       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1195       xnn_init_f32_minmax_avx_params,
1196       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1197       benchmark::utils::CheckFMA3);
1198   }
f32_gemm_3x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1199   static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1200     GEMMEnd2EndBenchmark(state, model,
1201       xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast,
1202       xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast,
1203       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1204       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1205       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1206       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1207       xnn_init_f32_minmax_avx_params,
1208       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1209       benchmark::utils::CheckFMA3);
1210   }
f32_gemm_4x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1211   static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1212     GEMMEnd2EndBenchmark(state, model,
1213       xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast,
1214       xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast,
1215       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1216       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1217       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1218       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1219       xnn_init_f32_minmax_avx_params,
1220       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1221       benchmark::utils::CheckFMA3);
1222   }
f32_gemm_5x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1223   static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1224     GEMMEnd2EndBenchmark(state, model,
1225       xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast,
1226       xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast,
1227       xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1228       xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1229       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1230       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1231       xnn_init_f32_minmax_avx_params,
1232       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1233       benchmark::utils::CheckFMA3);
1234   }
1235 
f32_gemm_4x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1236   static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1237     GEMMEnd2EndBenchmark(state, model,
1238       xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast,
1239       xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast,
1240       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1241       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1242       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1243       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1244       xnn_init_f32_minmax_avx_params,
1245       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1246       benchmark::utils::CheckAVX);
1247   }
f32_gemm_5x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1248   static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1249     GEMMEnd2EndBenchmark(state, model,
1250       xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
1251       xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast,
1252       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1253       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1254       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1255       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1256       xnn_init_f32_minmax_avx_params,
1257       5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1258       benchmark::utils::CheckAVX);
1259   }
f32_gemm_6x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1260   static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1261     GEMMEnd2EndBenchmark(state, model,
1262       xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast,
1263       xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast,
1264       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1265       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1266       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1267       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1268       xnn_init_f32_minmax_avx_params,
1269       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1270       benchmark::utils::CheckAVX);
1271   }
f32_gemm_7x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1272   static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1273     GEMMEnd2EndBenchmark(state, model,
1274       xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast,
1275       xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast,
1276       xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1277       xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1278       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1279       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1280       xnn_init_f32_minmax_avx_params,
1281       7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1282       benchmark::utils::CheckAVX);
1283   }
f32_gemm_3x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1284   static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1285     GEMMEnd2EndBenchmark(state, model,
1286       xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
1287       xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast,
1288       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1289       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1290       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1291       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1292       xnn_init_f32_minmax_avx_params,
1293       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1294       benchmark::utils::CheckAVX);
1295   }
f32_gemm_4x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1296   static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1297     GEMMEnd2EndBenchmark(state, model,
1298       xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast,
1299       xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast,
1300       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1301       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1302       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1303       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1304       xnn_init_f32_minmax_avx_params,
1305       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1306       benchmark::utils::CheckAVX);
1307   }
f32_gemm_5x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1308   static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1309     GEMMEnd2EndBenchmark(state, model,
1310       xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast,
1311       xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast,
1312       xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1313       xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1314       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1315       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1316       xnn_init_f32_minmax_avx_params,
1317       5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1318       benchmark::utils::CheckAVX);
1319   }
1320 
f32_gemm_3x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1321   static void f32_gemm_3x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1322     GEMMEnd2EndBenchmark(state, model,
1323       xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup,
1324       xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup,
1325       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1326       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1327       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1328       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1329       xnn_init_f32_minmax_sse_params,
1330       3 /* mr */, 8 /* nr */);
1331   }
f32_gemm_4x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1332   static void f32_gemm_4x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1333     GEMMEnd2EndBenchmark(state, model,
1334       xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup,
1335       xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup,
1336       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1337       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1338       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1339       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1340       xnn_init_f32_minmax_sse_params,
1341       4 /* mr */, 8 /* nr */);
1342   }
f32_gemm_5x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1343   static void f32_gemm_5x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1344     GEMMEnd2EndBenchmark(state, model,
1345       xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
1346       xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup,
1347       xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1348       xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1349       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1350       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1351       xnn_init_f32_minmax_sse_params,
1352       5 /* mr */, 8 /* nr */);
1353   }
1354 
f32_gemm_3x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1355   static void f32_gemm_3x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1356     GEMMEnd2EndBenchmark(state, model,
1357       xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
1358       xnn_f32_igemm_minmax_ukernel_3x8__sse_load1,
1359       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1360       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1361       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1362       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1363       xnn_init_f32_minmax_sse_params,
1364       3 /* mr */, 8 /* nr */);
1365   }
f32_gemm_4x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1366   static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1367     GEMMEnd2EndBenchmark(state, model,
1368       xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
1369       xnn_f32_igemm_minmax_ukernel_4x8__sse_load1,
1370       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1371       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1372       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1373       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1374       xnn_init_f32_minmax_sse_params,
1375       4 /* mr */, 8 /* nr */);
1376   }
f32_gemm_5x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1377   static void f32_gemm_5x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1378     GEMMEnd2EndBenchmark(state, model,
1379       xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
1380       xnn_f32_igemm_minmax_ukernel_5x8__sse_load1,
1381       xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1382       xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1383       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1384       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1385       xnn_init_f32_minmax_sse_params,
1386       5 /* mr */, 8 /* nr */);
1387   }
f32_gemm_3x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1388   static void f32_gemm_3x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1389     GEMMEnd2EndBenchmark(state, model,
1390       xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
1391       xnn_f32_igemm_minmax_ukernel_3x8__sse_dup,
1392       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1393       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1394       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1395       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1396       xnn_init_f32_minmax_sse_params,
1397       3 /* mr */, 8 /* nr */);
1398   }
f32_gemm_4x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1399   static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1400     GEMMEnd2EndBenchmark(state, model,
1401       xnn_f32_gemm_minmax_ukernel_4x8__sse_dup,
1402       xnn_f32_igemm_minmax_ukernel_4x8__sse_dup,
1403       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1404       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1405       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1406       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1407       xnn_init_f32_minmax_sse_params,
1408       4 /* mr */, 8 /* nr */);
1409   }
f32_gemm_5x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1410   static void f32_gemm_5x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1411     GEMMEnd2EndBenchmark(state, model,
1412       xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
1413       xnn_f32_igemm_minmax_ukernel_5x8__sse_dup,
1414       xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1415       xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1416       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1417       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1418       xnn_init_f32_minmax_sse_params,
1419       5 /* mr */, 8 /* nr */);
1420   }
f32_gemm_3x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1421   static void f32_gemm_3x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1422     GEMMEnd2EndBenchmark(state, model,
1423       xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
1424       xnn_f32_igemm_minmax_ukernel_3x8s4__sse,
1425       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1426       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1427       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1428       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1429       xnn_init_f32_minmax_sse_params,
1430       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1431   }
f32_gemm_4x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1432   static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1433     GEMMEnd2EndBenchmark(state, model,
1434       xnn_f32_gemm_minmax_ukernel_4x8s4__sse,
1435       xnn_f32_igemm_minmax_ukernel_4x8s4__sse,
1436       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1437       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1438       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1439       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1440       xnn_init_f32_minmax_sse_params,
1441       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1442   }
f32_gemm_5x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1443   static void f32_gemm_5x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1444     GEMMEnd2EndBenchmark(state, model,
1445       xnn_f32_gemm_minmax_ukernel_5x8s4__sse,
1446       xnn_f32_igemm_minmax_ukernel_5x8s4__sse,
1447       xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1448       xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1449       nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1450       nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1451       xnn_init_f32_minmax_sse_params,
1452       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1453   }
1454 
// Benchmark entries for the f32_gemm_{4..8}x16__avx512f_broadcast wrappers.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1455   BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
1456   BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
1457   BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
1458   BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
1459   BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
1460 
// Benchmark entries for the FMA3 broadcast wrappers: the 4x8..8x8 variants, then 3x16..5x16.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1461   BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
1462   BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
1463   BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
1464   BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
1465   BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
1466   BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
1467   BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
1468   BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
1469 
// Benchmark entries for the f32_gemm_{3,4,5}x16s4__fma3_broadcast wrappers.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1470   BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
1471   BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
1472   BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
1473 
// Benchmark entries for the AVX broadcast wrappers: the 4x8..7x8 variants, then 3x16..5x16.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1474   BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
1475   BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
1476   BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
1477   BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
1478   BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
1479   BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
1480   BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
1481 
// Benchmark entries for the f32_gemm_{3,4,5}x8__sse2_dup wrappers.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1482   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
1483   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
1484   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
1485 
// Benchmark entries for the f32_gemm_{3,4,5}x8__sse_load1 wrappers.
// NOTE(review): BENCHMARK_FP32_END2END and these wrappers are defined outside this chunk;
// presumably the macro registers each wrapper for the FP32 end-to-end models - confirm.
1486   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1);
1487   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1);
1488   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1);
1489 
// Benchmark entries for the f32_gemm_{3,4,5}x8__sse_dup wrappers.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk; presumably it
// registers each wrapper for the FP32 end-to-end models - confirm.
1490   BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_dup);
1491   BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_dup);
1492   BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_dup);
1493 
// Benchmark entries for the f32_gemm_{3,4,5}x8s4__sse wrappers.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk; presumably it
// registers each wrapper for the FP32 end-to-end models - confirm.
1494   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse);
1495   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse);
1496   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse);
1497 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1498 
1499 
1500 #if XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1501   static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1502     GEMMEnd2EndBenchmark(state, model,
1503       xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
1504       xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
1505       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1506       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1507       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1508       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1509       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1510       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1511       xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
1512       xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
1513       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1514       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1515       xnn_init_f32_minmax_wasmsimd_params,
1516       3 /* mr */, 8 /* nr */);
1517   }
f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1518   static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1519     GEMMEnd2EndBenchmark(state, model,
1520       xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
1521       xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
1522       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1523       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1524       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1525       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1526       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1527       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1528       xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
1529       xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
1530       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1531       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1532       xnn_init_f32_minmax_wasmsimd_params,
1533       4 /* mr */, 8 /* nr */);
1534   }
f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1535   static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1536     GEMMEnd2EndBenchmark(state, model,
1537       xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
1538       xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
1539       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1540       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1541       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1542       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1543       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1544       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1545       xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
1546       xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
1547       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1548       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1549       xnn_init_f32_minmax_wasmsimd_params,
1550       5 /* mr */, 8 /* nr */);
1551   }
f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1552   static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1553     GEMMEnd2EndBenchmark(state, model,
1554       xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
1555       xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
1556       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1557       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1558       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
1559       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
1560       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1561       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1562       xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
1563       xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
1564       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1565       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1566       xnn_init_f32_minmax_wasmsimd_params,
1567       6 /* mr */, 8 /* nr */);
1568   }
f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1569   static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1570     GEMMEnd2EndBenchmark(state, model,
1571       xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1572       xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1573       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1574       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1575       xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1576       xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1577       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1578       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1579       xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1580       xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1581       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1582       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1583       xnn_init_f32_minmax_wasmsimd_params,
1584       3 /* mr */, 8 /* nr */);
1585   }
f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1586   static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1587     GEMMEnd2EndBenchmark(state, model,
1588       xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1589       xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1590       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1591       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1592       xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1593       xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1594       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1595       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1596       xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1597       xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1598       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1599       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1600       xnn_init_f32_minmax_wasmsimd_params,
1601       4 /* mr */, 8 /* nr */);
1602   }
f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1603   static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1604     GEMMEnd2EndBenchmark(state, model,
1605       xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1606       xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1607       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1608       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1609       xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1610       xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1611       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1612       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1613       xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1614       xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1615       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1616       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1617       xnn_init_f32_minmax_wasmsimd_params,
1618       5 /* mr */, 8 /* nr */);
1619   }
f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1620   static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1621     GEMMEnd2EndBenchmark(state, model,
1622       xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1623       xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1624       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1625       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1626       xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1627       xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1628       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1629       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1630       xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1631       xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1632       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1633       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1634       xnn_init_f32_minmax_wasmsimd_params,
1635       6 /* mr */, 8 /* nr */);
1636   }
f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1637   static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1638     GEMMEnd2EndBenchmark(state, model,
1639       xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
1640       xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
1641       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1642       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1643       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
1644       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
1645       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1646       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1647       xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
1648       xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
1649       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1650       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1651       xnn_init_f32_minmax_wasmsimd_params,
1652       3 /* mr */, 8 /* nr */);
1653   }
f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1654   static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1655     GEMMEnd2EndBenchmark(state, model,
1656       xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
1657       xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
1658       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1659       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1660       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
1661       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
1662       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1663       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1664       xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
1665       xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
1666       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1667       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1668       xnn_init_f32_minmax_wasmsimd_params,
1669       4 /* mr */, 8 /* nr */);
1670   }
f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1671   static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1672     GEMMEnd2EndBenchmark(state, model,
1673       xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat,
1674       xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat,
1675       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1676       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1677       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
1678       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
1679       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1680       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1681       xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
1682       xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
1683       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1684       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1685       xnn_init_f32_minmax_wasmsimd_params,
1686       5 /* mr */, 8 /* nr */);
1687   }
f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1688   static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1689     GEMMEnd2EndBenchmark(state, model,
1690       xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat,
1691       xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat,
1692       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1693       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1694       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
1695       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
1696       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1697       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1698       xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
1699       xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
1700       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1701       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1702       xnn_init_f32_minmax_wasmsimd_params,
1703       6 /* mr */, 8 /* nr */);
1704   }
f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1705   static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1706     GEMMEnd2EndBenchmark(state, model,
1707       xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1708       xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1709       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1710       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1711       xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1712       xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1713       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1714       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1715       xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1716       xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1717       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1718       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1719       xnn_init_f32_minmax_wasmsimd_params,
1720       3 /* mr */, 8 /* nr */);
1721   }
f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1722   static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1723     GEMMEnd2EndBenchmark(state, model,
1724       xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1725       xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1726       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1727       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1728       xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1729       xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1730       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1731       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1732       xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1733       xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1734       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1735       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1736       xnn_init_f32_minmax_wasmsimd_params,
1737       4 /* mr */, 8 /* nr */);
1738   }
f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1739   static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1740     GEMMEnd2EndBenchmark(state, model,
1741       xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1742       xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1743       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1744       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1745       xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1746       xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1747       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1748       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1749       xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1750       xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1751       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1752       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1753       xnn_init_f32_minmax_wasmsimd_params,
1754       5 /* mr */, 8 /* nr */);
1755   }
f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1756   static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1757     GEMMEnd2EndBenchmark(state, model,
1758       xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1759       xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1760       xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1761       xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1762       xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1763       xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1764       xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1765       xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1766       xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1767       xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1768       xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1769       xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1770       xnn_init_f32_minmax_wasmsimd_params,
1771       6 /* mr */, 8 /* nr */);
1772   }
f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1773   static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1774     GEMMEnd2EndBenchmark(state, model,
1775       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd,
1776       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd,
1777       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1778       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1779       xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
1780       xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
1781       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1782       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1783       xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
1784       xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
1785       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1786       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1787       xnn_init_f32_minmax_wasmsimd_params,
1788       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1789   }
f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1790   static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1791     GEMMEnd2EndBenchmark(state, model,
1792       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd,
1793       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd,
1794       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1795       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1796       xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
1797       xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
1798       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1799       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1800       xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
1801       xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
1802       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1803       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1804       xnn_init_f32_minmax_wasmsimd_params,
1805       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1806   }
f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1807   static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1808     GEMMEnd2EndBenchmark(state, model,
1809       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd,
1810       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd,
1811       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1812       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1813       xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
1814       xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
1815       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1816       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1817       xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
1818       xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
1819       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1820       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1821       xnn_init_f32_minmax_wasmsimd_params,
1822       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1823   }
f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1824   static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1825     GEMMEnd2EndBenchmark(state, model,
1826       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd,
1827       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd,
1828       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1829       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1830       xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
1831       xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
1832       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1833       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1834       xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
1835       xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
1836       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1837       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1838       xnn_init_f32_minmax_wasmsimd_params,
1839       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1840   }
f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1841   static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1842     GEMMEnd2EndBenchmark(state, model,
1843       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
1844       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
1845       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1846       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1847       xnn_f32_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma,
1848       xnn_f32_igemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma,
1849       xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1850       xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1851       xnn_f32_gemm_ukernel_3x8s4__wasmrelaxedsimd_fma,
1852       xnn_f32_igemm_ukernel_3x8s4__wasmrelaxedsimd_fma,
1853       xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1854       xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1855       xnn_init_f32_minmax_wasmsimd_params,
1856       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1857   }
f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1858   static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1859     GEMMEnd2EndBenchmark(state, model,
1860       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
1861       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
1862       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1863       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1864       xnn_f32_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma,
1865       xnn_f32_igemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma,
1866       xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1867       xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1868       xnn_f32_gemm_ukernel_4x8s4__wasmrelaxedsimd_fma,
1869       xnn_f32_igemm_ukernel_4x8s4__wasmrelaxedsimd_fma,
1870       xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1871       xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1872       xnn_init_f32_minmax_wasmsimd_params,
1873       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1874   }
f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1875   static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1876     GEMMEnd2EndBenchmark(state, model,
1877       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma,
1878       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma,
1879       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1880       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1881       xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma,
1882       xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma,
1883       xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1884       xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1885       xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma,
1886       xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma,
1887       xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1888       xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1889       xnn_init_f32_minmax_wasmsimd_params,
1890       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1891   }
f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1892   static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1893     GEMMEnd2EndBenchmark(state, model,
1894       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma,
1895       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma,
1896       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1897       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1898       xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma,
1899       xnn_f32_igemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma,
1900       xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1901       xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1902       xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma,
1903       xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma,
1904       xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1905       xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1906       xnn_init_f32_minmax_wasmsimd_params,
1907       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1908   }
1909 
// Benchmark entries for the f32_gemm_{3,4,5,6}x8__wasmrelaxedsimd_loadsplat wrappers.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk; presumably it
// registers each wrapper for the FP32 end-to-end models - confirm.
1910   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_loadsplat);
1911   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_loadsplat);
1912   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_loadsplat);
1913   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_loadsplat);
1914 
1915   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat);
1916   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat);
1917   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat);
1918   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat);
1919 
1920   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_splat);
1921   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_splat);
1922   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_splat);
1923   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_splat);
1924 
1925   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_splat);
1926   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_splat);
1927   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_splat);
1928   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_splat);
1929 
1930   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd);
1931   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd);
1932   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd);
1933   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd);
1934 
1935   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd_fma);
1936   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd_fma);
1937   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd_fma);
1938   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd_fma);
1939 #endif  // XNN_ARCH_WASMRELAXEDSIMD
1940 
1941 
1942 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1943   static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1944     GEMMEnd2EndBenchmark(state, model,
1945       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1946       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1947       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1948       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1949       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1950       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1951       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1952       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1953       xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
1954       xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
1955       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1956       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1957       xnn_init_f32_minmax_wasmsimd_params,
1958       3 /* mr */, 8 /* nr */);
1959   }
f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1960   static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1961     GEMMEnd2EndBenchmark(state, model,
1962       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1963       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1964       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1965       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1966       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1967       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1968       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1969       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1970       xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
1971       xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
1972       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1973       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1974       xnn_init_f32_minmax_wasmsimd_params,
1975       4 /* mr */, 8 /* nr */);
1976   }
f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1977   static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1978     GEMMEnd2EndBenchmark(state, model,
1979       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1980       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1981       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1982       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1983       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1984       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1985       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1986       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1987       xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
1988       xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
1989       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1990       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1991       xnn_init_f32_minmax_wasmsimd_params,
1992       5 /* mr */, 8 /* nr */);
1993   }
f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1994   static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1995     GEMMEnd2EndBenchmark(state, model,
1996       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1997       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1998       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1999       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
2000       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2001       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2002       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2003       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2004       xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
2005       xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
2006       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2007       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2008       xnn_init_f32_minmax_wasmsimd_params,
2009       6 /* mr */, 8 /* nr */);
2010   }
f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2011   static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2012     GEMMEnd2EndBenchmark(state, model,
2013       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
2014       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
2015       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2016       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2017       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
2018       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
2019       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2020       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2021       xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
2022       xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
2023       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2024       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2025       xnn_init_f32_minmax_wasmsimd_params,
2026       3 /* mr */, 8 /* nr */);
2027   }
f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2028   static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2029     GEMMEnd2EndBenchmark(state, model,
2030       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
2031       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
2032       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2033       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2034       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
2035       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
2036       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2037       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2038       xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
2039       xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
2040       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2041       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2042       xnn_init_f32_minmax_wasmsimd_params,
2043       4 /* mr */, 8 /* nr */);
2044   }
f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2045   static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2046     GEMMEnd2EndBenchmark(state, model,
2047       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
2048       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
2049       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2050       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2051       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
2052       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
2053       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2054       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2055       xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
2056       xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
2057       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2058       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2059       xnn_init_f32_minmax_wasmsimd_params,
2060       5 /* mr */, 8 /* nr */);
2061   }
f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2062   static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2063     GEMMEnd2EndBenchmark(state, model,
2064       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
2065       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
2066       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2067       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2068       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2069       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2070       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2071       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2072       xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
2073       xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
2074       xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2075       xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2076       xnn_init_f32_minmax_wasmsimd_params,
2077       6 /* mr */, 8 /* nr */);
2078   }
f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2079   static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2080     GEMMEnd2EndBenchmark(state, model,
2081       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
2082       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
2083       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2084       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2085       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
2086       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
2087       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2088       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2089       xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
2090       xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
2091       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2092       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2093       xnn_init_f32_minmax_wasmsimd_params,
2094       3 /* mr */, 8 /* nr */);
2095   }
f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2096   static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2097     GEMMEnd2EndBenchmark(state, model,
2098       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
2099       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
2100       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2101       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2102       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
2103       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
2104       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2105       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2106       xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
2107       xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
2108       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2109       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2110       xnn_init_f32_minmax_wasmsimd_params,
2111       4 /* mr */, 8 /* nr */);
2112   }
f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2113   static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2114     GEMMEnd2EndBenchmark(state, model,
2115       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
2116       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
2117       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2118       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2119       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
2120       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
2121       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2122       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2123       xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
2124       xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
2125       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2126       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2127       xnn_init_f32_minmax_wasmsimd_params,
2128       5 /* mr */, 8 /* nr */);
2129   }
f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2130   static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2131     GEMMEnd2EndBenchmark(state, model,
2132       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
2133       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
2134       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2135       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2136       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
2137       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
2138       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2139       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2140       xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
2141       xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
2142       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2143       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2144       xnn_init_f32_minmax_wasmsimd_params,
2145       6 /* mr */, 8 /* nr */);
2146   }
f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2147   static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2148     GEMMEnd2EndBenchmark(state, model,
2149       xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
2150       xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
2151       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2152       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2153       xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
2154       xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
2155       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2156       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2157       xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
2158       xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
2159       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2160       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2161       xnn_init_f32_minmax_wasmsimd_params,
2162       3 /* mr */, 8 /* nr */);
2163   }
f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2164   static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2165     GEMMEnd2EndBenchmark(state, model,
2166       xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
2167       xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
2168       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2169       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2170       xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
2171       xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
2172       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2173       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2174       xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
2175       xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
2176       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2177       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2178       xnn_init_f32_minmax_wasmsimd_params,
2179       4 /* mr */, 8 /* nr */);
2180   }
f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2181   static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2182     GEMMEnd2EndBenchmark(state, model,
2183       xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
2184       xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
2185       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2186       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2187       xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
2188       xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
2189       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2190       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2191       xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
2192       xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
2193       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2194       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2195       xnn_init_f32_minmax_wasmsimd_params,
2196       5 /* mr */, 8 /* nr */);
2197   }
f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2198   static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2199     GEMMEnd2EndBenchmark(state, model,
2200       xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
2201       xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
2202       xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2203       xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2204       xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
2205       xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
2206       xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2207       xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2208       xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
2209       xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
2210       xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2211       xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2212       xnn_init_f32_minmax_wasmsimd_params,
2213       6 /* mr */, 8 /* nr */);
2214   }
f32_gemm_3x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2215   static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2216     GEMMEnd2EndBenchmark(state, model,
2217       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
2218       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm,
2219       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2220       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2221       xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
2222       xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
2223       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2224       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2225       xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
2226       xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
2227       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2228       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2229       xnn_init_f32_minmax_wasmsimd_params,
2230       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2231   }
f32_gemm_4x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2232   static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2233     GEMMEnd2EndBenchmark(state, model,
2234       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm,
2235       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm,
2236       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2237       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2238       xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
2239       xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
2240       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2241       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2242       xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
2243       xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
2244       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2245       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2246       xnn_init_f32_minmax_wasmsimd_params,
2247       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2248   }
f32_gemm_5x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2249   static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2250     GEMMEnd2EndBenchmark(state, model,
2251       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm,
2252       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm,
2253       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2254       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2255       xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
2256       xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
2257       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2258       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2259       xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
2260       xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
2261       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2262       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2263       xnn_init_f32_minmax_wasmsimd_params,
2264       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2265   }
f32_gemm_6x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2266   static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2267     GEMMEnd2EndBenchmark(state, model,
2268       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
2269       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm,
2270       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2271       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2272       xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
2273       xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
2274       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2275       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2276       xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
2277       xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
2278       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2279       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2280       xnn_init_f32_minmax_wasmsimd_params,
2281       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2282   }
f32_gemm_3x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2283   static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2284     GEMMEnd2EndBenchmark(state, model,
2285       xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
2286       xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86,
2287       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2288       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2289       xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
2290       xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
2291       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2292       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2293       xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
2294       xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
2295       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2296       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2297       xnn_init_f32_minmax_wasmsimd_params,
2298       3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2299   }
f32_gemm_4x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2300   static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2301     GEMMEnd2EndBenchmark(state, model,
2302       xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86,
2303       xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86,
2304       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2305       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2306       xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
2307       xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
2308       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2309       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2310       xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
2311       xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
2312       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2313       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2314       xnn_init_f32_minmax_wasmsimd_params,
2315       4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2316   }
f32_gemm_5x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2317   static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2318     GEMMEnd2EndBenchmark(state, model,
2319       xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86,
2320       xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86,
2321       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2322       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2323       xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
2324       xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
2325       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2326       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2327       xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
2328       xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
2329       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2330       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2331       xnn_init_f32_minmax_wasmsimd_params,
2332       5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2333   }
f32_gemm_6x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2334   static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2335     GEMMEnd2EndBenchmark(state, model,
2336       xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86,
2337       xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86,
2338       xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2339       xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2340       xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
2341       xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
2342       xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2343       xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2344       xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
2345       xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
2346       xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2347       xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2348       xnn_init_f32_minmax_wasmsimd_params,
2349       6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2350   }
2351 
2352   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_loadsplat);
2353   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_loadsplat);
2354   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_loadsplat);
2355   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_loadsplat);
2356 
2357   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_loadsplat);
2358   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_loadsplat);
2359   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_loadsplat);
2360   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_loadsplat);
2361 
2362   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_splat);
2363   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_splat);
2364   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_splat);
2365   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_splat);
2366 
2367   BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_splat);
2368   BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_splat);
2369   BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_splat);
2370   BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_splat);
2371 
2372   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_arm);
2373   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_arm);
2374   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_arm);
2375   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_arm);
2376 
2377   BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_x86);
2378   BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_x86);
2379   BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_x86);
2380   BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_x86);
2381 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2382 
2383 
2384 #if XNN_ARCH_WASM
f32_gemm_2x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)2385   static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
2386     GEMMEnd2EndBenchmark(state, model,
2387       xnn_f32_gemm_minmax_ukernel_2x4__wasm,
2388       xnn_f32_igemm_minmax_ukernel_2x4__wasm,
2389       xnn_f32_gemm_minmax_ukernel_1x4__wasm,
2390       xnn_f32_igemm_minmax_ukernel_1x4__wasm,
2391       xnn_f32_gemm_relu_ukernel_2x4__wasm,
2392       xnn_f32_igemm_relu_ukernel_2x4__wasm,
2393       xnn_f32_gemm_relu_ukernel_1x4__wasm,
2394       xnn_f32_igemm_relu_ukernel_1x4__wasm,
2395       xnn_f32_gemm_ukernel_2x4__scalar,
2396       xnn_f32_igemm_ukernel_2x4__scalar,
2397       xnn_f32_gemm_ukernel_1x4__scalar,
2398       xnn_f32_igemm_ukernel_1x4__scalar,
2399       xnn_init_f32_minmax_scalar_params,
2400       2 /* mr */, 4 /* nr */);
2401   }
2402 
f32_gemm_4x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)2403   static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
2404     GEMMEnd2EndBenchmark(state, model,
2405       xnn_f32_gemm_minmax_ukernel_4x4__wasm,
2406       xnn_f32_igemm_minmax_ukernel_4x4__wasm,
2407       xnn_f32_gemm_minmax_ukernel_1x4__wasm,
2408       xnn_f32_igemm_minmax_ukernel_1x4__wasm,
2409       xnn_f32_gemm_relu_ukernel_4x4__wasm,
2410       xnn_f32_igemm_relu_ukernel_4x4__wasm,
2411       xnn_f32_gemm_relu_ukernel_1x4__wasm,
2412       xnn_f32_igemm_relu_ukernel_1x4__wasm,
2413       xnn_f32_gemm_ukernel_4x4__scalar,
2414       xnn_f32_igemm_ukernel_4x4__scalar,
2415       xnn_f32_gemm_ukernel_1x4__scalar,
2416       xnn_f32_igemm_ukernel_1x4__scalar,
2417       xnn_init_f32_minmax_scalar_params,
2418       4 /* mr */, 4 /* nr */);
2419   }
2420 
2421   BENCHMARK_FP32_END2END(f32_gemm_2x4__wasm);
2422   BENCHMARK_FP32_END2END(f32_gemm_4x4__wasm);
2423 #endif  // XNN_ARCH_WASM
2424 
2425 
f32_gemm_2x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)2426 static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
2427   GEMMEnd2EndBenchmark(state, model,
2428     xnn_f32_gemm_minmax_ukernel_2x4__scalar,
2429     xnn_f32_igemm_minmax_ukernel_2x4__scalar,
2430     xnn_f32_gemm_minmax_ukernel_1x4__scalar,
2431     xnn_f32_igemm_minmax_ukernel_1x4__scalar,
2432     xnn_f32_gemm_relu_ukernel_2x4__scalar,
2433     xnn_f32_igemm_relu_ukernel_2x4__scalar,
2434     xnn_f32_gemm_relu_ukernel_1x4__scalar,
2435     xnn_f32_igemm_relu_ukernel_1x4__scalar,
2436     xnn_f32_gemm_ukernel_2x4__scalar,
2437     xnn_f32_igemm_ukernel_2x4__scalar,
2438     xnn_f32_gemm_ukernel_1x4__scalar,
2439     xnn_f32_igemm_ukernel_1x4__scalar,
2440     xnn_init_f32_minmax_scalar_params,
2441     2 /* mr */, 4 /* nr */);
2442 }
2443 
f32_gemm_4x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)2444 static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
2445   GEMMEnd2EndBenchmark(state, model,
2446     xnn_f32_gemm_minmax_ukernel_4x4__scalar,
2447     xnn_f32_igemm_minmax_ukernel_4x4__scalar,
2448     xnn_f32_gemm_minmax_ukernel_1x4__scalar,
2449     xnn_f32_igemm_minmax_ukernel_1x4__scalar,
2450     xnn_f32_gemm_relu_ukernel_4x4__scalar,
2451     xnn_f32_igemm_relu_ukernel_4x4__scalar,
2452     xnn_f32_gemm_relu_ukernel_1x4__scalar,
2453     xnn_f32_igemm_relu_ukernel_1x4__scalar,
2454     xnn_f32_gemm_ukernel_4x4__scalar,
2455     xnn_f32_igemm_ukernel_4x4__scalar,
2456     xnn_f32_gemm_ukernel_1x4__scalar,
2457     xnn_f32_igemm_ukernel_1x4__scalar,
2458     xnn_init_f32_minmax_scalar_params,
2459     4 /* mr */, 4 /* nr */);
2460 }
2461 
2462 BENCHMARK_FP32_END2END(f32_gemm_2x4__scalar);
2463 BENCHMARK_FP32_END2END(f32_gemm_4x4__scalar);
2464 
2465 
2466 #ifndef XNNPACK_BENCHMARK_NO_MAIN
2467 BENCHMARK_MAIN();
2468 #endif
2469