1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12
13 #include <xnnpack.h>
14
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25
26
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_f32_gemm_minmax_ukernel_function gemm_minmax,xnn_f32_igemm_minmax_ukernel_function igemm_minmax,xnn_f32_gemm_minmax_ukernel_function gemm1_minmax,xnn_f32_igemm_minmax_ukernel_function igemm1_minmax,xnn_f32_gemm_relu_ukernel_function gemm_relu,xnn_f32_igemm_relu_ukernel_function igemm_relu,xnn_f32_gemm_relu_ukernel_function gemm1_relu,xnn_f32_igemm_relu_ukernel_function igemm1_relu,xnn_f32_gemm_ukernel_function gemm,xnn_f32_igemm_ukernel_function igemm,xnn_f32_gemm_ukernel_function gemm1,xnn_f32_igemm_ukernel_function igemm1,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28 benchmark::State& state,
29 models::ExecutionPlanFactory model_factory,
30 xnn_f32_gemm_minmax_ukernel_function gemm_minmax,
31 xnn_f32_igemm_minmax_ukernel_function igemm_minmax,
32 xnn_f32_gemm_minmax_ukernel_function gemm1_minmax,
33 xnn_f32_igemm_minmax_ukernel_function igemm1_minmax,
34 xnn_f32_gemm_relu_ukernel_function gemm_relu,
35 xnn_f32_igemm_relu_ukernel_function igemm_relu,
36 xnn_f32_gemm_relu_ukernel_function gemm1_relu,
37 xnn_f32_igemm_relu_ukernel_function igemm1_relu,
38 xnn_f32_gemm_ukernel_function gemm,
39 xnn_f32_igemm_ukernel_function igemm,
40 xnn_f32_gemm_ukernel_function gemm1,
41 xnn_f32_igemm_ukernel_function igemm1,
42 xnn_init_f32_minmax_params_fn init_params,
43 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
44 benchmark::utils::IsaCheckFunction isa_check = nullptr)
45 {
46 if (isa_check && !isa_check(state)) {
47 return;
48 }
49 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
50 state.SkipWithError("failed to initialize XNNPACK");
51 return;
52 }
53
54 // Override microkernels chosen in xnn_initialize
55 // Note: do not directly assign to xnn_params.f32.gemm because it breaks older gcc.
56 std::memset(&xnn_params.f32.gemm, 0, sizeof(xnn_params.f32.gemm));
57 std::memset(&xnn_params.f32.gemm2, 0, sizeof(xnn_params.f32.gemm2));
58 xnn_params.f32.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm_minmax));
59 xnn_params.f32.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm_minmax));
60 xnn_params.f32.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1_minmax));
61 xnn_params.f32.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1_minmax));
62 xnn_params.f32.gemm.relu.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm_relu));
63 xnn_params.f32.gemm.relu.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm_relu));
64 xnn_params.f32.gemm.relu.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1_relu));
65 xnn_params.f32.gemm.relu.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1_relu));
66 xnn_params.f32.gemm.linear.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
67 xnn_params.f32.gemm.linear.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
68 xnn_params.f32.gemm.linear.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
69 xnn_params.f32.gemm.linear.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
70 xnn_params.f32.gemm.init.f32 = init_params;
71 xnn_params.f32.gemm.mr = mr;
72 xnn_params.f32.gemm.nr = nr;
73 xnn_params.f32.gemm.log2_kr = log2_kr;
74 xnn_params.f32.gemm.log2_sr = log2_sr;
75
76 #if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
77 // If JIT is enabled, we want to make sure that we are still benchmarking
78 // non-JIT microkernels, so nullify the pointers to generators.
79 xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(nullptr);
80 xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(nullptr);
81 xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(nullptr);
82 xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(nullptr);
83 #endif // XNN_PLATFORM_JIT && XNN_ENABLE_JIT
84
85 auto execution_plan = model_factory(nullptr);
86 if (execution_plan.empty()) {
87 state.SkipWithError("failed to create a model");
88 return;
89 }
90
91 for (auto _ : state) {
92 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
93 xnn_status status = xnn_run_operator(op.get(), nullptr);
94 if (status != xnn_status_success) {
95 state.SkipWithError("failed to run a model");
96 return;
97 }
98 }
99 }
100
101 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
102 if (cpu_frequency != 0) {
103 state.counters["cpufreq"] = cpu_frequency;
104 }
105 }
106
107 #if XNN_PLATFORM_JIT
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_jit_gemm_code_generator_function gemm_generator,xnn_jit_gemm_code_generator_function gemm1_generator,xnn_jit_igemm_code_generator_function igemm_generator,xnn_jit_igemm_code_generator_function igemm1_generator,xnn_init_f32_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)108 static void GEMMEnd2EndBenchmark(
109 benchmark::State& state,
110 models::ExecutionPlanFactory model_factory,
111 xnn_jit_gemm_code_generator_function gemm_generator,
112 xnn_jit_gemm_code_generator_function gemm1_generator,
113 xnn_jit_igemm_code_generator_function igemm_generator,
114 xnn_jit_igemm_code_generator_function igemm1_generator,
115 xnn_init_f32_minmax_params_fn init_params,
116 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
117 benchmark::utils::IsaCheckFunction isa_check = nullptr)
118 {
119 if (isa_check && !isa_check(state)) {
120 return;
121 }
122 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
123 state.SkipWithError("failed to initialize XNNPACK");
124 return;
125 }
126
127 // Nullify the microkernels to ensure we run JIT kernels.
128 for (size_t i = 0; i < XNN_MAX_MR; i++) {
129 xnn_params.f32.gemm.minmax.gemm[i] = xnn_init_hmp_gemm_ukernel(nullptr);
130 xnn_params.f32.gemm.minmax.igemm[i] = xnn_init_hmp_igemm_ukernel(nullptr);
131 }
132 xnn_params.f32.gemm.init.f32 = init_params;
133 xnn_params.f32.gemm.mr = mr;
134 xnn_params.f32.gemm.nr = nr;
135 xnn_params.f32.gemm.log2_kr = log2_kr;
136 xnn_params.f32.gemm.log2_sr = log2_sr;
137
138 xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(gemm_generator);
139 xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(gemm1_generator);
140 xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(igemm_generator);
141 xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(igemm1_generator);
142
143 auto execution_plan = model_factory(nullptr);
144 if (execution_plan.empty()) {
145 state.SkipWithError("failed to create a model");
146 return;
147 }
148
149 for (auto _ : state) {
150 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
151 xnn_status status = xnn_run_operator(op.get(), nullptr);
152 if (status != xnn_status_success) {
153 state.SkipWithError("failed to run a model");
154 return;
155 }
156 }
157 }
158
159 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
160 if (cpu_frequency != 0) {
161 state.counters["cpufreq"] = cpu_frequency;
162 }
163 }
164 #endif // XNN_PLATFORM_JIT
165
166 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)167 static void f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
168 GEMMEnd2EndBenchmark(state, model,
169 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
170 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
171 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
172 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
173 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
174 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
175 xnn_init_f32_minmax_scalar_params,
176 4 /* mr */, 2 /* nr */);
177 }
f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)178 static void f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
179 GEMMEnd2EndBenchmark(state, model,
180 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
181 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
182 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
183 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75,
184 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
185 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
186 xnn_init_f32_minmax_scalar_params,
187 4 /* mr */, 2 /* nr */);
188 }
f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)189 static void f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
190 GEMMEnd2EndBenchmark(state, model,
191 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
192 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
193 xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
194 xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64,
195 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
196 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
197 xnn_init_f32_minmax_scalar_params,
198 4 /* mr */, 2 /* nr */);
199 }
f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)200 static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
201 GEMMEnd2EndBenchmark(state, model,
202 xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
203 xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53,
204 xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
205 xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53,
206 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
207 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
208 xnn_init_f32_minmax_scalar_params,
209 4 /* mr */, 12 /* nr */);
210 }
f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)211 static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
212 GEMMEnd2EndBenchmark(state, model,
213 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
214 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53,
215 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
216 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
217 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
218 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
219 xnn_init_f32_minmax_scalar_params,
220 4 /* mr */, 8 /* nr */);
221 }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)222 static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
223 GEMMEnd2EndBenchmark(state, model,
224 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53,
225 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53,
226 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
227 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
228 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
229 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
230 xnn_init_f32_minmax_scalar_params,
231 4 /* mr */, 8 /* nr */);
232 }
f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)233 static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
234 GEMMEnd2EndBenchmark(state, model,
235 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
236 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55,
237 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
238 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
239 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
240 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
241 xnn_init_f32_minmax_scalar_params,
242 4 /* mr */, 8 /* nr */);
243 }
f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)244 static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
245 GEMMEnd2EndBenchmark(state, model,
246 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
247 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75,
248 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
249 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
250 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
251 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
252 xnn_init_f32_minmax_scalar_params,
253 4 /* mr */, 8 /* nr */);
254 }
f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)255 static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
256 GEMMEnd2EndBenchmark(state, model,
257 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
258 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
259 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
260 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
261 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
262 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
263 xnn_init_f32_minmax_scalar_params,
264 4 /* mr */, 8 /* nr */);
265 }
f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)266 static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
267 GEMMEnd2EndBenchmark(state, model,
268 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
269 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64,
270 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
271 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
272 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
273 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
274 xnn_init_f32_minmax_scalar_params,
275 4 /* mr */, 8 /* nr */);
276 }
f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)277 static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
278 GEMMEnd2EndBenchmark(state, model,
279 xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
280 xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128,
281 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
282 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
283 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
284 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
285 xnn_init_f32_minmax_scalar_params,
286 4 /* mr */, 8 /* nr */);
287 }
f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)288 static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
289 GEMMEnd2EndBenchmark(state, model,
290 xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
291 xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75,
292 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
293 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
294 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
295 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
296 xnn_init_f32_minmax_scalar_params,
297 5 /* mr */, 8 /* nr */);
298 }
f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)299 static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
300 GEMMEnd2EndBenchmark(state, model,
301 xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
302 xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75,
303 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
304 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
305 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
306 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
307 xnn_init_f32_minmax_scalar_params,
308 5 /* mr */, 8 /* nr */);
309 }
f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)310 static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
311 GEMMEnd2EndBenchmark(state, model,
312 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
313 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53,
314 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
315 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
316 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
317 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
318 xnn_init_f32_minmax_scalar_params,
319 6 /* mr */, 8 /* nr */);
320 }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)321 static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
322 GEMMEnd2EndBenchmark(state, model,
323 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53,
324 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53,
325 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
326 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
327 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
328 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
329 xnn_init_f32_minmax_scalar_params,
330 6 /* mr */, 8 /* nr */);
331 }
f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)332 static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
333 GEMMEnd2EndBenchmark(state, model,
334 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
335 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
336 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
337 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
338 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
339 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
340 xnn_init_f32_minmax_scalar_params,
341 6 /* mr */, 8 /* nr */);
342 }
f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State & state,models::ExecutionPlanFactory model)343 static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, models::ExecutionPlanFactory model) {
344 GEMMEnd2EndBenchmark(state, model,
345 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
346 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73,
347 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
348 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
349 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
350 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
351 xnn_init_f32_minmax_scalar_params,
352 6 /* mr */, 8 /* nr */);
353 }
f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)354 static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
355 GEMMEnd2EndBenchmark(state, model,
356 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
357 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
358 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
359 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75,
360 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
361 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
362 xnn_init_f32_minmax_scalar_params,
363 6 /* mr */, 8 /* nr */);
364 }
f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)365 static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
366 GEMMEnd2EndBenchmark(state, model,
367 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
368 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75,
369 xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
370 xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
371 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
372 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
373 xnn_init_f32_minmax_scalar_params,
374 6 /* mr */, 8 /* nr */);
375 }
f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State & state,models::ExecutionPlanFactory model)376 static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
377 GEMMEnd2EndBenchmark(state, model,
378 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
379 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
380 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
381 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
382 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
383 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
384 xnn_init_f32_minmax_scalar_params,
385 6 /* mr */, 8 /* nr */);
386 }
f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State & state,models::ExecutionPlanFactory model)387 static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
388 GEMMEnd2EndBenchmark(state, model,
389 xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
390 xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128,
391 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
392 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
393 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
394 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
395 xnn_init_f32_minmax_scalar_params,
396 6 /* mr */, 8 /* nr */);
397 }
f32_gemm_4x2__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)398 static void f32_gemm_4x2__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
399 GEMMEnd2EndBenchmark(state, model,
400 xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64,
401 xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64,
402 xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64,
403 xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64,
404 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
405 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
406 xnn_init_f32_minmax_scalar_params,
407 4 /* mr */, 2 /* nr */);
408 }
f32_gemm_6x2__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)409 static void f32_gemm_6x2__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
410 GEMMEnd2EndBenchmark(state, model,
411 xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64,
412 xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64,
413 xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64,
414 xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64,
415 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
416 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
417 xnn_init_f32_minmax_scalar_params,
418 6 /* mr */, 2 /* nr */);
419 }
f32_gemm_4x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)420 static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
421 GEMMEnd2EndBenchmark(state, model,
422 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64,
423 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64,
424 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
425 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
426 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
427 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
428 xnn_init_f32_minmax_scalar_params,
429 4 /* mr */, 8 /* nr */);
430 }
f32_gemm_4x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)431 static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
432 GEMMEnd2EndBenchmark(state, model,
433 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128,
434 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128,
435 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
436 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
437 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
438 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
439 xnn_init_f32_minmax_scalar_params,
440 4 /* mr */, 8 /* nr */);
441 }
f32_gemm_6x8__neonfma_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)442 static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
443 GEMMEnd2EndBenchmark(state, model,
444 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64,
445 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64,
446 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
447 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
448 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
449 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
450 xnn_init_f32_minmax_scalar_params,
451 6 /* mr */, 8 /* nr */);
452 }
f32_gemm_6x8__neonfma_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)453 static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
454 GEMMEnd2EndBenchmark(state, model,
455 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128,
456 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128,
457 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64,
458 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64,
459 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
460 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
461 xnn_init_f32_minmax_scalar_params,
462 6 /* mr */, 8 /* nr */);
463 }
464
465 BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_cortex_a75)
466 BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75)
467 BENCHMARK_FP32_END2END(f32_gemm_4x2__aarch64_neonfma_ld64)
468 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld64)
469 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_ld128);
470 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld64);
471 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_ld128);
472 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
473 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53)
474 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
475 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
476 BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
477 BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_cortex_a75);
478 BENCHMARK_FP32_END2END(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75);
479 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a53);
480 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53);
481 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a55);
482 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
483 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
484 BENCHMARK_FP32_END2END(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75);
485 BENCHMARK_FP32_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53);
486
487 BENCHMARK_FP32_END2END(f32_gemm_4x2__neonfma_lane_ld64);
488 BENCHMARK_FP32_END2END(f32_gemm_6x2__neonfma_lane_ld64);
489
490 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld64);
491 BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_lane_ld128);
492
493 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld64);
494 BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_lane_ld128);
495 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
496
497 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)498 static void jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(
499 benchmark::State &state, models::ExecutionPlanFactory model) {
500 GEMMEnd2EndBenchmark(
501 state, model,
502 xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
503 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
504 xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
505 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
506 xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
507 }
jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)508 static void jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(
509 benchmark::State &state, models::ExecutionPlanFactory model) {
510 GEMMEnd2EndBenchmark(
511 state, model,
512 xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
513 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
514 xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75,
515 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
516 xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
517 }
jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)518 static void jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75(
519 benchmark::State &state, models::ExecutionPlanFactory model) {
520 GEMMEnd2EndBenchmark(
521 state, model,
522 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
523 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
524 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
525 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
526 xnn_init_f32_minmax_scalar_params, 1 /* mr */, 8 /* nr */);
527 }
jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)528 static void jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75(
529 benchmark::State &state, models::ExecutionPlanFactory model) {
530 GEMMEnd2EndBenchmark(
531 state, model,
532 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
533 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
534 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
535 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
536 xnn_init_f32_minmax_scalar_params, 2 /* mr */, 8 /* nr */);
537 }
jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)538 static void jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75(
539 benchmark::State &state, models::ExecutionPlanFactory model) {
540 GEMMEnd2EndBenchmark(
541 state, model,
542 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
543 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
544 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
545 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
546 xnn_init_f32_minmax_scalar_params, 3 /* mr */, 8 /* nr */);
547 }
jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)548 static void jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75(
549 benchmark::State &state, models::ExecutionPlanFactory model) {
550 GEMMEnd2EndBenchmark(
551 state, model,
552 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
553 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
554 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
555 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
556 xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
557 }
jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)558 static void jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75(
559 benchmark::State &state, models::ExecutionPlanFactory model) {
560 GEMMEnd2EndBenchmark(
561 state, model,
562 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
563 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
564 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
565 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
566 xnn_init_f32_minmax_scalar_params, 5 /* mr */, 8 /* nr */);
567 }
jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)568 static void jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75(
569 benchmark::State &state, models::ExecutionPlanFactory model) {
570 GEMMEnd2EndBenchmark(
571 state, model,
572 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
573 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
574 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_cortex_a75,
575 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
576 xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
577 }
jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)578 static void jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75(
579 benchmark::State &state, models::ExecutionPlanFactory model) {
580 GEMMEnd2EndBenchmark(
581 state, model,
582 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
583 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
584 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
585 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
586 xnn_init_f32_minmax_scalar_params, 1 /* mr */, 8 /* nr */);
587 }
jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)588 static void jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75(
589 benchmark::State &state, models::ExecutionPlanFactory model) {
590 GEMMEnd2EndBenchmark(
591 state, model,
592 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
593 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
594 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
595 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
596 xnn_init_f32_minmax_scalar_params, 2 /* mr */, 8 /* nr */);
597 }
jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)598 static void jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75(
599 benchmark::State &state, models::ExecutionPlanFactory model) {
600 GEMMEnd2EndBenchmark(
601 state, model,
602 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
603 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
604 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
605 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
606 xnn_init_f32_minmax_scalar_params, 3 /* mr */, 8 /* nr */);
607 }
jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)608 static void jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75(
609 benchmark::State &state, models::ExecutionPlanFactory model) {
610 GEMMEnd2EndBenchmark(
611 state, model,
612 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
613 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
614 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
615 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
616 xnn_init_f32_minmax_scalar_params, 4 /* mr */, 8 /* nr */);
617 }
jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)618 static void jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75(
619 benchmark::State &state, models::ExecutionPlanFactory model) {
620 GEMMEnd2EndBenchmark(
621 state, model,
622 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
623 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
624 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
625 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
626 xnn_init_f32_minmax_scalar_params, 5 /* mr */, 8 /* nr */);
627 }
jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)628 static void jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75(
629 benchmark::State &state, models::ExecutionPlanFactory model) {
630 GEMMEnd2EndBenchmark(
631 state, model,
632 xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
633 xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
634 xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
635 xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
636 xnn_init_f32_minmax_scalar_params, 6 /* mr */, 8 /* nr */);
637 }
638
// Register each jit_f32_gemm_* wrapper defined above via BENCHMARK_FP32_END2END:
// the two fixed 4x8 variants first, then the upto6x8 variants for mr = 1..6,
// without and with the prfm suffix.
BENCHMARK_FP32_END2END(jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75);
BENCHMARK_FP32_END2END(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75);
653
654 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
655
656 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
f32_gemm_4x8__aarch32_neon_ld64(benchmark::State & state,models::ExecutionPlanFactory model)657 static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
658 GEMMEnd2EndBenchmark(state, model,
659 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64,
660 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64,
661 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
662 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
663 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
664 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
665 xnn_init_f32_minmax_scalar_params,
666 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
667 benchmark::utils::CheckNEON);
668 }
f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)669 static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
670 GEMMEnd2EndBenchmark(state, model,
671 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
672 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7,
673 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
674 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
675 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
676 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
677 xnn_init_f32_minmax_scalar_params,
678 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
679 benchmark::utils::CheckNEON);
680 }
f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)681 static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
682 GEMMEnd2EndBenchmark(state, model,
683 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
684 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53,
685 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
686 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
687 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
688 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
689 xnn_init_f32_minmax_scalar_params,
690 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
691 benchmark::utils::CheckNEON);
692 }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)693 static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
694 GEMMEnd2EndBenchmark(state, model,
695 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
696 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
697 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
698 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
699 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
700 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
701 xnn_init_f32_minmax_scalar_params,
702 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
703 benchmark::utils::CheckNEON);
704 }
f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)705 static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
706 GEMMEnd2EndBenchmark(state, model,
707 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
708 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
709 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
710 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
711 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
712 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
713 xnn_init_f32_minmax_scalar_params,
714 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
715 benchmark::utils::CheckNEON);
716 }
f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)717 static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
718 GEMMEnd2EndBenchmark(state, model,
719 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
720 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
721 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
722 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
723 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
724 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
725 xnn_init_f32_minmax_scalar_params,
726 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
727 benchmark::utils::CheckNEON);
728 }
f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)729 static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
730 GEMMEnd2EndBenchmark(state, model,
731 xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
732 xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
733 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
734 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
735 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
736 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
737 xnn_init_f32_minmax_scalar_params,
738 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
739 benchmark::utils::CheckNEON);
740 }
741
// Register each f32_gemm_4x8__aarch32_neon_* wrapper defined above via
// BENCHMARK_FP32_END2END.
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_ld64);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a7);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a53);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
BENCHMARK_FP32_END2END(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75);
749 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
750
751
752 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
f32_gemm_4x2__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)753 static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
754 GEMMEnd2EndBenchmark(state, model,
755 xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64,
756 xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64,
757 xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64,
758 xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64,
759 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
760 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
761 xnn_init_f32_minmax_scalar_params,
762 4 /* mr */, 2 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
763 benchmark::utils::CheckNEON);
764 }
765
f32_gemm_6x2__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)766 static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
767 GEMMEnd2EndBenchmark(state, model,
768 xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64,
769 xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64,
770 xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64,
771 xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64,
772 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
773 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
774 xnn_init_f32_minmax_scalar_params,
775 6 /* mr */, 2 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
776 benchmark::utils::CheckNEON);
777 }
778
f32_gemm_4x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)779 static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
780 GEMMEnd2EndBenchmark(state, model,
781 xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
782 xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64,
783 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
784 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
785 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
786 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
787 xnn_init_f32_minmax_scalar_params,
788 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
789 benchmark::utils::CheckNEON);
790 }
791
f32_gemm_4x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)792 static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
793 GEMMEnd2EndBenchmark(state, model,
794 xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128,
795 xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128,
796 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
797 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
798 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
799 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
800 xnn_init_f32_minmax_scalar_params,
801 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
802 benchmark::utils::CheckNEON);
803 }
804
f32_gemm_6x8__neon_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)805 static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
806 GEMMEnd2EndBenchmark(state, model,
807 xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64,
808 xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64,
809 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
810 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
811 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
812 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
813 xnn_init_f32_minmax_scalar_params,
814 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
815 benchmark::utils::CheckNEON);
816 }
817
f32_gemm_6x8__neon_lane_ld128(benchmark::State & state,models::ExecutionPlanFactory model)818 static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
819 GEMMEnd2EndBenchmark(state, model,
820 xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
821 xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128,
822 xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64,
823 xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64,
824 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
825 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
826 xnn_init_f32_minmax_scalar_params,
827 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
828 benchmark::utils::CheckNEON);
829 }
830
f32_gemm_4x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)831 static void f32_gemm_4x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
832 GEMMEnd2EndBenchmark(state, model,
833 xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld64,
834 xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64,
835 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
836 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
837 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
838 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
839 xnn_init_f32_minmax_scalar_params,
840 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
841 benchmark::utils::CheckNEON);
842 }
843
f32_gemm_4x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)844 static void f32_gemm_4x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
845 GEMMEnd2EndBenchmark(state, model,
846 xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
847 xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128,
848 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
849 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
850 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
851 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
852 xnn_init_f32_minmax_scalar_params,
853 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
854 benchmark::utils::CheckNEON);
855 }
856
f32_gemm_6x8__neon_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)857 static void f32_gemm_6x8__neon_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
858 GEMMEnd2EndBenchmark(state, model,
859 xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
860 xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64,
861 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
862 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
863 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
864 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
865 xnn_init_f32_minmax_scalar_params,
866 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
867 benchmark::utils::CheckNEON);
868 }
869
f32_gemm_6x8__neon_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)870 static void f32_gemm_6x8__neon_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
871 GEMMEnd2EndBenchmark(state, model,
872 xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld128,
873 xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128,
874 xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64,
875 xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64,
876 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
877 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
878 xnn_init_f32_minmax_scalar_params,
879 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
880 benchmark::utils::CheckNEON);
881 }
882
f32_gemm_4x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)883 static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
884 GEMMEnd2EndBenchmark(state, model,
885 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64,
886 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64,
887 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
888 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
889 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
890 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
891 xnn_init_f32_minmax_scalar_params,
892 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
893 benchmark::utils::CheckNEONFMA);
894 }
895
f32_gemm_4x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)896 static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
897 GEMMEnd2EndBenchmark(state, model,
898 xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128,
899 xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128,
900 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
901 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
902 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
903 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
904 xnn_init_f32_minmax_scalar_params,
905 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
906 benchmark::utils::CheckNEONFMA);
907 }
908
f32_gemm_6x8__neonfma_dup_ld64(benchmark::State & state,models::ExecutionPlanFactory model)909 static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
910 GEMMEnd2EndBenchmark(state, model,
911 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64,
912 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64,
913 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
914 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
915 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
916 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
917 xnn_init_f32_minmax_scalar_params,
918 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
919 benchmark::utils::CheckNEONFMA);
920 }
921
f32_gemm_6x8__neonfma_dup_ld128(benchmark::State & state,models::ExecutionPlanFactory model)922 static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
923 GEMMEnd2EndBenchmark(state, model,
924 xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128,
925 xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128,
926 xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64,
927 xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64,
928 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
929 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
930 xnn_init_f32_minmax_scalar_params,
931 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
932 benchmark::utils::CheckNEONFMA);
933 }
934
f32_gemm_4x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)935 static void f32_gemm_4x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
936 GEMMEnd2EndBenchmark(state, model,
937 xnn_f32_gemm_minmax_ukernel_4x8s4__neon,
938 xnn_f32_igemm_minmax_ukernel_4x8s4__neon,
939 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
940 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
941 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
942 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
943 xnn_init_f32_minmax_scalar_params,
944 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
945 benchmark::utils::CheckNEON);
946 }
947
f32_gemm_4x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)948 static void f32_gemm_4x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
949 GEMMEnd2EndBenchmark(state, model,
950 xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma,
951 xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma,
952 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
953 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
954 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
955 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
956 xnn_init_f32_minmax_scalar_params,
957 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
958 benchmark::utils::CheckNEONFMA);
959 }
960
f32_gemm_6x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)961 static void f32_gemm_6x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
962 GEMMEnd2EndBenchmark(state, model,
963 xnn_f32_gemm_minmax_ukernel_6x8s4__neon,
964 xnn_f32_igemm_minmax_ukernel_6x8s4__neon,
965 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
966 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
967 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
968 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
969 xnn_init_f32_minmax_scalar_params,
970 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
971 benchmark::utils::CheckNEON);
972 }
973
f32_gemm_6x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)974 static void f32_gemm_6x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
975 GEMMEnd2EndBenchmark(state, model,
976 xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
977 xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma,
978 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
979 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
980 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
981 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
982 xnn_init_f32_minmax_scalar_params,
983 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
984 benchmark::utils::CheckNEONFMA);
985 }
986
f32_gemm_8x8s4__neon(benchmark::State & state,models::ExecutionPlanFactory model)987 static void f32_gemm_8x8s4__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
988 GEMMEnd2EndBenchmark(state, model,
989 xnn_f32_gemm_minmax_ukernel_8x8s4__neon,
990 xnn_f32_igemm_minmax_ukernel_8x8s4__neon,
991 xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
992 xnn_f32_igemm_minmax_ukernel_1x8s4__neon,
993 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
994 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
995 xnn_init_f32_minmax_scalar_params,
996 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
997 benchmark::utils::CheckNEON);
998 }
999
f32_gemm_8x8s4__neonfma(benchmark::State & state,models::ExecutionPlanFactory model)1000 static void f32_gemm_8x8s4__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
1001 GEMMEnd2EndBenchmark(state, model,
1002 xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
1003 xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma,
1004 xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
1005 xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma,
1006 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1007 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1008 xnn_init_f32_minmax_scalar_params,
1009 8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */,
1010 benchmark::utils::CheckNEONFMA);
1011 }
1012
// Register the NEON / NEON-FMA intrinsics wrappers defined above via
// BENCHMARK_FP32_END2END, grouped by kernel family.

// neon_lane kernels with NR=2.
BENCHMARK_FP32_END2END(f32_gemm_4x2__neon_lane_ld64);
BENCHMARK_FP32_END2END(f32_gemm_6x2__neon_lane_ld64);

// neon_lane kernels with NR=8.
BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld64);
BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_lane_ld128);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld64);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_lane_ld128);

// neon_dup kernels.
BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld64);
BENCHMARK_FP32_END2END(f32_gemm_4x8__neon_dup_ld128);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld64);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neon_dup_ld128);

// neonfma_dup kernels.
BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld64);
BENCHMARK_FP32_END2END(f32_gemm_4x8__neonfma_dup_ld128);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld64);
BENCHMARK_FP32_END2END(f32_gemm_6x8__neonfma_dup_ld128);

// s4 neon kernels.
BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neon);
BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neon);
BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neon);

// s4 neonfma kernels.
BENCHMARK_FP32_END2END(f32_gemm_4x8s4__neonfma);
BENCHMARK_FP32_END2END(f32_gemm_6x8s4__neonfma);
BENCHMARK_FP32_END2END(f32_gemm_8x8s4__neonfma);
1038 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1039
1040
1041 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
f32_gemm_4x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1042 static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1043 GEMMEnd2EndBenchmark(state, model,
1044 xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast,
1045 xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast,
1046 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1047 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1048 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1049 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1050 xnn_init_f32_minmax_scalar_params,
1051 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1052 benchmark::utils::CheckAVX512F);
1053 }
f32_gemm_5x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1054 static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1055 GEMMEnd2EndBenchmark(state, model,
1056 xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast,
1057 xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast,
1058 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1059 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1060 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1061 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1062 xnn_init_f32_minmax_scalar_params,
1063 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1064 benchmark::utils::CheckAVX512F);
1065 }
f32_gemm_6x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1066 static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1067 GEMMEnd2EndBenchmark(state, model,
1068 xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast,
1069 xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast,
1070 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1071 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1072 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1073 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1074 xnn_init_f32_minmax_scalar_params,
1075 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1076 benchmark::utils::CheckAVX512F);
1077 }
f32_gemm_7x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1078 static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1079 GEMMEnd2EndBenchmark(state, model,
1080 xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast,
1081 xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast,
1082 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1083 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1084 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1085 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1086 xnn_init_f32_minmax_scalar_params,
1087 7 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1088 benchmark::utils::CheckAVX512F);
1089 }
f32_gemm_8x16__avx512f_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1090 static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1091 GEMMEnd2EndBenchmark(state, model,
1092 xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast,
1093 xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast,
1094 xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
1095 xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast,
1096 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1097 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1098 xnn_init_f32_minmax_scalar_params,
1099 8 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1100 benchmark::utils::CheckAVX512F);
1101 }
1102
f32_gemm_4x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1103 static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1104 GEMMEnd2EndBenchmark(state, model,
1105 xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
1106 xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast,
1107 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1108 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1109 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1110 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1111 xnn_init_f32_minmax_avx_params,
1112 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1113 benchmark::utils::CheckFMA3);
1114 }
f32_gemm_5x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1115 static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1116 GEMMEnd2EndBenchmark(state, model,
1117 xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
1118 xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast,
1119 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1120 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1121 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1122 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1123 xnn_init_f32_minmax_avx_params,
1124 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1125 benchmark::utils::CheckFMA3);
1126 }
f32_gemm_6x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1127 static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1128 GEMMEnd2EndBenchmark(state, model,
1129 xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast,
1130 xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast,
1131 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1132 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1133 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1134 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1135 xnn_init_f32_minmax_avx_params,
1136 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1137 benchmark::utils::CheckFMA3);
1138 }
f32_gemm_7x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1139 static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1140 GEMMEnd2EndBenchmark(state, model,
1141 xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast,
1142 xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast,
1143 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1144 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1145 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1146 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1147 xnn_init_f32_minmax_avx_params,
1148 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1149 benchmark::utils::CheckFMA3);
1150 }
f32_gemm_8x8__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1151 static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1152 GEMMEnd2EndBenchmark(state, model,
1153 xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast,
1154 xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast,
1155 xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast,
1156 xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast,
1157 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1158 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1159 xnn_init_f32_minmax_avx_params,
1160 8 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1161 benchmark::utils::CheckFMA3);
1162 }
f32_gemm_3x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1163 static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1164 GEMMEnd2EndBenchmark(state, model,
1165 xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
1166 xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast,
1167 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1168 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1169 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1170 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1171 xnn_init_f32_minmax_avx_params,
1172 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1173 benchmark::utils::CheckFMA3);
1174 }
f32_gemm_4x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1175 static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1176 GEMMEnd2EndBenchmark(state, model,
1177 xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast,
1178 xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast,
1179 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1180 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1181 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1182 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1183 xnn_init_f32_minmax_avx_params,
1184 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1185 benchmark::utils::CheckFMA3);
1186 }
f32_gemm_5x16__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1187 static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1188 GEMMEnd2EndBenchmark(state, model,
1189 xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast,
1190 xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast,
1191 xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast,
1192 xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast,
1193 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1194 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1195 xnn_init_f32_minmax_avx_params,
1196 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1197 benchmark::utils::CheckFMA3);
1198 }
f32_gemm_3x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1199 static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1200 GEMMEnd2EndBenchmark(state, model,
1201 xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast,
1202 xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast,
1203 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1204 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1205 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1206 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1207 xnn_init_f32_minmax_avx_params,
1208 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1209 benchmark::utils::CheckFMA3);
1210 }
f32_gemm_4x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1211 static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1212 GEMMEnd2EndBenchmark(state, model,
1213 xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast,
1214 xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast,
1215 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1216 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1217 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1218 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1219 xnn_init_f32_minmax_avx_params,
1220 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1221 benchmark::utils::CheckFMA3);
1222 }
f32_gemm_5x16s4__fma3_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1223 static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1224 GEMMEnd2EndBenchmark(state, model,
1225 xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast,
1226 xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast,
1227 xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast,
1228 xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast,
1229 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1230 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1231 xnn_init_f32_minmax_avx_params,
1232 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 2 /* log2_sr */,
1233 benchmark::utils::CheckFMA3);
1234 }
1235
f32_gemm_4x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1236 static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1237 GEMMEnd2EndBenchmark(state, model,
1238 xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast,
1239 xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast,
1240 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1241 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1242 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1243 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1244 xnn_init_f32_minmax_avx_params,
1245 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1246 benchmark::utils::CheckAVX);
1247 }
f32_gemm_5x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1248 static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1249 GEMMEnd2EndBenchmark(state, model,
1250 xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
1251 xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast,
1252 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1253 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1254 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1255 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1256 xnn_init_f32_minmax_avx_params,
1257 5 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1258 benchmark::utils::CheckAVX);
1259 }
f32_gemm_6x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1260 static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1261 GEMMEnd2EndBenchmark(state, model,
1262 xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast,
1263 xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast,
1264 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1265 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1266 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1267 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1268 xnn_init_f32_minmax_avx_params,
1269 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1270 benchmark::utils::CheckAVX);
1271 }
f32_gemm_7x8__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1272 static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1273 GEMMEnd2EndBenchmark(state, model,
1274 xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast,
1275 xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast,
1276 xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
1277 xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast,
1278 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1279 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1280 xnn_init_f32_minmax_avx_params,
1281 7 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1282 benchmark::utils::CheckAVX);
1283 }
f32_gemm_3x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1284 static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1285 GEMMEnd2EndBenchmark(state, model,
1286 xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
1287 xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast,
1288 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1289 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1290 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1291 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1292 xnn_init_f32_minmax_avx_params,
1293 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1294 benchmark::utils::CheckAVX);
1295 }
f32_gemm_4x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1296 static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1297 GEMMEnd2EndBenchmark(state, model,
1298 xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast,
1299 xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast,
1300 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1301 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1302 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1303 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1304 xnn_init_f32_minmax_avx_params,
1305 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1306 benchmark::utils::CheckAVX);
1307 }
f32_gemm_5x16__avx_broadcast(benchmark::State & state,models::ExecutionPlanFactory model)1308 static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, models::ExecutionPlanFactory model) {
1309 GEMMEnd2EndBenchmark(state, model,
1310 xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast,
1311 xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast,
1312 xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast,
1313 xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast,
1314 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1315 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1316 xnn_init_f32_minmax_avx_params,
1317 5 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
1318 benchmark::utils::CheckAVX);
1319 }
1320
f32_gemm_3x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1321 static void f32_gemm_3x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1322 GEMMEnd2EndBenchmark(state, model,
1323 xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup,
1324 xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup,
1325 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1326 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1327 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1328 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1329 xnn_init_f32_minmax_sse_params,
1330 3 /* mr */, 8 /* nr */);
1331 }
f32_gemm_4x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1332 static void f32_gemm_4x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1333 GEMMEnd2EndBenchmark(state, model,
1334 xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup,
1335 xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup,
1336 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1337 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1338 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1339 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1340 xnn_init_f32_minmax_sse_params,
1341 4 /* mr */, 8 /* nr */);
1342 }
f32_gemm_5x8__sse2_dup(benchmark::State & state,models::ExecutionPlanFactory model)1343 static void f32_gemm_5x8__sse2_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1344 GEMMEnd2EndBenchmark(state, model,
1345 xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
1346 xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup,
1347 xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup,
1348 xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup,
1349 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1350 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1351 xnn_init_f32_minmax_sse_params,
1352 5 /* mr */, 8 /* nr */);
1353 }
1354
f32_gemm_3x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1355 static void f32_gemm_3x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1356 GEMMEnd2EndBenchmark(state, model,
1357 xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
1358 xnn_f32_igemm_minmax_ukernel_3x8__sse_load1,
1359 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1360 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1361 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1362 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1363 xnn_init_f32_minmax_sse_params,
1364 3 /* mr */, 8 /* nr */);
1365 }
f32_gemm_4x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1366 static void f32_gemm_4x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1367 GEMMEnd2EndBenchmark(state, model,
1368 xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
1369 xnn_f32_igemm_minmax_ukernel_4x8__sse_load1,
1370 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1371 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1372 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1373 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1374 xnn_init_f32_minmax_sse_params,
1375 4 /* mr */, 8 /* nr */);
1376 }
f32_gemm_5x8__sse_load1(benchmark::State & state,models::ExecutionPlanFactory model)1377 static void f32_gemm_5x8__sse_load1(benchmark::State& state, models::ExecutionPlanFactory model) {
1378 GEMMEnd2EndBenchmark(state, model,
1379 xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
1380 xnn_f32_igemm_minmax_ukernel_5x8__sse_load1,
1381 xnn_f32_gemm_minmax_ukernel_1x8__sse_load1,
1382 xnn_f32_igemm_minmax_ukernel_1x8__sse_load1,
1383 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1384 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1385 xnn_init_f32_minmax_sse_params,
1386 5 /* mr */, 8 /* nr */);
1387 }
f32_gemm_3x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1388 static void f32_gemm_3x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1389 GEMMEnd2EndBenchmark(state, model,
1390 xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
1391 xnn_f32_igemm_minmax_ukernel_3x8__sse_dup,
1392 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1393 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1394 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1395 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1396 xnn_init_f32_minmax_sse_params,
1397 3 /* mr */, 8 /* nr */);
1398 }
f32_gemm_4x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1399 static void f32_gemm_4x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1400 GEMMEnd2EndBenchmark(state, model,
1401 xnn_f32_gemm_minmax_ukernel_4x8__sse_dup,
1402 xnn_f32_igemm_minmax_ukernel_4x8__sse_dup,
1403 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1404 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1405 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1406 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1407 xnn_init_f32_minmax_sse_params,
1408 4 /* mr */, 8 /* nr */);
1409 }
f32_gemm_5x8__sse_dup(benchmark::State & state,models::ExecutionPlanFactory model)1410 static void f32_gemm_5x8__sse_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1411 GEMMEnd2EndBenchmark(state, model,
1412 xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
1413 xnn_f32_igemm_minmax_ukernel_5x8__sse_dup,
1414 xnn_f32_gemm_minmax_ukernel_1x8__sse_dup,
1415 xnn_f32_igemm_minmax_ukernel_1x8__sse_dup,
1416 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1417 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1418 xnn_init_f32_minmax_sse_params,
1419 5 /* mr */, 8 /* nr */);
1420 }
f32_gemm_3x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1421 static void f32_gemm_3x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1422 GEMMEnd2EndBenchmark(state, model,
1423 xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
1424 xnn_f32_igemm_minmax_ukernel_3x8s4__sse,
1425 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1426 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1427 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1428 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1429 xnn_init_f32_minmax_sse_params,
1430 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1431 }
f32_gemm_4x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1432 static void f32_gemm_4x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1433 GEMMEnd2EndBenchmark(state, model,
1434 xnn_f32_gemm_minmax_ukernel_4x8s4__sse,
1435 xnn_f32_igemm_minmax_ukernel_4x8s4__sse,
1436 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1437 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1438 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1439 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1440 xnn_init_f32_minmax_sse_params,
1441 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1442 }
f32_gemm_5x8s4__sse(benchmark::State & state,models::ExecutionPlanFactory model)1443 static void f32_gemm_5x8s4__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
1444 GEMMEnd2EndBenchmark(state, model,
1445 xnn_f32_gemm_minmax_ukernel_5x8s4__sse,
1446 xnn_f32_igemm_minmax_ukernel_5x8s4__sse,
1447 xnn_f32_gemm_minmax_ukernel_1x8s4__sse,
1448 xnn_f32_igemm_minmax_ukernel_1x8s4__sse,
1449 nullptr /* gemm_relu */, nullptr /* igemm_relu */, nullptr /* gemm1_relu */, nullptr /* igemm1_relu */,
1450 nullptr /* gemm */, nullptr /* igemm */, nullptr /* gemm1 */, nullptr /* igemm1 */,
1451 xnn_init_f32_minmax_sse_params,
1452 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1453 }
1454
// BENCHMARK_FP32_END2END invocations for the x86 F32 GEMM wrappers, one
// wrapper function per invocation, grouped by kernel family.
// NOTE(review): the macro is not defined in this part of the file (presumably
// bench/end2end.h) -- confirm there what each invocation expands to.

// AVX512F broadcast wrappers (16-wide tiles).
1455 BENCHMARK_FP32_END2END(f32_gemm_4x16__avx512f_broadcast);
1456 BENCHMARK_FP32_END2END(f32_gemm_5x16__avx512f_broadcast);
1457 BENCHMARK_FP32_END2END(f32_gemm_6x16__avx512f_broadcast);
1458 BENCHMARK_FP32_END2END(f32_gemm_7x16__avx512f_broadcast);
1459 BENCHMARK_FP32_END2END(f32_gemm_8x16__avx512f_broadcast);
1460
// FMA3 broadcast wrappers (8-wide, then 16-wide tiles).
1461 BENCHMARK_FP32_END2END(f32_gemm_4x8__fma3_broadcast);
1462 BENCHMARK_FP32_END2END(f32_gemm_5x8__fma3_broadcast);
1463 BENCHMARK_FP32_END2END(f32_gemm_6x8__fma3_broadcast);
1464 BENCHMARK_FP32_END2END(f32_gemm_7x8__fma3_broadcast);
1465 BENCHMARK_FP32_END2END(f32_gemm_8x8__fma3_broadcast);
1466 BENCHMARK_FP32_END2END(f32_gemm_3x16__fma3_broadcast);
1467 BENCHMARK_FP32_END2END(f32_gemm_4x16__fma3_broadcast);
1468 BENCHMARK_FP32_END2END(f32_gemm_5x16__fma3_broadcast);
1469
// FMA3 s4 broadcast wrappers (log2_sr = 2).
1470 BENCHMARK_FP32_END2END(f32_gemm_3x16s4__fma3_broadcast);
1471 BENCHMARK_FP32_END2END(f32_gemm_4x16s4__fma3_broadcast);
1472 BENCHMARK_FP32_END2END(f32_gemm_5x16s4__fma3_broadcast);
1473
// AVX broadcast wrappers (8-wide, then 16-wide tiles).
1474 BENCHMARK_FP32_END2END(f32_gemm_4x8__avx_broadcast);
1475 BENCHMARK_FP32_END2END(f32_gemm_5x8__avx_broadcast);
1476 BENCHMARK_FP32_END2END(f32_gemm_6x8__avx_broadcast);
1477 BENCHMARK_FP32_END2END(f32_gemm_7x8__avx_broadcast);
1478 BENCHMARK_FP32_END2END(f32_gemm_3x16__avx_broadcast);
1479 BENCHMARK_FP32_END2END(f32_gemm_4x16__avx_broadcast);
1480 BENCHMARK_FP32_END2END(f32_gemm_5x16__avx_broadcast);
1481
// SSE2 dup wrappers.
1482 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse2_dup);
1483 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse2_dup);
1484 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse2_dup);
1485
// SSE load1 wrappers.
1486 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_load1);
1487 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_load1);
1488 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_load1);
1489
// SSE dup wrappers.
1490 BENCHMARK_FP32_END2END(f32_gemm_3x8__sse_dup);
1491 BENCHMARK_FP32_END2END(f32_gemm_4x8__sse_dup);
1492 BENCHMARK_FP32_END2END(f32_gemm_5x8__sse_dup);
1493
// SSE s4 wrappers (log2_sr = 2).
1494 BENCHMARK_FP32_END2END(f32_gemm_3x8s4__sse);
1495 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__sse);
1496 BENCHMARK_FP32_END2END(f32_gemm_5x8s4__sse);
1497 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1498
1499
1500 #if XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1501 static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1502 GEMMEnd2EndBenchmark(state, model,
1503 xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
1504 xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
1505 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1506 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1507 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1508 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1509 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1510 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1511 xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
1512 xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
1513 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1514 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1515 xnn_init_f32_minmax_wasmsimd_params,
1516 3 /* mr */, 8 /* nr */);
1517 }
f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1518 static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1519 GEMMEnd2EndBenchmark(state, model,
1520 xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
1521 xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
1522 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1523 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1524 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1525 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1526 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1527 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1528 xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
1529 xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
1530 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1531 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1532 xnn_init_f32_minmax_wasmsimd_params,
1533 4 /* mr */, 8 /* nr */);
1534 }
f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1535 static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1536 GEMMEnd2EndBenchmark(state, model,
1537 xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
1538 xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
1539 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1540 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1541 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1542 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1543 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1544 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1545 xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
1546 xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
1547 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1548 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1549 xnn_init_f32_minmax_wasmsimd_params,
1550 5 /* mr */, 8 /* nr */);
1551 }
f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1552 static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1553 GEMMEnd2EndBenchmark(state, model,
1554 xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
1555 xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
1556 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1557 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat,
1558 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
1559 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
1560 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1561 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1562 xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
1563 xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
1564 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1565 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1566 xnn_init_f32_minmax_wasmsimd_params,
1567 6 /* mr */, 8 /* nr */);
1568 }
f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1569 static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1570 GEMMEnd2EndBenchmark(state, model,
1571 xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1572 xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1573 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1574 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1575 xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1576 xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1577 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1578 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1579 xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1580 xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat,
1581 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1582 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1583 xnn_init_f32_minmax_wasmsimd_params,
1584 3 /* mr */, 8 /* nr */);
1585 }
f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1586 static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1587 GEMMEnd2EndBenchmark(state, model,
1588 xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1589 xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1590 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1591 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1592 xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1593 xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1594 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1595 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1596 xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1597 xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat,
1598 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1599 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1600 xnn_init_f32_minmax_wasmsimd_params,
1601 4 /* mr */, 8 /* nr */);
1602 }
f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1603 static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1604 GEMMEnd2EndBenchmark(state, model,
1605 xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1606 xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1607 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1608 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1609 xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1610 xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1611 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1612 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1613 xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1614 xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat,
1615 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1616 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1617 xnn_init_f32_minmax_wasmsimd_params,
1618 5 /* mr */, 8 /* nr */);
1619 }
f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1620 static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1621 GEMMEnd2EndBenchmark(state, model,
1622 xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1623 xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1624 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1625 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1626 xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1627 xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1628 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1629 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1630 xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1631 xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat,
1632 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1633 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat,
1634 xnn_init_f32_minmax_wasmsimd_params,
1635 6 /* mr */, 8 /* nr */);
1636 }
f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1637 static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1638 GEMMEnd2EndBenchmark(state, model,
1639 xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
1640 xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
1641 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1642 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1643 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
1644 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
1645 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1646 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1647 xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
1648 xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
1649 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1650 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1651 xnn_init_f32_minmax_wasmsimd_params,
1652 3 /* mr */, 8 /* nr */);
1653 }
f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1654 static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1655 GEMMEnd2EndBenchmark(state, model,
1656 xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
1657 xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
1658 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1659 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1660 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
1661 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
1662 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1663 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1664 xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
1665 xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
1666 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1667 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1668 xnn_init_f32_minmax_wasmsimd_params,
1669 4 /* mr */, 8 /* nr */);
1670 }
f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1671 static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1672 GEMMEnd2EndBenchmark(state, model,
1673 xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat,
1674 xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat,
1675 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1676 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1677 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
1678 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
1679 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1680 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1681 xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
1682 xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
1683 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1684 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1685 xnn_init_f32_minmax_wasmsimd_params,
1686 5 /* mr */, 8 /* nr */);
1687 }
f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State & state,models::ExecutionPlanFactory model)1688 static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1689 GEMMEnd2EndBenchmark(state, model,
1690 xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat,
1691 xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat,
1692 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1693 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_splat,
1694 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
1695 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
1696 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
1697 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
1698 xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
1699 xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
1700 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
1701 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
1702 xnn_init_f32_minmax_wasmsimd_params,
1703 6 /* mr */, 8 /* nr */);
1704 }
f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1705 static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1706 GEMMEnd2EndBenchmark(state, model,
1707 xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1708 xnn_f32_igemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1709 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1710 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1711 xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1712 xnn_f32_igemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1713 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1714 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1715 xnn_f32_gemm_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1716 xnn_f32_igemm_ukernel_3x8__wasmrelaxedsimd_fma_splat,
1717 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1718 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1719 xnn_init_f32_minmax_wasmsimd_params,
1720 3 /* mr */, 8 /* nr */);
1721 }
f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1722 static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1723 GEMMEnd2EndBenchmark(state, model,
1724 xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1725 xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1726 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1727 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1728 xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1729 xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1730 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1731 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1732 xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1733 xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_splat,
1734 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1735 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1736 xnn_init_f32_minmax_wasmsimd_params,
1737 4 /* mr */, 8 /* nr */);
1738 }
f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1739 static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1740 GEMMEnd2EndBenchmark(state, model,
1741 xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1742 xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1743 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1744 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1745 xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1746 xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1747 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1748 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1749 xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1750 xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_splat,
1751 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1752 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1753 xnn_init_f32_minmax_wasmsimd_params,
1754 5 /* mr */, 8 /* nr */);
1755 }
f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State & state,models::ExecutionPlanFactory model)1756 static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
1757 GEMMEnd2EndBenchmark(state, model,
1758 xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1759 xnn_f32_igemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1760 xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1761 xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1762 xnn_f32_gemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1763 xnn_f32_igemm_relu_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1764 xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1765 xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1766 xnn_f32_gemm_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1767 xnn_f32_igemm_ukernel_6x8__wasmrelaxedsimd_fma_splat,
1768 xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1769 xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat,
1770 xnn_init_f32_minmax_wasmsimd_params,
1771 6 /* mr */, 8 /* nr */);
1772 }
f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1773 static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1774 GEMMEnd2EndBenchmark(state, model,
1775 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd,
1776 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd,
1777 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1778 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1779 xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
1780 xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
1781 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1782 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1783 xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
1784 xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
1785 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1786 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1787 xnn_init_f32_minmax_wasmsimd_params,
1788 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1789 }
f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1790 static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1791 GEMMEnd2EndBenchmark(state, model,
1792 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd,
1793 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd,
1794 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1795 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1796 xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
1797 xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
1798 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1799 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1800 xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
1801 xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
1802 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1803 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1804 xnn_init_f32_minmax_wasmsimd_params,
1805 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1806 }
f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1807 static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1808 GEMMEnd2EndBenchmark(state, model,
1809 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd,
1810 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd,
1811 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1812 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1813 xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
1814 xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
1815 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1816 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1817 xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
1818 xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
1819 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1820 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1821 xnn_init_f32_minmax_wasmsimd_params,
1822 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1823 }
f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State & state,models::ExecutionPlanFactory model)1824 static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, models::ExecutionPlanFactory model) {
1825 GEMMEnd2EndBenchmark(state, model,
1826 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd,
1827 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd,
1828 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1829 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd,
1830 xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
1831 xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
1832 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
1833 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
1834 xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
1835 xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
1836 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
1837 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
1838 xnn_init_f32_minmax_wasmsimd_params,
1839 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1840 }
f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1841 static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1842 GEMMEnd2EndBenchmark(state, model,
1843 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
1844 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
1845 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1846 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1847 xnn_f32_gemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma,
1848 xnn_f32_igemm_relu_ukernel_3x8s4__wasmrelaxedsimd_fma,
1849 xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1850 xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1851 xnn_f32_gemm_ukernel_3x8s4__wasmrelaxedsimd_fma,
1852 xnn_f32_igemm_ukernel_3x8s4__wasmrelaxedsimd_fma,
1853 xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1854 xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1855 xnn_init_f32_minmax_wasmsimd_params,
1856 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1857 }
f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1858 static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1859 GEMMEnd2EndBenchmark(state, model,
1860 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
1861 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
1862 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1863 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1864 xnn_f32_gemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma,
1865 xnn_f32_igemm_relu_ukernel_4x8s4__wasmrelaxedsimd_fma,
1866 xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1867 xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1868 xnn_f32_gemm_ukernel_4x8s4__wasmrelaxedsimd_fma,
1869 xnn_f32_igemm_ukernel_4x8s4__wasmrelaxedsimd_fma,
1870 xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1871 xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1872 xnn_init_f32_minmax_wasmsimd_params,
1873 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1874 }
f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1875 static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1876 GEMMEnd2EndBenchmark(state, model,
1877 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma,
1878 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma,
1879 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1880 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1881 xnn_f32_gemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma,
1882 xnn_f32_igemm_relu_ukernel_5x8s4__wasmrelaxedsimd_fma,
1883 xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1884 xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1885 xnn_f32_gemm_ukernel_5x8s4__wasmrelaxedsimd_fma,
1886 xnn_f32_igemm_ukernel_5x8s4__wasmrelaxedsimd_fma,
1887 xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1888 xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1889 xnn_init_f32_minmax_wasmsimd_params,
1890 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1891 }
f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State & state,models::ExecutionPlanFactory model)1892 static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, models::ExecutionPlanFactory model) {
1893 GEMMEnd2EndBenchmark(state, model,
1894 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma,
1895 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma,
1896 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1897 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmrelaxedsimd_fma,
1898 xnn_f32_gemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma,
1899 xnn_f32_igemm_relu_ukernel_6x8s4__wasmrelaxedsimd_fma,
1900 xnn_f32_gemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1901 xnn_f32_igemm_relu_ukernel_1x8s4__wasmrelaxedsimd_fma,
1902 xnn_f32_gemm_ukernel_6x8s4__wasmrelaxedsimd_fma,
1903 xnn_f32_igemm_ukernel_6x8s4__wasmrelaxedsimd_fma,
1904 xnn_f32_gemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1905 xnn_f32_igemm_ukernel_1x8s4__wasmrelaxedsimd_fma,
1906 xnn_init_f32_minmax_wasmsimd_params,
1907 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
1908 }
1909
1910 BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_loadsplat);
1911 BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_loadsplat);
1912 BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_loadsplat);
1913 BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_loadsplat);
1914
1915 BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat);
1916 BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat);
1917 BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat);
1918 BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat);
1919
1920 BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_splat);
1921 BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_splat);
1922 BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_splat);
1923 BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_splat);
1924
1925 BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmrelaxedsimd_fma_splat);
1926 BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmrelaxedsimd_fma_splat);
1927 BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmrelaxedsimd_fma_splat);
1928 BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmrelaxedsimd_fma_splat);
1929
1930 BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd);
1931 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd);
1932 BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd);
1933 BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd);
1934
1935 BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmrelaxedsimd_fma);
1936 BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmrelaxedsimd_fma);
1937 BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmrelaxedsimd_fma);
1938 BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmrelaxedsimd_fma);
1939 #endif // XNN_ARCH_WASMRELAXEDSIMD
1940
1941
1942 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1943 static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1944 GEMMEnd2EndBenchmark(state, model,
1945 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1946 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat,
1947 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1948 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1949 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1950 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
1951 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1952 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1953 xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
1954 xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
1955 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1956 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1957 xnn_init_f32_minmax_wasmsimd_params,
1958 3 /* mr */, 8 /* nr */);
1959 }
f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1960 static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1961 GEMMEnd2EndBenchmark(state, model,
1962 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1963 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat,
1964 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1965 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1966 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1967 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
1968 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1969 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1970 xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
1971 xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
1972 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1973 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1974 xnn_init_f32_minmax_wasmsimd_params,
1975 4 /* mr */, 8 /* nr */);
1976 }
f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1977 static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1978 GEMMEnd2EndBenchmark(state, model,
1979 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1980 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat,
1981 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1982 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1983 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1984 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
1985 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1986 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
1987 xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
1988 xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
1989 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
1990 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
1991 xnn_init_f32_minmax_wasmsimd_params,
1992 5 /* mr */, 8 /* nr */);
1993 }
f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)1994 static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
1995 GEMMEnd2EndBenchmark(state, model,
1996 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1997 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat,
1998 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
1999 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_loadsplat,
2000 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2001 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2002 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2003 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2004 xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
2005 xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
2006 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2007 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2008 xnn_init_f32_minmax_wasmsimd_params,
2009 6 /* mr */, 8 /* nr */);
2010 }
f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2011 static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2012 GEMMEnd2EndBenchmark(state, model,
2013 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
2014 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
2015 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2016 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2017 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_loadsplat,
2018 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_loadsplat,
2019 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2020 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2021 xnn_f32_gemm_ukernel_3x8__wasmsimd_loadsplat,
2022 xnn_f32_igemm_ukernel_3x8__wasmsimd_loadsplat,
2023 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2024 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2025 xnn_init_f32_minmax_wasmsimd_params,
2026 3 /* mr */, 8 /* nr */);
2027 }
f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2028 static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2029 GEMMEnd2EndBenchmark(state, model,
2030 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
2031 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat,
2032 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2033 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2034 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_loadsplat,
2035 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_loadsplat,
2036 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2037 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2038 xnn_f32_gemm_ukernel_4x8__wasmsimd_loadsplat,
2039 xnn_f32_igemm_ukernel_4x8__wasmsimd_loadsplat,
2040 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2041 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2042 xnn_init_f32_minmax_wasmsimd_params,
2043 4 /* mr */, 8 /* nr */);
2044 }
f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2045 static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2046 GEMMEnd2EndBenchmark(state, model,
2047 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
2048 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat,
2049 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2050 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2051 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_loadsplat,
2052 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_loadsplat,
2053 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2054 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2055 xnn_f32_gemm_ukernel_5x8__wasmsimd_loadsplat,
2056 xnn_f32_igemm_ukernel_5x8__wasmsimd_loadsplat,
2057 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2058 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2059 xnn_init_f32_minmax_wasmsimd_params,
2060 5 /* mr */, 8 /* nr */);
2061 }
f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State & state,models::ExecutionPlanFactory model)2062 static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, models::ExecutionPlanFactory model) {
2063 GEMMEnd2EndBenchmark(state, model,
2064 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
2065 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
2066 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2067 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat,
2068 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2069 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_loadsplat,
2070 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2071 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_loadsplat,
2072 xnn_f32_gemm_ukernel_6x8__wasmsimd_loadsplat,
2073 xnn_f32_igemm_ukernel_6x8__wasmsimd_loadsplat,
2074 xnn_f32_gemm_ukernel_1x8__wasmsimd_loadsplat,
2075 xnn_f32_igemm_ukernel_1x8__wasmsimd_loadsplat,
2076 xnn_init_f32_minmax_wasmsimd_params,
2077 6 /* mr */, 8 /* nr */);
2078 }
f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2079 static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2080 GEMMEnd2EndBenchmark(state, model,
2081 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
2082 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat,
2083 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2084 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2085 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
2086 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
2087 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2088 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2089 xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
2090 xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
2091 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2092 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2093 xnn_init_f32_minmax_wasmsimd_params,
2094 3 /* mr */, 8 /* nr */);
2095 }
f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2096 static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2097 GEMMEnd2EndBenchmark(state, model,
2098 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
2099 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat,
2100 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2101 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2102 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
2103 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
2104 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2105 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2106 xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
2107 xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
2108 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2109 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2110 xnn_init_f32_minmax_wasmsimd_params,
2111 4 /* mr */, 8 /* nr */);
2112 }
f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2113 static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2114 GEMMEnd2EndBenchmark(state, model,
2115 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
2116 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat,
2117 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2118 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2119 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
2120 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
2121 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2122 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2123 xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
2124 xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
2125 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2126 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2127 xnn_init_f32_minmax_wasmsimd_params,
2128 5 /* mr */, 8 /* nr */);
2129 }
f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State & state,models::ExecutionPlanFactory model)2130 static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2131 GEMMEnd2EndBenchmark(state, model,
2132 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
2133 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
2134 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2135 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat,
2136 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
2137 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
2138 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2139 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2140 xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
2141 xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
2142 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2143 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2144 xnn_init_f32_minmax_wasmsimd_params,
2145 6 /* mr */, 8 /* nr */);
2146 }
f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2147 static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2148 GEMMEnd2EndBenchmark(state, model,
2149 xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
2150 xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat,
2151 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2152 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2153 xnn_f32_gemm_relu_ukernel_3x8__wasmsimd_splat,
2154 xnn_f32_igemm_relu_ukernel_3x8__wasmsimd_splat,
2155 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2156 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2157 xnn_f32_gemm_ukernel_3x8__wasmsimd_splat,
2158 xnn_f32_igemm_ukernel_3x8__wasmsimd_splat,
2159 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2160 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2161 xnn_init_f32_minmax_wasmsimd_params,
2162 3 /* mr */, 8 /* nr */);
2163 }
f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2164 static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2165 GEMMEnd2EndBenchmark(state, model,
2166 xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
2167 xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat,
2168 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2169 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2170 xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat,
2171 xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat,
2172 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2173 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2174 xnn_f32_gemm_ukernel_4x8__wasmsimd_splat,
2175 xnn_f32_igemm_ukernel_4x8__wasmsimd_splat,
2176 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2177 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2178 xnn_init_f32_minmax_wasmsimd_params,
2179 4 /* mr */, 8 /* nr */);
2180 }
f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2181 static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2182 GEMMEnd2EndBenchmark(state, model,
2183 xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
2184 xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat,
2185 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2186 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2187 xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat,
2188 xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat,
2189 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2190 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2191 xnn_f32_gemm_ukernel_5x8__wasmsimd_splat,
2192 xnn_f32_igemm_ukernel_5x8__wasmsimd_splat,
2193 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2194 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2195 xnn_init_f32_minmax_wasmsimd_params,
2196 5 /* mr */, 8 /* nr */);
2197 }
f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State & state,models::ExecutionPlanFactory model)2198 static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, models::ExecutionPlanFactory model) {
2199 GEMMEnd2EndBenchmark(state, model,
2200 xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
2201 xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
2202 xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2203 xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
2204 xnn_f32_gemm_relu_ukernel_6x8__wasmsimd_splat,
2205 xnn_f32_igemm_relu_ukernel_6x8__wasmsimd_splat,
2206 xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat,
2207 xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat,
2208 xnn_f32_gemm_ukernel_6x8__wasmsimd_splat,
2209 xnn_f32_igemm_ukernel_6x8__wasmsimd_splat,
2210 xnn_f32_gemm_ukernel_1x8__wasmsimd_splat,
2211 xnn_f32_igemm_ukernel_1x8__wasmsimd_splat,
2212 xnn_init_f32_minmax_wasmsimd_params,
2213 6 /* mr */, 8 /* nr */);
2214 }
f32_gemm_3x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2215 static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2216 GEMMEnd2EndBenchmark(state, model,
2217 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
2218 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm,
2219 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2220 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2221 xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
2222 xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
2223 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2224 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2225 xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
2226 xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
2227 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2228 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2229 xnn_init_f32_minmax_wasmsimd_params,
2230 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2231 }
f32_gemm_4x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2232 static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2233 GEMMEnd2EndBenchmark(state, model,
2234 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm,
2235 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm,
2236 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2237 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2238 xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
2239 xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
2240 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2241 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2242 xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
2243 xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
2244 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2245 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2246 xnn_init_f32_minmax_wasmsimd_params,
2247 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2248 }
f32_gemm_5x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2249 static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2250 GEMMEnd2EndBenchmark(state, model,
2251 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm,
2252 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm,
2253 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2254 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2255 xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
2256 xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
2257 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2258 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2259 xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
2260 xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
2261 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2262 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2263 xnn_init_f32_minmax_wasmsimd_params,
2264 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2265 }
f32_gemm_6x8s4__wasmsimd_arm(benchmark::State & state,models::ExecutionPlanFactory model)2266 static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, models::ExecutionPlanFactory model) {
2267 GEMMEnd2EndBenchmark(state, model,
2268 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
2269 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm,
2270 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2271 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm,
2272 xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
2273 xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
2274 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2275 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2276 xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
2277 xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
2278 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2279 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2280 xnn_init_f32_minmax_wasmsimd_params,
2281 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2282 }
f32_gemm_3x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2283 static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2284 GEMMEnd2EndBenchmark(state, model,
2285 xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
2286 xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86,
2287 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2288 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2289 xnn_f32_gemm_relu_ukernel_3x8s4__wasmsimd,
2290 xnn_f32_igemm_relu_ukernel_3x8s4__wasmsimd,
2291 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2292 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2293 xnn_f32_gemm_ukernel_3x8s4__wasmsimd,
2294 xnn_f32_igemm_ukernel_3x8s4__wasmsimd,
2295 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2296 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2297 xnn_init_f32_minmax_wasmsimd_params,
2298 3 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2299 }
f32_gemm_4x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2300 static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2301 GEMMEnd2EndBenchmark(state, model,
2302 xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86,
2303 xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86,
2304 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2305 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2306 xnn_f32_gemm_relu_ukernel_4x8s4__wasmsimd,
2307 xnn_f32_igemm_relu_ukernel_4x8s4__wasmsimd,
2308 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2309 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2310 xnn_f32_gemm_ukernel_4x8s4__wasmsimd,
2311 xnn_f32_igemm_ukernel_4x8s4__wasmsimd,
2312 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2313 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2314 xnn_init_f32_minmax_wasmsimd_params,
2315 4 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2316 }
f32_gemm_5x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2317 static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2318 GEMMEnd2EndBenchmark(state, model,
2319 xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86,
2320 xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86,
2321 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2322 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2323 xnn_f32_gemm_relu_ukernel_5x8s4__wasmsimd,
2324 xnn_f32_igemm_relu_ukernel_5x8s4__wasmsimd,
2325 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2326 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2327 xnn_f32_gemm_ukernel_5x8s4__wasmsimd,
2328 xnn_f32_igemm_ukernel_5x8s4__wasmsimd,
2329 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2330 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2331 xnn_init_f32_minmax_wasmsimd_params,
2332 5 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2333 }
f32_gemm_6x8s4__wasmsimd_x86(benchmark::State & state,models::ExecutionPlanFactory model)2334 static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, models::ExecutionPlanFactory model) {
2335 GEMMEnd2EndBenchmark(state, model,
2336 xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86,
2337 xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86,
2338 xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2339 xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86,
2340 xnn_f32_gemm_relu_ukernel_6x8s4__wasmsimd,
2341 xnn_f32_igemm_relu_ukernel_6x8s4__wasmsimd,
2342 xnn_f32_gemm_relu_ukernel_1x8s4__wasmsimd,
2343 xnn_f32_igemm_relu_ukernel_1x8s4__wasmsimd,
2344 xnn_f32_gemm_ukernel_6x8s4__wasmsimd,
2345 xnn_f32_igemm_ukernel_6x8s4__wasmsimd,
2346 xnn_f32_gemm_ukernel_1x8s4__wasmsimd,
2347 xnn_f32_igemm_ukernel_1x8s4__wasmsimd,
2348 xnn_init_f32_minmax_wasmsimd_params,
2349 6 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
2350 }
2351
// Benchmark registrations for the f32 WAsm SIMD GEMM wrappers, grouped by kernel flavour.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk (presumably in
// bench/end2end.h) — confirm there which models each wrapper is instantiated for.
// "arm_loadsplat" wrappers, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_loadsplat);

// "x86_loadsplat" wrappers, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_loadsplat);
BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_loadsplat);

// "arm_splat" wrappers, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_arm_splat);
BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_arm_splat);
BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_arm_splat);
BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_arm_splat);

// "x86_splat" wrappers, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8__wasmsimd_x86_splat);
BENCHMARK_FP32_END2END(f32_gemm_4x8__wasmsimd_x86_splat);
BENCHMARK_FP32_END2END(f32_gemm_5x8__wasmsimd_x86_splat);
BENCHMARK_FP32_END2END(f32_gemm_6x8__wasmsimd_x86_splat);

// "s4" wrappers built on the wasmsimd_arm min/max kernels, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_arm);
BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_arm);
BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_arm);
BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_arm);

// "s4" wrappers built on the wasmsimd_x86 min/max kernels, mr = 3..6.
BENCHMARK_FP32_END2END(f32_gemm_3x8s4__wasmsimd_x86);
BENCHMARK_FP32_END2END(f32_gemm_4x8s4__wasmsimd_x86);
BENCHMARK_FP32_END2END(f32_gemm_5x8s4__wasmsimd_x86);
BENCHMARK_FP32_END2END(f32_gemm_6x8s4__wasmsimd_x86);
2381 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2382
2383
2384 #if XNN_ARCH_WASM
f32_gemm_2x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)2385 static void f32_gemm_2x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
2386 GEMMEnd2EndBenchmark(state, model,
2387 xnn_f32_gemm_minmax_ukernel_2x4__wasm,
2388 xnn_f32_igemm_minmax_ukernel_2x4__wasm,
2389 xnn_f32_gemm_minmax_ukernel_1x4__wasm,
2390 xnn_f32_igemm_minmax_ukernel_1x4__wasm,
2391 xnn_f32_gemm_relu_ukernel_2x4__wasm,
2392 xnn_f32_igemm_relu_ukernel_2x4__wasm,
2393 xnn_f32_gemm_relu_ukernel_1x4__wasm,
2394 xnn_f32_igemm_relu_ukernel_1x4__wasm,
2395 xnn_f32_gemm_ukernel_2x4__scalar,
2396 xnn_f32_igemm_ukernel_2x4__scalar,
2397 xnn_f32_gemm_ukernel_1x4__scalar,
2398 xnn_f32_igemm_ukernel_1x4__scalar,
2399 xnn_init_f32_minmax_scalar_params,
2400 2 /* mr */, 4 /* nr */);
2401 }
2402
f32_gemm_4x4__wasm(benchmark::State & state,models::ExecutionPlanFactory model)2403 static void f32_gemm_4x4__wasm(benchmark::State& state, models::ExecutionPlanFactory model) {
2404 GEMMEnd2EndBenchmark(state, model,
2405 xnn_f32_gemm_minmax_ukernel_4x4__wasm,
2406 xnn_f32_igemm_minmax_ukernel_4x4__wasm,
2407 xnn_f32_gemm_minmax_ukernel_1x4__wasm,
2408 xnn_f32_igemm_minmax_ukernel_1x4__wasm,
2409 xnn_f32_gemm_relu_ukernel_4x4__wasm,
2410 xnn_f32_igemm_relu_ukernel_4x4__wasm,
2411 xnn_f32_gemm_relu_ukernel_1x4__wasm,
2412 xnn_f32_igemm_relu_ukernel_1x4__wasm,
2413 xnn_f32_gemm_ukernel_4x4__scalar,
2414 xnn_f32_igemm_ukernel_4x4__scalar,
2415 xnn_f32_gemm_ukernel_1x4__scalar,
2416 xnn_f32_igemm_ukernel_1x4__scalar,
2417 xnn_init_f32_minmax_scalar_params,
2418 4 /* mr */, 4 /* nr */);
2419 }
2420
// Benchmark registrations for the two "wasm" wrappers defined in this XNN_ARCH_WASM section.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk — confirm its expansion.
BENCHMARK_FP32_END2END(f32_gemm_2x4__wasm);
BENCHMARK_FP32_END2END(f32_gemm_4x4__wasm);
2423 #endif // XNN_ARCH_WASM
2424
2425
f32_gemm_2x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)2426 static void f32_gemm_2x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
2427 GEMMEnd2EndBenchmark(state, model,
2428 xnn_f32_gemm_minmax_ukernel_2x4__scalar,
2429 xnn_f32_igemm_minmax_ukernel_2x4__scalar,
2430 xnn_f32_gemm_minmax_ukernel_1x4__scalar,
2431 xnn_f32_igemm_minmax_ukernel_1x4__scalar,
2432 xnn_f32_gemm_relu_ukernel_2x4__scalar,
2433 xnn_f32_igemm_relu_ukernel_2x4__scalar,
2434 xnn_f32_gemm_relu_ukernel_1x4__scalar,
2435 xnn_f32_igemm_relu_ukernel_1x4__scalar,
2436 xnn_f32_gemm_ukernel_2x4__scalar,
2437 xnn_f32_igemm_ukernel_2x4__scalar,
2438 xnn_f32_gemm_ukernel_1x4__scalar,
2439 xnn_f32_igemm_ukernel_1x4__scalar,
2440 xnn_init_f32_minmax_scalar_params,
2441 2 /* mr */, 4 /* nr */);
2442 }
2443
f32_gemm_4x4__scalar(benchmark::State & state,models::ExecutionPlanFactory model)2444 static void f32_gemm_4x4__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
2445 GEMMEnd2EndBenchmark(state, model,
2446 xnn_f32_gemm_minmax_ukernel_4x4__scalar,
2447 xnn_f32_igemm_minmax_ukernel_4x4__scalar,
2448 xnn_f32_gemm_minmax_ukernel_1x4__scalar,
2449 xnn_f32_igemm_minmax_ukernel_1x4__scalar,
2450 xnn_f32_gemm_relu_ukernel_4x4__scalar,
2451 xnn_f32_igemm_relu_ukernel_4x4__scalar,
2452 xnn_f32_gemm_relu_ukernel_1x4__scalar,
2453 xnn_f32_igemm_relu_ukernel_1x4__scalar,
2454 xnn_f32_gemm_ukernel_4x4__scalar,
2455 xnn_f32_igemm_ukernel_4x4__scalar,
2456 xnn_f32_gemm_ukernel_1x4__scalar,
2457 xnn_f32_igemm_ukernel_1x4__scalar,
2458 xnn_init_f32_minmax_scalar_params,
2459 4 /* mr */, 4 /* nr */);
2460 }
2461
// Benchmark registrations for the two "scalar" wrappers; these sit outside any architecture #if in this chunk.
// NOTE(review): BENCHMARK_FP32_END2END is defined outside this chunk — confirm its expansion.
BENCHMARK_FP32_END2END(f32_gemm_2x4__scalar);
BENCHMARK_FP32_END2END(f32_gemm_4x4__scalar);
2464
2465
// Emit the Google Benchmark entry point (BENCHMARK_MAIN) unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): presumably that macro is set when main() is supplied elsewhere — confirm in the build files.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
2469