// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>
#include <xnnpack/packx.h>
#include <xnnpack/ppmm.h>

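// Benchmarks a prebuilt f32 GEMM minmax microkernel: the (nc x kc) weights are
// packed once, then the (mc x nc x kc) problem from state.range(0..2) is tiled
// over the kernel's mr-row panels inside the timed loop.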
static void GEMMBenchmark(benchmark::State& state,
  xnn_f32_gemm_minmax_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

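  // A GEMM does one multiply and one add per MAC, i.e. 2*M*N*K FLOPs per run.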
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

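// Benchmarks a single-pass PPMM (pre-packed matrix multiply): each mr-row
// panel of A is packed immediately before its ppmm call, so the packed panel
// is still hot in cache when the multiply runs.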
static void PPMM1PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mr * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data());
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data()),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

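// Benchmarks a two-pass PPMM: a first loop packs every mr-row panel of A into
// a temporary buffer, then a second loop runs the ppmm microkernel over the
// packed panels, so packing and multiplication are measured back-to-back.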
static void PPMM2PBenchmark(benchmark::State& state,
  xnn_f32_ppmm_minmax_ukernel_function ppmm,
  xnn_x32_packx_ukernel_function packx,
  size_t mr, size_t nr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t mc_stride = benchmark::utils::RoundUp(mc, mr);
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> t(mc_stride * kc);

  const size_t w_elements = nc_stride * kc + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, 1 /* kr */, 1 /* sr */, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      packx(mb, kc, reinterpret_cast<const uint32_t*>(a.data() + m * kc), kc, t.data() + m * kc);
    }
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      ppmm(
        mb, nc, kc * sizeof(float),
        reinterpret_cast<const float*>(t.data() + m * kc),
        w.data() + nc_stride * buffer_index * (kc + 1),
        c.data() + (mc * buffer_index + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_RUY
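// Baseline: the same (M, N, K) problems measured with the ruy library, capped
// at the given thread count; ruy_st below is the single-threaded case.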
static void RuyBenchmark(benchmark::State& state, uint32_t threads)
{
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (nc * (mc + kc + 1)));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));
  std::vector<float> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

  ruy::Matrix<float> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy::Matrix<float> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy::Matrix<float> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());

  ruy::MulParams<float, float> mul_params;

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY

#if XNN_PLATFORM_JIT
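// JIT variant of GEMMBenchmark: instead of taking a prebuilt microkernel, it
// invokes a code generator that emits the kernel into an executable
// xnn_code_buffer before the timed loop runs.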
static void GEMMBenchmark(benchmark::State& state,
  xnn_jit_gemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

  std::vector<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements));

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, nullptr);
  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  xnn_initialize(/*allocator=*/nullptr);
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };
  generator(&code_buffer, mr, nc % nr, kc * sizeof(float), &jit_params);
  xnn_finalize_code_memory(&code_buffer);
  xnn_f32_gemm_minmax_ukernel_function gemm = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      gemm(
        mb, nc, kc * sizeof(float),
        a.data() + m * kc, kc * sizeof(float),
        w.data() + buffer_index * nc_stride * (kc_stride + 1),
        c.data() + (buffer_index * mc + m) * nc, nc * sizeof(float), nr * sizeof(float),
        &params);
    }
  }

  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
#endif  // XNN_PLATFORM_JIT
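// The wrappers below bind each microkernel to its mr/nr/kr/sr tile parameters
// and, where needed, an ISA check. BENCHMARK_GEMM (defined in bench/gemm.h)
// registers each wrapper over the shared suite of GEMM benchmark shapes.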
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
static void f32_gemm_1x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x2__neonfma_lane_ld64, 6, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x2__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a55)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld128)
BENCHMARK_GEMM(f32_gemm_4x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_5x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a53)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a55)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x2__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x2__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld128)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckVFP);
}

static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}

BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x2__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, 6, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_4x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_6x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_gemm_8x8s4__neon(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_gemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_unipass__neonfma(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_ppmm_4x8_twopass__neonfma(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}

BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x2__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x2__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neon_lane_ld128)

BENCHMARK_GEMM(f32_gemm_1x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__neonfma_dup_ld128)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__neonfma_dup_ld128)

BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
BENCHMARK_GEMM(f32_gemm_6x8s4__neon)
BENCHMARK_GEMM(f32_gemm_8x8s4__neon)

BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_4x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_6x8s4__neonfma)
BENCHMARK_GEMM(f32_gemm_8x8s4__neonfma)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__neonfma)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__neonfma)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
static void jit_f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}

BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_prfm_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch32_neon_cortex_a7)
#endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
static void jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void jit_f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net)
{
  GEMMBenchmark(state, xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_GEMM(jit_f32_gemm_6x8__aarch64_neonfma_ld128)

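// Benchmarks the single upto-6x8 JIT generator specialized for each max_mr
// from 1 to 6, so the generated kernels can be compared row count by row count.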
#define BENCHMARK_UPTO_MR_GEMM(name, max_mr, nr)                               \
  static void name(benchmark::State &state, const char *net) {                 \
    GEMMBenchmark(                                                             \
      state,                                                                   \
      xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,  \
      max_mr, nr, 1, 1, xnn_init_f32_minmax_scalar_params,                     \
      benchmark::utils::CheckNEON);                                            \
  }                                                                            \
  BENCHMARK_GEMM(name)
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8);
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75, 2, 8);
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75, 3, 8);
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8);
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8);
BENCHMARK_UPTO_MR_GEMM(jit_f32_gemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8);
#undef BENCHMARK_UPTO_MR_GEMM

#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
static void f32_gemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}
static void f32_gemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckAVX512F);
}

static void f32_gemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_1x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_3x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, 3, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x16__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_gemm_1x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast, 1, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_3x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast, 3, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_4x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast, 4, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_gemm_5x16s4__fma3_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast, 5, 16, 1, 4,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_gemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_1x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_3x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast, 3, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_4x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_gemm_5x16__avx_broadcast(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}

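// The SSE/SSE2 variants below differ in how each element of A reaches all
// vector lanes: "load1" presumably maps to a broadcasting load
// (_mm_load1_ps), while "dup" duplicates lanes with shuffles. No ISA check is
// passed, as SSE/SSE2 support is assumed in the x86 baseline.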
static void f32_gemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_gemm_1x8__sse_load1(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_3x8__sse_load1(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_4x8__sse_load1(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_5x8__sse_load1(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_gemm_1x8__sse_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_3x8__sse_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_4x8__sse_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_5x8__sse_dup(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_gemm_1x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_3x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_4x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_gemm_5x8s4__sse(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}

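// PPMM kernels multiply from an A panel pre-packed by an x32 packx
// microkernel. As the names suggest, the "unipass" wrappers (PPMM1PBenchmark)
// time packing fused with the multiply, while the "twopass" wrappers
// (PPMM2PBenchmark) pack all of A up front and time both passes together.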
static void f32_ppmm_4x8_unipass__sse(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
    xnn_init_f32_minmax_sse_params);
}
static void f32_ppmm_4x8_twopass__sse(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__sse, xnn_x32_packx_ukernel_4x__sse, 4, 8,
    xnn_init_f32_minmax_sse_params);
}

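// BENCHMARK_GEMM (from bench/gemm.h) registers each wrapper over a suite of
// GEMM shapes drawn from real networks; the otherwise-unused `net` parameter
// receives the label shown in the benchmark report.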
BENCHMARK_GEMM(f32_gemm_1x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_6x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_7x16__avx512f_broadcast)
BENCHMARK_GEMM(f32_gemm_8x16__avx512f_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_6x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_7x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_8x8__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_1x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__fma3_broadcast)

BENCHMARK_GEMM(f32_gemm_1x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16s4__fma3_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16s4__fma3_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_4x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_5x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_6x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_7x8__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_1x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_3x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_4x16__avx_broadcast)
BENCHMARK_GEMM(f32_gemm_5x16__avx_broadcast)

BENCHMARK_GEMM(f32_gemm_1x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_3x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_4x8__sse2_dup)
BENCHMARK_GEMM(f32_gemm_5x8__sse2_dup)

BENCHMARK_GEMM(f32_gemm_1x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_3x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_4x8__sse_load1)
BENCHMARK_GEMM(f32_gemm_5x8__sse_load1)

BENCHMARK_GEMM(f32_gemm_1x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_3x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_4x8__sse_dup)
BENCHMARK_GEMM(f32_gemm_5x8__sse_dup)

BENCHMARK_GEMM(f32_gemm_1x8s4__sse)
BENCHMARK_GEMM(f32_gemm_3x8s4__sse)
BENCHMARK_GEMM(f32_gemm_4x8s4__sse)
BENCHMARK_GEMM(f32_gemm_5x8s4__sse)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__sse)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__sse)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


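// WAsm Relaxed SIMD variants: "loadsplat" broadcasts each A element with a
// splatting load, "splat" loads a full vector of A and duplicates lanes, and
// the "fma" infix selects kernels built on relaxed fused multiply-add.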
#if XNN_ARCH_WASMRELAXEDSIMD
static void f32_gemm_3x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmrelaxedsimd_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmrelaxedsimd_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_fma_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmrelaxedsimd_fma_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_fma_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd, 3, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd, 4, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd, 5, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8s4__wasmrelaxedsimd(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd, 6, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma, 3, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma, 4, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmrelaxedsimd_fma, 5, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8s4__wasmrelaxedsimd_fma(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmrelaxedsimd_fma, 6, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}

BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_loadsplat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_fma_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_fma_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_fma_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_fma_loadsplat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_splat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmrelaxedsimd_fma_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmrelaxedsimd_fma_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmrelaxedsimd_fma_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmrelaxedsimd_fma_splat)

BENCHMARK_GEMM(f32_gemm_3x8s4__wasmrelaxedsimd)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmrelaxedsimd)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmrelaxedsimd)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmrelaxedsimd)

BENCHMARK_GEMM(f32_gemm_3x8s4__wasmrelaxedsimd_fma)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmrelaxedsimd_fma)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmrelaxedsimd_fma)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmrelaxedsimd_fma)
#endif // XNN_ARCH_WASMRELAXEDSIMD


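// For plain WAsm SIMD, the "_arm" and "_x86" suffixes select min/max
// sequences expected to lower efficiently on the respective host ISA; both
// remain portable WAsm and are benchmarked side by side here.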
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void f32_gemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_gemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
    xnn_init_f32_minmax_wasmsimd_params);
}

static void f32_ppmm_4x8_unipass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_ppmm_4x8_unipass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_wasmsimd_params);
}

static void f32_ppmm_4x8_twopass__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_arm_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_wasmsimd_params);
}
static void f32_ppmm_4x8_twopass__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x8__wasmsimd_x86_splat, xnn_x32_packx_ukernel_4x__wasmsimd, 4, 8,
    xnn_init_f32_minmax_wasmsimd_params);
}

BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_loadsplat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_loadsplat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_loadsplat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_arm_splat)

BENCHMARK_GEMM(f32_gemm_3x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_4x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_5x8__wasmsimd_x86_splat)
BENCHMARK_GEMM(f32_gemm_6x8__wasmsimd_x86_splat)

BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_arm)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_arm)

BENCHMARK_GEMM(f32_gemm_3x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_4x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_5x8s4__wasmsimd_x86)
BENCHMARK_GEMM(f32_gemm_6x8s4__wasmsimd_x86)

BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_ppmm_4x8_unipass__wasmsimd_x86_splat)

BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_arm_splat)
BENCHMARK_GEMM(f32_ppmm_4x8_twopass__wasmsimd_x86_splat)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


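// Scalar kernels are the portable baseline and build on every architecture,
// hence no #if guard; their tiles stay small (at most 4x4) since there are
// no vector registers to fill.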
static void f32_gemm_1x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_2x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_gemm_4x4__scalar(benchmark::State& state, const char* net) {
  GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_2x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_unipass__scalar(benchmark::State& state, const char* net) {
  PPMM1PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_ppmm_2x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_2x4__scalar, xnn_x32_packx_ukernel_2x__scalar, 2, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x2_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x2__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 2,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_4x4_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_4x4__scalar, xnn_x32_packx_ukernel_4x__scalar, 4, 4,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_ppmm_3x3_twopass__scalar(benchmark::State& state, const char* net) {
  PPMM2PBenchmark(state, xnn_f32_ppmm_minmax_ukernel_3x3__scalar, xnn_x32_packx_ukernel_3x__scalar, 3, 3,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_GEMM(f32_gemm_1x4__scalar)
BENCHMARK_GEMM(f32_gemm_2x4__scalar)
BENCHMARK_GEMM(f32_gemm_4x4__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_unipass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_unipass__scalar)

BENCHMARK_GEMM(f32_ppmm_2x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_GEMM(f32_ppmm_3x3_twopass__scalar)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif // BENCHMARK_RUY

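// Minimal usage sketch (standard Google Benchmark flags; the binary name
// depends on the build setup): benchmark only the 4x8 AVX kernel with
//   ./f32-gemm-bench --benchmark_filter='f32_gemm_4x8__avx_broadcast.*'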
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif